# Stdlib / third-party imports used below (added for completeness). parse_args,
# NeuMF, CFTrainDataset, load_test_ratings, load_test_negs, val_epoch, utils,
# and the *_FILENAME constants are assumed to come from the surrounding project.
import math
import os
import sys
import time
from collections import OrderedDict
from datetime import datetime
from functools import partial

import numpy as np
import torch
import torch.nn as nn
import torch.utils.data
import tqdm

import distiller
import distiller.apputils as apputils
import distiller.quantization as quantization
from distiller.data_loggers import TensorBoardLogger


def main():
    global msglogger
    script_dir = os.path.dirname(__file__)
    args = parse_args()

    # Distiller loggers
    msglogger = apputils.config_pylogger('logging.conf', args.name,
                                         output_dir=args.output_dir)
    tflogger = TensorBoardLogger(msglogger.logdir)
    # tflogger.log_gradients = True
    # pylogger = PythonLogger(msglogger)

    if args.seed is not None:
        msglogger.info("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Convert the quantization enum arguments (e.g. 'LinearQuantMode.SYMMETRIC')
    # to their bare names
    args.qe_mode = str(args.qe_mode).split('.')[1]
    args.qe_clip_acts = str(args.qe_clip_acts).split('.')[1]

    apputils.log_execution_env_state(sys.argv)

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error('ERROR: Argument --gpus must be a comma-separated '
                            'list of integers only')
            exit(1)
        if len(args.gpus) > 1:
            msglogger.error('ERROR: Only single GPU supported for NCF')
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error('ERROR: GPU device ID {0} requested, but only '
                                '{1} devices available'
                                .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = msglogger.logdir
    msglogger.info("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()

    # Load data. Evaluation, calibration and histogram collection run on the
    # test set only, so the training set is loaded just for actual training.
    training = not (args.eval or args.qe_calibration or args.activation_histograms)
    msglogger.info('Loading data')
    if training:
        train_dataset = CFTrainDataset(
            os.path.join(args.data, TRAIN_RATINGS_FILENAME),
            args.negative_samples)
        train_dataloader = torch.utils.data.DataLoader(
            dataset=train_dataset, batch_size=args.batch_size, shuffle=True,
            num_workers=args.workers, pin_memory=True)
        nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    else:
        train_dataset = None
        train_dataloader = None
        nb_users, nb_items = (138493, 26744)

    test_ratings = load_test_ratings(os.path.join(args.data, TEST_RATINGS_FILENAME))
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))

    msglogger.info('Load data done [%.1f s]. #user=%d, #item=%d, #train=%s, #test=%d'
                   % (time.time() - t1, nb_users, nb_items,
                      str(train_dataset.mat.nnz) if training else 'N/A',
                      len(test_ratings)))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors, mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers],
                  split_final=args.split_final)
    if use_cuda:
        model = model.cuda()
    msglogger.info(model)
    msglogger.info("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    compression_scheduler = None
    start_epoch = 0
    optimizer = None
    if args.load:
        if training:
            model, compression_scheduler, optimizer, start_epoch = \
                apputils.load_checkpoint(model, args.load)
            if args.reset_optimizer:
                start_epoch = 0
                optimizer = None
        else:
            model = apputils.load_lean_checkpoint(model, args.load)

    # Add loss to graph
    criterion = nn.BCEWithLogitsLoss()
    if use_cuda:
        criterion = criterion.cuda()

    if training and optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.compress:
        compression_scheduler = distiller.file_config(model, optimizer, args.compress)
        model.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    if args.qe_calibration or args.activation_histograms:
        calib = {'portion': args.qe_calibration,
                 'desc_str': 'quantization calibration stats',
                 'collect_func': partial(distiller.data_loggers.collect_quant_stats,
                                         inplace_runtime_check=True,
                                         disable_inplace_attrs=True)}
        hists = {'portion': args.activation_histograms,
                 'desc_str': 'activation histograms',
                 'collect_func': partial(distiller.data_loggers.collect_histograms,
                                         activation_stats=None, nbins=2048,
                                         save_hist_imgs=True)}
        d = calib if args.qe_calibration else hists

        distiller.utils.assign_layer_fq_names(model)
        num_users = int(np.floor(len(test_ratings) * d['portion']))
        msglogger.info("Generating {} based on {:.1%} of the test-set ({} users)"
                       .format(d['desc_str'], d['portion'], num_users))

        test_fn = partial(val_epoch, ratings=test_ratings, negs=test_negs,
                          K=args.topk, use_cuda=use_cuda,
                          processes=args.processes, num_users=num_users)
        d['collect_func'](model=model, test_fn=test_fn, save_dir=run_dir,
                          classes=None)
        return 0

    if args.eval:
        if args.quantize_eval and args.qe_calibration is None:
            model.cpu()
            quantizer = quantization.PostTrainLinearQuantizer.from_args(model, args)
            dummy_input = (torch.tensor([1]), torch.tensor([1]),
                           torch.tensor([True], dtype=torch.bool))
            quantizer.prepare_model(dummy_input)
            model.cuda()

        distiller.utils.assign_layer_fq_names(model)

        if args.eval_fp16:
            model = model.half()

        # Calculate initial Hit Ratio and NDCG
        begin = time.time()
        hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                                use_cuda=use_cuda, processes=args.processes)
        val_time = time.time() - begin
        hit_rate = np.mean(hits)
        msglogger.info('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, '
                       'val_time = {val_time:.2f}'
                       .format(K=args.topk, hit_rate=hit_rate,
                               ndcg=np.mean(ndcgs), val_time=val_time))

        if args.quantize_eval:
            checkpoint_name = 'quantized'
            apputils.save_checkpoint(0, 'NCF', model, optimizer=None,
                                     extras={'quantized_hr@10': hit_rate},
                                     name='_'.join([args.name, 'quantized'])
                                     if args.name else checkpoint_name,
                                     dir=msglogger.logdir)
        return 0

    total_samples = len(train_dataloader.sampler)
    steps_per_epoch = math.ceil(total_samples / args.batch_size)
    best_hit_rate = 0
    best_epoch = 0

    # Training loop
    for epoch in range(start_epoch, args.epochs):
        msglogger.info('')
        model.train()
        losses = utils.AverageMeter()

        begin = time.time()

        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch, optimizer)

        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                # `non_blocking` replaces the old `async` argument, which became
                # a reserved keyword in Python 3.7
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(
                    epoch, batch_index, steps_per_epoch, optimizer)

            outputs = model(user, item, torch.tensor([False], dtype=torch.bool))
            loss = criterion(outputs, label)

            if compression_scheduler:
                compression_scheduler.before_backward_pass(
                    epoch, batch_index, steps_per_epoch, loss, optimizer,
                    return_loss_components=False)

            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if compression_scheduler:
                compression_scheduler.on_minibatch_end(
                    epoch, batch_index, steps_per_epoch, optimizer)

            # Save stats to file
            description = 'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                epoch, loss=losses)
            loader.set_description(description)

            steps_completed = batch_index + 1
            if steps_completed % args.log_freq == 0:
                stats_dict = OrderedDict()
                stats_dict['Loss'] = losses.avg
                stats = ('Performance/Training/', stats_dict)
                params = model.named_parameters() if args.log_params_histograms else None
                distiller.log_training_progress(stats, params, epoch,
                                                steps_completed, steps_per_epoch,
                                                args.log_freq, [tflogger])

                tflogger.log_model_buffers(model, ['tracked_min', 'tracked_max'],
                                           'Quant/Train/Acts/TrackedMinMax',
                                           epoch, steps_completed,
                                           steps_per_epoch, args.log_freq)

        train_time = time.time() - begin

        # Evaluate on the test set at the end of each epoch
        begin = time.time()
        hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                                use_cuda=use_cuda, output=valid_results_file,
                                epoch=epoch, processes=args.processes)
        val_time = time.time() - begin

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        hit_rate = np.mean(hits)
        mean_ndcgs = np.mean(ndcgs)

        stats_dict = OrderedDict()
        stats_dict['HR@{0}'.format(args.topk)] = hit_rate
        stats_dict['NDCG@{0}'.format(args.topk)] = mean_ndcgs
        stats = ('Performance/Validation/', stats_dict)
        distiller.log_training_progress(stats, None, epoch, steps_completed=0,
                                        total_steps=1, log_freq=1,
                                        loggers=[tflogger])

        msglogger.info('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, '
                       'NDCG@{K} = {ndcg:.4f}, AvgTrainLoss = {loss.avg:.4f}, '
                       'train_time = {train_time:.2f}, val_time = {val_time:.2f}'
                       .format(epoch=epoch, K=args.topk, hit_rate=hit_rate,
                               ndcg=mean_ndcgs, loss=losses,
                               train_time=train_time, val_time=val_time))

        is_best = False
        if hit_rate > best_hit_rate:
            best_hit_rate = hit_rate
            is_best = True
            best_epoch = epoch
        extras = {'current_hr@10': hit_rate,
                  'best_hr@10': best_hit_rate,
                  'best_epoch': best_epoch}
        apputils.save_checkpoint(epoch, 'NCF', model, optimizer,
                                 compression_scheduler, extras, is_best,
                                 dir=run_dir)

        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                msglogger.info("Hit threshold of {}".format(args.threshold))
                break
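# A minimal entry-point guard for the script above (a sketch; it assumes this
# main() is the file's entry point, with parse_args() supplying the CLI flags):
if __name__ == '__main__':
    main()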
# Distributed NCF training with GRACE-style gradient compression (grace_dl).
# Besides the shared imports above, this entry point assumes dllogger, wandb,
# apex (FusedAdam, amp), and the project-local dataloading, utils, NeuMF,
# val_epoch, parse_args, init_distributed, init_wandb and init_grace helpers.
def main():
    from grace_dl.dist.helper import timer, volume, tensor_bits

    args = parse_args()
    init_distributed(args)
    if args.weak_scaling:
        args.batch_size *= args.world_size
    init_wandb(args)
    init_grace(args)

    if args.local_rank == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir:
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()
    main_start_time = time.time()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:0'))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:0'))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:0'))

    valid_negative = test_negs.shape[1]
    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1
    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps)

    # use torch.mean() with dim later to avoid copy to host
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    # if args.distributed:
    #     model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    if args.local_rank == 0:
        print(model)
        print("{} parameters".format(utils.count_parameters(model)))
        # [print(parameter) for parameter in model.parameters()]

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={'best_eval_throughput': eval_throughput, 'hr@10': hr})
        return

    max_hr = 0
    best_epoch = 0
    # initialized here so the final summary cannot hit a NameError if no epoch
    # ever improves on max_hr
    best_model_timestamp = time.time()
    train_throughputs, eval_throughputs = [], []

    # broadcast model states from rank0 to other nodes !!! This is important!
    [torch.distributed.broadcast(p.data, src=0) for p in model.parameters()]

    # if args.local_rank == 0:
    #     save_initial_state_path = os.path.join(args.checkpoint_dir, 'model_init.pth')
    #     print("Saving the model to: ", save_initial_state_path)
    #     torch.save(model.state_dict(), save_initial_state_path)

    for epoch in range(args.epochs):
        begin = time.time()
        train_time = 0

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            batch_start = time.time()
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            # check grad sparsity
            if args.sparsity_check:
                total_nonzero = 0
                total_numel = 0
                for index, (name, p) in enumerate(model.named_parameters()):
                    sparsity = 1.0 - torch.sum(
                        p.grad.data.abs() > 0).float() / p.grad.data.numel()
                    total_nonzero += torch.sum(p.grad.data.abs() > 0).float()
                    total_numel += p.grad.data.numel()
                    if args.local_rank == 0:
                        wandb.log(
                            {f"{name}(sparsity)(numel={p.grad.data.numel()})": sparsity},
                            commit=False)
                if args.local_rank == 0:
                    wandb.log(
                        {f"total_sparsity(numel={total_numel})":
                         1 - total_nonzero / total_numel},
                        commit=True)

            # add grace just before optimizer.step()
            torch.cuda.synchronize()
            comm_start = time.time()
            for index, (name, p) in enumerate(model.named_parameters()):
                new_grad = args.grc.step(p.grad.data, name)
                p.grad.data = new_grad
            torch.cuda.synchronize()
            timer['comm'] = time.time() - comm_start

            # [torch.distributed.all_reduce(p.grad.data) for p in model.parameters()]
            # for param in model.parameters():
            #     dist.all_reduce(param.grad.data)
            #     param.grad.data /= float(args.world_size)

            optimizer.step()
            for p in model.parameters():
                p.grad = None

            if args.throughput:
                torch.cuda.synchronize()

            if args.log_time and args.local_rank == 0:
                timer['batch_time'] = time.time() - batch_start
                timer['computation'] = timer['batch_time'] - timer['comm']
                print("Timer:", timer, '\n')
                timer['en/decoding'] = 0
                timer['batch_time'] = 0
                timer['computation'] = 0
                timer['comm'] = 0

            if args.log_volume and args.local_rank == 0:
                ratio = volume['compress'] / volume['nocompress']
                volume['ratio_acc'].append(ratio)
                avg_ratio = sum(volume['ratio_acc']) / len(volume['ratio_acc'])
                print(f"Data volume:: compress {volume['compress']} "
                      f"no_compress {volume['nocompress']} "
                      f"ratio {ratio:.4f} avg_ratio {avg_ratio:.4f}")
                volume['compress'] = 0
                volume['nocompress'] = 0

            batch_throughput = args.batch_size / (time.time() - batch_start)  # global throughput
            train_time += time.time() - batch_start

            if (args.throughput or args.eval_at_every_batch) and args.local_rank == 0:
                print(f"Train :: Epoch [{epoch}/{args.epochs}] \t "
                      f"Batch [{i}/{num_batches}] \t "
                      f"Time {time.time() - batch_start:.5f} \t "
                      f"Throughput {batch_throughput:.2f}")
            if args.throughput and i == 3:
                break
            if args.local_rank == 0:
                print(f"Train :: Epoch [{epoch}/{args.epochs}] \t "
                      f"Batch [{i}/{num_batches}] \t "
                      f"Time {time.time() - batch_start:.5f} \t "
                      f"Throughput {batch_throughput:.2f}")

            if args.eval_at_every_batch:
                hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                                     real_indices, args.topk,
                                     samples_per_user=valid_negative + 1,
                                     num_user=all_test_users, epoch=epoch,
                                     distributed=args.distributed)
                if args.local_rank == 0:
                    wandb.log({"eval/hr@10": hr})

        del epoch_users, epoch_items, epoch_label
        # train_time = time.time() - begin

        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        if args.throughput:
            train_throughput = batch_throughput
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users, epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch,),
                     data={'train_throughput': train_throughput,
                           'hr@10': hr,
                           'train_epoch_time': train_time,
                           'validation_epoch_time': val_time,
                           'eval_throughput': eval_throughput})
        if args.local_rank == 0:
            wandb.log({"train_epoch_time": train_time,
                       'validation_epoch_time': val_time,
                       'eval_throughput': eval_throughput,
                       'train_throughput': train_throughput},
                      commit=False)
            if not args.eval_at_every_batch:
                wandb.log({"eval/hr@10": hr}, commit=False)
            wandb.log({"epoch": epoch})

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break
        if args.throughput:
            break

    if args.local_rank == 0:
        dllogger.log(step=tuple(),
                     data={'best_train_throughput': max(train_throughputs),
                           'best_eval_throughput': max(eval_throughputs),
                           'mean_train_throughput': np.mean(train_throughputs),
                           'mean_eval_throughput': np.mean(eval_throughputs),
                           'best_accuracy': max_hr,
                           'best_epoch': best_epoch,
                           'time_to_target': time.time() - main_start_time,
                           'time_to_best_model': best_model_timestamp - main_start_time})
        wandb.log({'best_train_throughput': max(train_throughputs),
                   'best_eval_throughput': max(eval_throughputs),
                   'mean_train_throughput': np.mean(train_throughputs),
                   'mean_eval_throughput': np.mean(eval_throughputs),
                   'best_accuracy': max_hr,
                   'best_epoch': best_epoch,
                   'time_to_target': time.time() - main_start_time,
                   'time_to_best_model': best_model_timestamp - main_start_time})
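# Entry-point guard for the second script, same sketch as above. Note that if
# both main() definitions shared one module, this one would shadow the first,
# so each is assumed to live in its own file:
if __name__ == '__main__':
    main()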