def main():
    log_hardware()
    args = parse_args()
    log_args(args)

    model = NeuMF(nb_users=args.n_users, nb_items=args.n_items, mf_dim=args.factors,
                  mlp_layer_sizes=args.layers, dropout=args.dropout)
    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.opt_level == "O2":
        model = amp.initialize(model, opt_level=args.opt_level,
                               keep_batchnorm_fp32=False, loss_scale='dynamic')

    users = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_users)
    items = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_items)

    latencies = []
    for _ in range(args.num_batches):
        torch.cuda.synchronize()
        start = time.time()
        predictions = model(users, items, sigmoid=True)
        torch.cuda.synchronize()
        latencies.append(time.time() - start)

    LOGGER.log(key='batch_size', value=args.batch_size)
    LOGGER.log(key='best_inference_throughput', value=args.batch_size / min(latencies))
    LOGGER.log(key='best_inference_latency', value=min(latencies))
    LOGGER.log(key='inference_latencies', value=latencies)
    return
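# Illustrative sketch (not from the original script): the benchmark above measures
# per-batch latency by synchronizing the GPU before starting the clock and again
# before stopping it, so asynchronous CUDA kernels are fully counted. The model and
# inputs below are stand-ins (a plain Linear layer), not the NeuMF from this script.
import time

import torch

def measure_latencies(model, inputs, num_batches=100):
    """Time each forward pass with explicit GPU synchronization."""
    latencies = []
    for _ in range(num_batches):
        torch.cuda.synchronize()       # drain pending work before starting the clock
        start = time.time()
        _ = model(*inputs)
        torch.cuda.synchronize()       # wait for this batch before stopping the clock
        latencies.append(time.time() - start)
    return latencies

if torch.cuda.is_available():
    _model = torch.nn.Linear(64, 1).cuda()
    _x = torch.randn(1024, 64, device='cuda')
    _lat = measure_latencies(_model, (_x,), num_batches=10)
    print('best latency: {:.6f} s, best throughput: {:.1f} samples/s'.format(
        min(_lat), 1024 / min(_lat)))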
def main():
    args = parse_args()
    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=args.log_path),
        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])
    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users, nb_items=args.n_items, mf_dim=args.factors,
                  mlp_layer_sizes=args.layers, dropout=args.dropout)
    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.opt_level == "O2":
        model = amp.initialize(model, opt_level=args.opt_level,
                               keep_batchnorm_fp32=False, loss_scale='dynamic')

    model.eval()

    users = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_users)
    items = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_items)

    latencies = []
    for _ in range(args.num_batches):
        torch.cuda.synchronize()
        start = time.time()
        predictions = model(users, items, sigmoid=True)
        torch.cuda.synchronize()
        latencies.append(time.time() - start)

    dllogger.log(data={
        'batch_size': args.batch_size,
        'best_inference_throughput': args.batch_size / min(latencies),
        'best_inference_latency': min(latencies),
        'mean_inference_throughput': args.batch_size / np.mean(latencies),
        'mean_inference_latency': np.mean(latencies),
        'inference_latencies': latencies
    }, step=tuple())
    dllogger.flush()
    return
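# Illustrative sketch (not from the original script): this variant adds model.eval()
# before timing; wrapping the timed loop in torch.no_grad() as well would skip
# autograd bookkeeping during pure inference. The no_grad wrapper is an addition
# here, not something the script above does.
import torch

_m = torch.nn.Linear(8, 1)
if torch.cuda.is_available():
    _m = _m.cuda()
_m.eval()                              # dropout and batchnorm switch to inference mode

with torch.no_grad():                  # no graph is recorded for these forward passes
    _inp = torch.randn(4, 8, device=next(_m.parameters()).device)
    _out = _m(_inp)
print(_out.requires_grad)              # False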
def main():
    args = parse_args()
    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=args.log_path),
        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])
    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users, nb_items=args.n_items, mf_dim=args.factors,
                  mlp_layer_sizes=args.layers, dropout=args.dropout)
    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.fp16:
        model.half()

    model.eval()

    batch_sizes = args.batch_sizes.split(',')
    batch_sizes = [int(s) for s in batch_sizes]
    result_data = {}

    for batch_size in batch_sizes:
        print('benchmarking batch size: ', batch_size)
        users = torch.cuda.LongTensor(batch_size).random_(0, args.n_users)
        items = torch.cuda.LongTensor(batch_size).random_(0, args.n_items)

        latencies = []
        for _ in range(args.num_batches):
            torch.cuda.synchronize()
            start = time.time()
            _ = model(users, items, sigmoid=True)
            torch.cuda.synchronize()
            latencies.append(time.time() - start)

        result_data[f'batch_{batch_size}_mean_throughput'] = batch_size / np.mean(latencies)
        result_data[f'batch_{batch_size}_mean_latency'] = np.mean(latencies)
        # np.percentile expects percentages in [0, 100], not fractions.
        result_data[f'batch_{batch_size}_p90_latency'] = np.percentile(latencies, 90)
        result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 95)
        result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 99)

    dllogger.log(data=result_data, step=tuple())
    dllogger.flush()
    return
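# Illustrative sketch (not from the original script): np.percentile expects percentages
# in the range [0, 100], so the tail-latency keys above use 90/95/99; passing 0.90 would
# report the 0.9th percentile instead of the 90th. Toy numbers below.
import numpy as np

_latencies = [0.010, 0.011, 0.012, 0.013, 0.030]
print(np.percentile(_latencies, 90), np.percentile(_latencies, 99))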
def main(): # Note: The run start is in convert.py args = parse_args() if args.seed is not None: print("Using seed = {}".format(args.seed)) torch.manual_seed(args.seed) np.random.seed(seed=args.seed) # Save configuration to file config = {k: v for k, v in args.__dict__.items()} config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp()) config['local_timestamp'] = str(datetime.now()) run_dir = "./run/neumf/{}".format(config['timestamp']) print("Saving config and results to {}".format(run_dir)) if not os.path.exists(run_dir) and run_dir != '': os.makedirs(run_dir) utils.save_config(config, run_dir) # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() t1 = time.time() # Load Data print('Loading data') train_dataset = CFTrainDataset( os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples) mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size) mlperf_log.ncf_print( key=mlperf_log.INPUT_ORDER) # set shuffle=True in DataLoader train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) test_ratings = load_test_ratings( os.path.join(args.data, TEST_RATINGS_FILENAME)) # noqa: E501 test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME)) nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' % (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz, len(test_ratings))) # Create model model = NeuMF(nb_users, nb_items, mf_dim=args.factors, mf_reg=0., mlp_layer_sizes=args.layers, mlp_layer_regs=[0. for i in args.layers]) print(model) print("{} parameters".format(utils.count_parameters(model))) # Save model text description with open(os.path.join(run_dir, 'model.txt'), 'w') as file: file.write(str(model)) # Add optimizer and loss to graph mlperf_log.ncf_print(key=mlperf_log.TRAIN_LEARN_RATE, value=args.learning_rate) beta1, beta2, epsilon = 0.9, 0.999, 1e-8 mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam") mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1) mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2) mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon) optimizer = torch.optim.Adam(model.parameters(), betas=(beta1, beta2), lr=args.learning_rate, eps=epsilon) mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE) criterion = nn.BCEWithLogitsLoss() if use_cuda: # Move model and loss to GPU model = model.cuda() criterion = criterion.cuda() # Create files for tracking training valid_results_file = os.path.join(run_dir, 'valid_results.csv') # Calculate initial Hit Ratio and NDCG hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk, use_cuda=use_cuda, processes=args.processes) print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format( K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs))) success = False mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP) for epoch in range(args.epochs): mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch) model.train() losses = utils.AverageMeter() mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG, value=train_dataset.nb_neg) mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN) begin = time.time() loader = tqdm.tqdm(train_dataloader) for batch_index, (user, item, label) in enumerate(loader): user = torch.autograd.Variable(user, requires_grad=False) item 
= torch.autograd.Variable(item, requires_grad=False) label = torch.autograd.Variable(label, requires_grad=False) if use_cuda: user = user.cuda(non_blocking=True) item = item.cuda(non_blocking=True) label = label.cuda(non_blocking=True) outputs = model(user, item) loss = criterion(outputs, label) losses.update(loss.data.item(), user.size(0)) optimizer.zero_grad() loss.backward() optimizer.step() # Save stats to file description = ( 'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format( epoch, loss=losses)) loader.set_description(description) train_time = time.time() - begin begin = time.time() hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk, use_cuda=use_cuda, output=valid_results_file, epoch=epoch, processes=args.processes) mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY, value={ "epoch": epoch, "value": float(np.mean(hits)) }) mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold) mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP) val_time = time.time() - begin print( 'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},' ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format( epoch=epoch, K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs), train_time=train_time, val_time=val_time)) if args.threshold is not None: if np.mean(hits) >= args.threshold: print("Hit threshold of {}".format(args.threshold)) success = True break mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success}) mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
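# Illustrative sketch (not from the original script): tensor.cuda(async=True) was valid
# in early PyTorch, but `async` became a reserved word in Python 3.7, so the loop above
# is written with non_blocking=True instead. The copy only overlaps with compute when
# the source tensor lives in pinned memory, which is why the DataLoader above passes
# pin_memory=True. The dataset below is a stand-in, just to show the combination.
import torch

if torch.cuda.is_available():
    _loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(torch.arange(1024), torch.ones(1024)),
        batch_size=256,
        pin_memory=True,               # pinned host memory enables async H2D copies
    )
    for _user, _label in _loader:
        _user = _user.cuda(non_blocking=True)    # replacement for .cuda(async=True)
        _label = _label.cuda(non_blocking=True)
        break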
def main(): args = parse_args() if args.seed is not None: print("Using seed = {}".format(args.seed)) torch.manual_seed(args.seed) np.random.seed(seed=args.seed) # Save configuration to file config = {k: v for k, v in args.__dict__.items()} config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp()) config['local_timestamp'] = str(datetime.now()) run_dir = "./run/neumf/{}".format(config['timestamp']) print("Saving config and results to {}".format(run_dir)) if not os.path.exists(run_dir) and run_dir != '': os.makedirs(run_dir) utils.save_config(config, run_dir) # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() # Check where to put data loader if use_cuda: dataloader_device = 'cpu' if args.cpu_dataloader else 'cuda' else: dataloader_device = 'cpu' # more like load trigger timmer now mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL, value=args.valid_negative) # The default of np.random.choice is replace=True, so does pytorch random_() mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True) mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True) mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN) # sync worker before timing. torch.cuda.synchronize() #=========================================================================== #== The clock starts on loading the preprocessed data. ===================== #=========================================================================== mlperf_log.ncf_print(key=mlperf_log.RUN_START) run_start_time = time.time() print(datetime.now(), "Loading test ratings.") test_ratings = [torch.LongTensor()] * args.user_scaling for chunk in range(args.user_scaling): test_ratings[chunk] = torch.from_numpy( np.load(args.data + '/testx' + str(args.user_scaling) + 'x' + str(args.item_scaling) + '_' + str(chunk) + '.npz', encoding='bytes')['arr_0']) fn_prefix = args.data + '/' + CACHE_FN.format(args.user_scaling, args.item_scaling) sampler_cache = fn_prefix + "cached_sampler.pkl" print(datetime.now(), "Loading preprocessed sampler.") if os.path.exists(args.data): print("Using alias file: {}".format(args.data)) with open(sampler_cache, "rb") as f: sampler, pos_users, pos_items, nb_items, _ = pickle.load(f) print(datetime.now(), "Alias table loaded.") nb_users = len(sampler.num_regions) train_users = torch.from_numpy(pos_users).type(torch.LongTensor) train_items = torch.from_numpy(pos_items).type(torch.LongTensor) mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(train_users)) # produce things not change between epoch # mask for filtering duplicates with real sample # note: test data is removed before create mask, same as reference # create label train_label = torch.ones_like(train_users, dtype=torch.float32) neg_label = torch.zeros_like(train_label, dtype=torch.float32) neg_label = neg_label.repeat(args.negative_samples) train_label = torch.cat((train_label, neg_label)) del neg_label test_pos = [l[:, 1].reshape(-1, 1) for l in test_ratings] test_negatives = [torch.LongTensor()] * args.user_scaling test_neg_items = [torch.LongTensor()] * args.user_scaling print(datetime.now(), "Loading test negatives.") for chunk in range(args.user_scaling): file_name = (args.data + '/test_negx' + str(args.user_scaling) + 'x' + str(args.item_scaling) + '_' + str(chunk) + '.npz') raw_data = np.load(file_name, encoding='bytes') test_negatives[chunk] = torch.from_numpy(raw_data['arr_0']) print( datetime.now(), "Test negative chunk {} of {} loaded ({} 
users).".format( chunk + 1, args.user_scaling, test_negatives[chunk].size())) test_neg_items = [l[:, 1] for l in test_negatives] # create items with real sample at last position test_items = [ torch.cat((a.reshape(-1, args.valid_negative), b), dim=1) for a, b in zip(test_neg_items, test_pos) ] del test_ratings, test_neg_items # generate dup mask and real indice for exact same behavior on duplication compare to reference # here we need a sort that is stable(keep order of duplicates) # this is a version works on integer sorted_items, indices = zip(*[torch.sort(l) for l in test_items]) # [1,1,1,2], [3,1,0,2] sum_item_indices = [ a.float() + b.float() / len(b[0]) for a, b in zip(sorted_items, indices) ] #[1.75,1.25,1.0,2.5] indices_order = [torch.sort(l)[1] for l in sum_item_indices] #[2,1,0,3] stable_indices = [ torch.gather(a, 1, b) for a, b in zip(indices, indices_order) ] #[0,1,3,2] # produce -1 mask dup_mask = [(l[:, 0:-1] == l[:, 1:]) for l in sorted_items] dup_mask = [ torch.cat((torch.zeros_like(a, dtype=torch.uint8), b), dim=1) for a, b in zip(test_pos, dup_mask) ] dup_mask = [ torch.gather(a, 1, b.sort()[1]) for a, b in zip(dup_mask, stable_indices) ] # produce real sample indices to later check in topk sorted_items, indices = zip(*[(a != b).sort() for a, b in zip(test_items, test_pos)]) sum_item_indices = [(a.float()) + (b.float()) / len(b[0]) for a, b in zip(sorted_items, indices)] indices_order = [torch.sort(l)[1] for l in sum_item_indices] stable_indices = [ torch.gather(a, 1, b) for a, b in zip(indices, indices_order) ] real_indices = [l[:, 0] for l in stable_indices] del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos # For our dataset, test set is identical to user set, so arange() provides # all test users. test_users = torch.arange(nb_users, dtype=torch.long) test_users = test_users[:, None] test_users = test_users + torch.zeros(1 + args.valid_negative, dtype=torch.long) # test_items needs to be of type Long in order to be used in embedding test_items = torch.cat(test_items).type(torch.long) dup_mask = torch.cat(dup_mask) real_indices = torch.cat(real_indices) # make pytorch memory behavior more consistent later torch.cuda.empty_cache() mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size) mlperf_log.ncf_print( key=mlperf_log.INPUT_ORDER) # we shuffled later with randperm print( datetime.now(), "Data loading done {:.1f} sec. #user={}, #item={}, #train={}, #test={}" .format(time.time() - run_start_time, nb_users, nb_items, len(train_users), nb_users)) # Create model model = NeuMF(nb_users, nb_items, mf_dim=args.factors, mf_reg=0., mlp_layer_sizes=args.layers, mlp_layer_regs=[0. 
for i in args.layers]) print(model) print("{} parameters".format(utils.count_parameters(model))) # Save model text description with open(os.path.join(run_dir, 'model.txt'), 'w') as file: file.write(str(model)) # Add optimizer and loss to graph params = model.parameters() optimizer = torch.optim.Adam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps) criterion = nn.BCEWithLogitsLoss( reduction='none' ) # use torch.mean() with dim later to avoid copy to host mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate) mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam") mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=args.beta1) mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=args.beta2) mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=args.eps) mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE) if use_cuda: # Move model and loss to GPU model = model.cuda() criterion = criterion.cuda() local_batch = args.batch_size traced_criterion = torch.jit.trace( criterion.forward, (torch.rand(local_batch, 1), torch.rand(local_batch, 1))) # Create files for tracking training valid_results_file = os.path.join(run_dir, 'valid_results.csv') # Calculate initial Hit Ratio and NDCG samples_per_user = test_items.size(1) users_per_valid_batch = args.valid_batch_size // samples_per_user test_users = test_users.split(users_per_valid_batch) test_items = test_items.split(users_per_valid_batch) dup_mask = dup_mask.split(users_per_valid_batch) real_indices = real_indices.split(users_per_valid_batch) hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk, samples_per_user=samples_per_user, num_user=nb_users) print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format( K=args.topk, hit_rate=hr, ndcg=ndcg)) success = False mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP) for epoch in range(args.epochs): mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch) mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG, value=args.negative_samples) mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN) begin = time.time() st = timeit.default_timer() if args.random_negatives: neg_users = train_users.repeat(args.negative_samples) neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_( 0, nb_items) else: negatives = generate_negatives(sampler, args.negative_samples, train_users.numpy()) negatives = torch.from_numpy(negatives) neg_users = negatives[:, 0] neg_items = negatives[:, 1] print("generate_negatives loop time: {:.2f}", timeit.default_timer() - st) after_neg_gen = time.time() st = timeit.default_timer() epoch_users = torch.cat((train_users, neg_users)) epoch_items = torch.cat((train_items, neg_items)) del neg_users, neg_items # shuffle prepared data and split into batches epoch_indices = torch.randperm(len(epoch_users), device=dataloader_device) epoch_size = len(epoch_indices) epoch_users = epoch_users[epoch_indices] epoch_items = epoch_items[epoch_indices] epoch_label = train_label[epoch_indices] epoch_users_list = epoch_users.split(local_batch) epoch_items_list = epoch_items.split(local_batch) epoch_label_list = epoch_label.split(local_batch) print("shuffle time: {:.2f}", timeit.default_timer() - st) # only print progress bar on rank 0 num_batches = (epoch_size + args.batch_size - 1) // args.batch_size qbar = tqdm.tqdm(range(num_batches)) # handle extremely rare case where last batch size < number of worker if len(epoch_users_list) < num_batches: 
print("epoch_size % batch_size < number of worker!") exit(1) after_shuffle = time.time() neg_gen_time = (after_neg_gen - begin) shuffle_time = (after_shuffle - after_neg_gen) for i in qbar: # selecting input from prepared data user = epoch_users_list[i].cuda() item = epoch_items_list[i].cuda() label = epoch_label_list[i].view(-1, 1).cuda() for p in model.parameters(): p.grad = None outputs = model(user, item) loss = traced_criterion(outputs, label).float() loss = torch.mean(loss.view(-1), 0) loss.backward() optimizer.step() del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label train_time = time.time() - begin begin = time.time() mlperf_log.ncf_print(key=mlperf_log.EVAL_START, value=epoch) hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk, samples_per_user=samples_per_user, num_user=nb_users, output=valid_results_file, epoch=epoch, loss=loss.data.item()) val_time = time.time() - begin print( 'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},' ' train_time = {train_time:.2f}, val_time = {val_time:.2f}, loss = {loss:.4f},' ' neg_gen: {neg_gen_time:.4f}, shuffle_time: {shuffle_time:.2f}'. format(epoch=epoch, K=args.topk, hit_rate=hr, ndcg=ndcg, train_time=train_time, val_time=val_time, loss=loss.data.item(), neg_gen_time=neg_gen_time, shuffle_time=shuffle_time)) mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY, value={ "epoch": epoch, "value": hr }) mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold) mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP, value=epoch) if args.threshold is not None: if hr >= args.threshold: print("Hit threshold of {}".format(args.threshold)) success = True break mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success}) run_stop_time = time.time() mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL) # easy way of tracking mlperf score if success: print("mlperf_score", run_stop_time - run_start_time)
def main(): log_hardware() args = parse_args() args.distributed, args.world_size = init_distributed(args.local_rank) log_args(args) if args.seed is not None: torch.manual_seed(args.seed) print("Saving results to {}".format(args.checkpoint_dir)) if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '': os.makedirs(args.checkpoint_dir, exist_ok=True) # The default of np.random.choice is replace=True, so does pytorch random_() LOGGER.log(key=tags.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True) LOGGER.log(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True) LOGGER.log(key=tags.INPUT_STEP_EVAL_NEG_GEN) # sync workers before timing if args.distributed: torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0) torch.cuda.synchronize() main_start_time = time.time() LOGGER.log(key=tags.RUN_START) train_ratings = torch.load(args.data + '/train_ratings.pt', map_location=torch.device('cuda:{}'.format( args.local_rank))) test_ratings = torch.load(args.data + '/test_ratings.pt', map_location=torch.device('cuda:{}'.format( args.local_rank))) test_negs = torch.load(args.data + '/test_negatives.pt', map_location=torch.device('cuda:{}'.format( args.local_rank))) valid_negative = test_negs.shape[1] LOGGER.log(key=tags.PREPROC_HP_NUM_EVAL, value=valid_negative) nb_maxs = torch.max(train_ratings, 0)[0] nb_users = nb_maxs[0].item() + 1 nb_items = nb_maxs[1].item() + 1 LOGGER.log(key=tags.INPUT_SIZE, value=len(train_ratings)) all_test_users = test_ratings.shape[0] test_users, test_items, dup_mask, real_indices = dataloading.create_test_data( test_ratings, test_negs, args) # make pytorch memory behavior more consistent later torch.cuda.empty_cache() LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=args.batch_size) LOGGER.log(key=tags.INPUT_ORDER) # we shuffled later with randperm # Create model model = NeuMF(nb_users, nb_items, mf_dim=args.factors, mlp_layer_sizes=args.layers, dropout=args.dropout) optimizer = FusedAdam(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps) criterion = nn.BCEWithLogitsLoss( reduction='none' ) # use torch.mean() with dim later to avoid copy to host # Move model and loss to GPU model = model.cuda() criterion = criterion.cuda() if args.opt_level == "O2": model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, keep_batchnorm_fp32=False, loss_scale='dynamic') if args.distributed: model = DDP(model) local_batch = args.batch_size // args.world_size traced_criterion = torch.jit.trace( criterion.forward, (torch.rand(local_batch, 1), torch.rand(local_batch, 1))) print(model) print("{} parameters".format(admm_utils.count_parameters(model))) LOGGER.log(key=tags.OPT_LR, value=args.learning_rate) LOGGER.log(key=tags.OPT_NAME, value="Adam") LOGGER.log(key=tags.OPT_HP_ADAM_BETA1, value=args.beta1) LOGGER.log(key=tags.OPT_HP_ADAM_BETA2, value=args.beta2) LOGGER.log(key=tags.OPT_HP_ADAM_EPSILON, value=args.eps) LOGGER.log(key=tags.MODEL_HP_LOSS_FN, value=tags.VALUE_BCE) if args.load_checkpoint_path: state_dict = torch.load(args.load_checkpoint_path) state_dict = { k.replace('module.', ''): v for k, v in state_dict.items() } model.load_state_dict(state_dict) if args.mode == 'test': LOGGER.log(key=tags.EVAL_START, value=0) start = time.time() hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk, samples_per_user=valid_negative + 1, num_user=all_test_users, distributed=args.distributed) print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format( K=args.topk, hit_rate=hr, ndcg=ndcg)) 
val_time = time.time() - start eval_size = all_test_users * (valid_negative + 1) eval_throughput = eval_size / val_time LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": 0, "value": hr}) LOGGER.log(key=tags.EVAL_STOP, value=0) LOGGER.log(key='best_eval_throughput', value=eval_throughput) return success = False max_hr = 0 train_throughputs, eval_throughputs = [], [] LOGGER.log(key=tags.TRAIN_LOOP) for epoch in range(args.epochs): LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch) LOGGER.log(key=tags.INPUT_HP_NUM_NEG, value=args.negative_samples) LOGGER.log(key=tags.INPUT_STEP_TRAIN_NEG_GEN) begin = time.time() epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data( train_ratings, nb_items, args) num_batches = len(epoch_users) for i in range(num_batches // args.grads_accumulated): for j in range(args.grads_accumulated): batch_idx = (args.grads_accumulated * i) + j user = epoch_users[batch_idx] item = epoch_items[batch_idx] label = epoch_label[batch_idx].view(-1, 1) outputs = model(user, item) loss = traced_criterion(outputs, label).float() loss = torch.mean(loss.view(-1), 0) if args.opt_level == "O2": with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() for p in model.parameters(): p.grad = None del epoch_users, epoch_items, epoch_label train_time = time.time() - begin begin = time.time() epoch_samples = len(train_ratings) * (args.negative_samples + 1) train_throughput = epoch_samples / train_time train_throughputs.append(train_throughput) LOGGER.log(key='train_throughput', value=train_throughput) LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch) LOGGER.log(key=tags.EVAL_START, value=epoch) hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk, samples_per_user=valid_negative + 1, num_user=all_test_users, epoch=epoch, distributed=args.distributed) val_time = time.time() - begin print( 'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},' ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format( epoch=epoch, K=args.topk, hit_rate=hr, ndcg=ndcg, train_time=train_time, val_time=val_time)) LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": epoch, "value": hr}) LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold) LOGGER.log(key=tags.EVAL_STOP, value=epoch) eval_size = all_test_users * (valid_negative + 1) eval_throughput = eval_size / val_time eval_throughputs.append(eval_throughput) LOGGER.log(key='eval_throughput', value=eval_throughput) if hr > max_hr and args.local_rank == 0: max_hr = hr save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth') print("New best hr! Saving the model to: ", save_checkpoint_path) torch.save(model.state_dict(), save_checkpoint_path) best_model_timestamp = time.time() if args.threshold is not None: if hr >= args.threshold: print("Hit threshold of {}".format(args.threshold)) success = True break if args.local_rank == 0: LOGGER.log(key='best_train_throughput', value=max(train_throughputs)) LOGGER.log(key='best_eval_throughput', value=max(eval_throughputs)) LOGGER.log(key='best_accuracy', value=max_hr) LOGGER.log(key='time_to_target', value=time.time() - main_start_time) LOGGER.log(key='time_to_best_model', value=best_model_timestamp - main_start_time) LOGGER.log(key=tags.RUN_STOP, value={"success": success}) LOGGER.log(key=tags.RUN_FINAL)
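# Illustrative sketch (not from the original script): the inner loop above accumulates
# gradients over args.grads_accumulated micro-batches before a single optimizer.step(),
# scaling the loss through Apex amp when O2 is active. The reduced version below keeps
# the same control flow but uses a stand-in linear model, synthetic batches and stock
# torch.optim.Adam so that it is self-contained.
import torch

_model = torch.nn.Linear(16, 1)
_optimizer = torch.optim.Adam(_model.parameters(), lr=1e-3)
_criterion = torch.nn.BCEWithLogitsLoss(reduction='none')

_grads_accumulated = 4
_batches = [(torch.randn(8, 16), torch.randint(0, 2, (8, 1)).float()) for _ in range(8)]

for _i in range(len(_batches) // _grads_accumulated):
    for _j in range(_grads_accumulated):
        _x, _y = _batches[_grads_accumulated * _i + _j]
        _loss = _criterion(_model(_x), _y).mean()
        _loss.backward()                 # gradients from each micro-batch accumulate in .grad
    _optimizer.step()                    # one update per group of micro-batches
    for _p in _model.parameters():
        _p.grad = None                   # mirrors the script: cheaper than zero_grad()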
def main(): args = parse_args() if args.seed is not None: print("Using seed = {}".format(args.seed)) torch.manual_seed(args.seed) np.random.seed(seed=args.seed) # Save configuration to file config = {k: v for k, v in args.__dict__.items()} config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp()) config['local_timestamp'] = str(datetime.now()) run_dir = "./run/neumf/{}".format(config['timestamp']) print("Saving config and results to {}".format(run_dir)) if not os.path.exists(run_dir) and run_dir != '': os.makedirs(run_dir) utils.save_config(config, run_dir) # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() t1 = time.time() # Load Data print('Loading data') train_dataset = CFTrainDataset( os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples) train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) test_ratings = load_test_ratings( os.path.join(args.data, TEST_RATINGS_FILENAME)) # noqa: E501 test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME)) nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' % (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz, len(test_ratings))) # Create model model = NeuMF(nb_users, nb_items, mf_dim=args.factors, mf_reg=0., mlp_layer_sizes=args.layers, mlp_layer_regs=[0. for i in args.layers]) print(model) print("{} parameters".format(utils.count_parameters(model))) # Save model text description with open(os.path.join(run_dir, 'model.txt'), 'w') as file: file.write(str(model)) # Add optimizer and loss to graph optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) criterion = nn.BCEWithLogitsLoss() if use_cuda: # Move model and loss to GPU model = model.cuda() criterion = criterion.cuda() # Create files for tracking training valid_results_file = os.path.join(run_dir, 'valid_results.csv') # Calculate initial Hit Ratio and NDCG hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk, use_cuda=use_cuda, processes=args.processes) print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format( K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs))) for epoch in range(args.epochs): model.train() losses = utils.AverageMeter() begin = time.time() loader = tqdm.tqdm(train_dataloader) length = len(loader) if length < 101: print( 'Exiting, cannot profile the required 100 iterations. Please re-run with a larger batch size.' ) cuda.profile_stop() exit() for batch_index, (user, item, label) in enumerate(loader): if batch_index == length // 2 and epoch == 0: print('Starting profiling for 100 iterations.') cuda.profile_start() if batch_index == length // 2 + 100 and epoch == 0: print( 'Profiling completed, stopping profiling and continuing training.' 
) cuda.profile_stop() user = torch.autograd.Variable(user, requires_grad=False) item = torch.autograd.Variable(item, requires_grad=False) label = torch.autograd.Variable(label, requires_grad=False) if use_cuda: user = user.cuda(non_blocking=True) item = item.cuda(non_blocking=True) label = label.cuda(non_blocking=True) outputs = model(user, item) loss = criterion(outputs, label) losses.update(loss.data.item(), user.size(0)) optimizer.zero_grad() loss.backward() optimizer.step() # Save stats to file description = ( 'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format( epoch, loss=losses)) loader.set_description(description) train_time = time.time() - begin begin = time.time() hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk, use_cuda=use_cuda, output=valid_results_file, epoch=epoch, processes=args.processes) val_time = time.time() - begin print( 'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},' ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format( epoch=epoch, K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs), train_time=train_time, val_time=val_time)) if args.threshold is not None: if np.mean(hits) >= args.threshold: print("Hit threshold of {}".format(args.threshold)) return 0
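# Illustrative sketch (not from the original script): the profiling variant above
# brackets 100 mid-epoch iterations between cuda.profile_start() and
# cuda.profile_stop(); the import behind that `cuda` name is outside this snippet.
# torch.cuda.profiler exposes equivalent start/stop hooks, intended for use with an
# external profiler launched with capture initially disabled (e.g. nvprof
# --profile-from-start off, or nsys with --capture-range=cudaProfilerApi).
import torch.cuda.profiler as profiler

def run_with_profiling_window(step_fn, num_steps, warmup, profile_iters=100):
    """Run step_fn for num_steps iterations, capturing only a fixed window."""
    for step in range(num_steps):
        if step == warmup:
            profiler.start()       # analogous to cuda.profile_start() above
        if step == warmup + profile_iters:
            profiler.stop()        # analogous to cuda.profile_stop() above
        step_fn(step)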
def main(): args = parse_args() if args.seed is not None: print("Using seed = {}".format(args.seed)) torch.manual_seed(args.seed) np.random.seed(seed=args.seed) # Save configuration to file config = {k: v for k, v in args.__dict__.items()} config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp()) config['local_timestamp'] = str(datetime.now()) run_dir = "./run/neumf/{}".format(config['timestamp']) print("Saving config and results to {}".format(run_dir)) if not os.path.exists(run_dir) and run_dir != '': os.makedirs(run_dir) utils.save_config(config, run_dir) # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() t1 = time.time() # Load Data # TODO: Reading CSVs is slow. Could use HDF or Apache Arrow train_dataset = CFTrainDataset( os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples) print('batchsize=%d' % args.batch_size) train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=8, pin_memory=True) test_ratings = load_test_ratings( os.path.join(args.data, TEST_RATINGS_FILENAME)) # noqa: E501 test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME)) nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' % (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz, len(test_ratings))) # Create model model = NeuMF(nb_users, nb_items, mf_dim=args.factors, mf_reg=0., mlp_layer_sizes=args.layers, mlp_layer_regs=[0. for i in args.layers]) print(model) print("{} parameters".format(utils.count_parameters(model))) # Save model text description with open(os.path.join(run_dir, 'model.txt'), 'w') as file: file.write(str(model)) # Add optimizer and loss to graph optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) criterion = nn.BCEWithLogitsLoss() if use_cuda: # Move model and loss to GPU model = model.cuda() criterion = criterion.cuda() # Create files for tracking training valid_results_file = os.path.join(run_dir, 'valid_results.csv') # Calculate initial Hit Ratio and NDCG hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk, use_cuda=use_cuda) print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format( K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs))) for epoch in range(args.epochs): model.train() losses = utils.AverageMeter() begin = time.time() loader = tqdm.tqdm(train_dataloader) counting_data = 0 counting_forward = 0 counting_zerograd = 0 counting_backward = 0 counting_updateweight = 0 counting_des = 0 for batch_index, (user, item, label) in enumerate(loader): start0 = time.time() user = torch.autograd.Variable(user, requires_grad=False) item = torch.autograd.Variable(item, requires_grad=False) label = torch.autograd.Variable(label, requires_grad=False) if use_cuda: user = user.cuda(async=True) item = item.cuda(async=True) label = label.cuda(async=True) start1 = time.time() outputs = model(user, item) loss = criterion(outputs, label) losses.update(loss.data.item(), user.size(0)) start2 = time.time() optimizer.zero_grad() start3 = time.time() loss.backward() start4 = time.time() optimizer.step() start5 = time.time() # Save stats to file description = ( 'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format( epoch, loss=losses)) loader.set_description(description) start6 = time.time() counting_data += start1 - start0 counting_forward += start2 - start1 counting_zerograd += start3 - start2 counting_backward += 
start4 - start3 counting_updateweight += start5 - start4 counting_des += start6 - start5 train_time = time.time() - begin begin = time.time() hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk, use_cuda=use_cuda, output=valid_results_file, epoch=epoch) val_time = time.time() - begin print( 'data: {data:.4f}, forward: {ft:.4f}, zerograd: {zg:.4f}, backward: {bw:.4f},' ' adam: {adam:.4f}, description: {des:.4f}'.format( data=counting_data, ft=counting_forward, zg=counting_zerograd, bw=counting_backward, adam=counting_updateweight, des=counting_des)) print( 'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},' ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format( epoch=epoch, K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs), train_time=train_time, val_time=val_time)) if args.threshold is not None: if np.mean(hits) >= args.threshold: print("Hit threshold of {}".format(args.threshold)) return 0
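# Illustrative sketch (not from the original script): the loop above accumulates
# wall-clock time for the data-transfer, forward, zero_grad, backward and step phases
# with time.time() deltas. Because CUDA kernels run asynchronously, such a timer is
# more faithful if it synchronizes around each region; the helper below is an
# assumption-level illustration of that pattern, not code from the repo.
import time
from collections import defaultdict

import torch

phase_totals = defaultdict(float)

class PhaseTimer:
    """Accumulate time per named phase, synchronizing so async kernels are counted."""
    def __init__(self, name):
        self.name = name
    def __enter__(self):
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        self.start = time.time()
    def __exit__(self, *exc):
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        phase_totals[self.name] += time.time() - self.start

_model = torch.nn.Linear(16, 1)
_opt = torch.optim.SGD(_model.parameters(), lr=0.1)
_x, _y = torch.randn(32, 16), torch.randn(32, 1)

with PhaseTimer('forward'):
    _loss = torch.nn.functional.mse_loss(_model(_x), _y)
with PhaseTimer('backward'):
    _opt.zero_grad()
    _loss.backward()
with PhaseTimer('step'):
    _opt.step()
print(dict(phase_totals))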
def main():
    args = parse_args()

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # Create model
    model = NeuMF(2197225, 855776,
                  mf_dim=64, mf_reg=0.,
                  mlp_layer_sizes=[256, 256, 128, 64],
                  mlp_layer_regs=[0. for i in [256, 256, 128, 64]])
    print(model)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()

    if args.load_ckp:
        ckp = torch.load(args.load_ckp)
        model.load_state_dict(ckp)

    if args.quantize:
        all_embeding = [n for n, m in model.named_modules()
                        if isinstance(m, nn.Embedding)]
        all_linear = [n for n, m in model.named_modules()
                      if isinstance(m, nn.Linear)]
        all_relu = [n for n, m in model.named_modules()
                    if isinstance(m, nn.ReLU)]
        all_relu6 = [n for n, m in model.named_modules()
                     if isinstance(m, nn.ReLU6)]
        # layers = all_relu + all_relu6 + all_linear
        layers = all_embeding
        replacement_factory = {
            nn.ReLU: ActivationModuleWrapperPost,
            nn.ReLU6: ActivationModuleWrapperPost,
            nn.Linear: ParameterModuleWrapperPost,
            nn.Embedding: ActivationModuleWrapperPost
        }
        mq = ModelQuantizer(model, args, layers, replacement_factory)
        # mq.log_quantizer_state(ml_logger, -1)

    test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user = data_loader(args.data)
    data = NcfData(test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user)

    hr, ndcg = val(model, data)
    print('')
    print('')
    print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(K=K, hit_rate=hr, ndcg=ndcg))
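# Illustrative sketch (not from the original script): the quantization path above picks
# layers by walking named_modules() and filtering on module type before handing the
# names to ModelQuantizer. The toy model below shows just that selection step;
# ModelQuantizer itself belongs to the external quantization package and is not
# reproduced here.
import torch.nn as nn

_toy = nn.Sequential(
    nn.Embedding(100, 8),
    nn.Linear(8, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
_embeddings = [n for n, m in _toy.named_modules() if isinstance(m, nn.Embedding)]
_linears = [n for n, m in _toy.named_modules() if isinstance(m, nn.Linear)]
_relus = [n for n, m in _toy.named_modules() if isinstance(m, nn.ReLU)]
print(_embeddings, _linears, _relus)   # ['0'] ['1', '3'] ['2']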
def main(): args = parse_args() init_distributed(args) if args.local_rank == 0: dllogger.init(backends=[ dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE) ]) else: dllogger.init(backends=[]) dllogger.log(data=vars(args), step='PARAMETER') if args.seed is not None: torch.manual_seed(args.seed) if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir: print("Saving results to {}".format(args.checkpoint_dir)) os.makedirs(args.checkpoint_dir, exist_ok=True) # sync workers before timing if args.distributed: torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0) torch.cuda.synchronize() main_start_time = time.time() train_ratings = torch.load(args.data + '/train_ratings.pt', map_location=torch.device('cuda:{}'.format( args.local_rank))) test_ratings = torch.load(args.data + '/test_ratings.pt', map_location=torch.device('cuda:{}'.format( args.local_rank))) test_negs = torch.load(args.data + '/test_negatives.pt', map_location=torch.device('cuda:{}'.format( args.local_rank))) valid_negative = test_negs.shape[1] nb_maxs = torch.max(train_ratings, 0)[0] nb_users = nb_maxs[0].item() + 1 nb_items = nb_maxs[1].item() + 1 all_test_users = test_ratings.shape[0] test_users, test_items, dup_mask, real_indices = dataloading.create_test_data( test_ratings, test_negs, args) # make pytorch memory behavior more consistent later torch.cuda.empty_cache() # Create model model = NeuMF(nb_users, nb_items, mf_dim=args.factors, mlp_layer_sizes=args.layers, dropout=args.dropout) optimizer = FusedAdam(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps) criterion = nn.BCEWithLogitsLoss( reduction='none' ) # use torch.mean() with dim later to avoid copy to host # Move model and loss to GPU model = model.cuda() criterion = criterion.cuda() if args.amp: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale='dynamic') if args.distributed: model = DDP(model) local_batch = args.batch_size // args.world_size traced_criterion = torch.jit.trace( criterion.forward, (torch.rand(local_batch, 1), torch.rand(local_batch, 1))) print(model) print("{} parameters".format(utils.count_parameters(model))) if args.load_checkpoint_path: state_dict = torch.load(args.load_checkpoint_path) state_dict = { k.replace('module.', ''): v for k, v in state_dict.items() } model.load_state_dict(state_dict) if args.mode == 'test': start = time.time() hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk, samples_per_user=valid_negative + 1, num_user=all_test_users, distributed=args.distributed) val_time = time.time() - start eval_size = all_test_users * (valid_negative + 1) eval_throughput = eval_size / val_time dllogger.log(step=tuple(), data={ 'best_eval_throughput': eval_throughput, 'hr@10': hr }) return max_hr = 0 best_epoch = 0 train_throughputs, eval_throughputs = [], [] for epoch in range(args.epochs): begin = time.time() epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data( train_ratings, nb_items, args) num_batches = len(epoch_users) for i in range(num_batches // args.grads_accumulated): for j in range(args.grads_accumulated): batch_idx = (args.grads_accumulated * i) + j user = epoch_users[batch_idx] item = epoch_items[batch_idx] label = epoch_label[batch_idx].view(-1, 1) outputs = model(user, item) loss = traced_criterion(outputs, label).float() loss = torch.mean(loss.view(-1), 0) if 
args.amp: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() for p in model.parameters(): p.grad = None del epoch_users, epoch_items, epoch_label train_time = time.time() - begin begin = time.time() epoch_samples = len(train_ratings) * (args.negative_samples + 1) train_throughput = epoch_samples / train_time train_throughputs.append(train_throughput) hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk, samples_per_user=valid_negative + 1, num_user=all_test_users, epoch=epoch, distributed=args.distributed) val_time = time.time() - begin eval_size = all_test_users * (valid_negative + 1) eval_throughput = eval_size / val_time eval_throughputs.append(eval_throughput) dllogger.log(step=(epoch, ), data={ 'train_throughput': train_throughput, 'hr@10': hr, 'train_epoch_time': train_time, 'validation_epoch_time': val_time, 'eval_throughput': eval_throughput }) if hr > max_hr and args.local_rank == 0: max_hr = hr best_epoch = epoch print("New best hr!") if args.checkpoint_dir: save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth') print("Saving the model to: ", save_checkpoint_path) torch.save(model.state_dict(), save_checkpoint_path) best_model_timestamp = time.time() if args.threshold is not None: if hr >= args.threshold: print("Hit threshold of {}".format(args.threshold)) break if args.local_rank == 0: dllogger.log(data={ 'best_train_throughput': max(train_throughputs), 'best_eval_throughput': max(eval_throughputs), 'mean_train_throughput': np.mean(train_throughputs), 'mean_eval_throughput': np.mean(eval_throughputs), 'best_accuracy': max_hr, 'best_epoch': best_epoch, 'time_to_target': time.time() - main_start_time, 'time_to_best_model': best_model_timestamp - main_start_time }, step=tuple())
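# Illustrative sketch (not from the original script): checkpoints written from a
# DDP-wrapped model carry a 'module.' prefix on every key, which the loader above
# strips so the same file also loads into an unwrapped model. Toy round trip:
import torch
import torch.nn as nn

_m = nn.Linear(4, 2)
# Simulate a state dict saved from a DistributedDataParallel/DataParallel wrapper.
_wrapped = {'module.' + k: v for k, v in _m.state_dict().items()}
# Strip the prefix before loading, exactly as done in the scripts above.
_clean = {k.replace('module.', ''): v for k, v in _wrapped.items()}
_m.load_state_dict(_clean)
print(list(_clean))  # ['weight', 'bias']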
def main(): global msglogger script_dir = os.path.dirname(__file__) args = parse_args() # Distiller loggers msglogger = apputils.config_pylogger('logging.conf', args.name, output_dir=args.output_dir) tflogger = TensorBoardLogger(msglogger.logdir) # tflogger.log_gradients = True # pylogger = PythonLogger(msglogger) if args.seed is not None: msglogger.info("Using seed = {}".format(args.seed)) torch.manual_seed(args.seed) np.random.seed(seed=args.seed) args.qe_mode = str(args.qe_mode).split('.')[1] args.qe_clip_acts = str(args.qe_clip_acts).split('.')[1] apputils.log_execution_env_state(sys.argv) if args.gpus is not None: try: args.gpus = [int(s) for s in args.gpus.split(',')] except ValueError: msglogger.error( 'ERROR: Argument --gpus must be a comma-separated list of integers only' ) exit(1) if len(args.gpus) > 1: msglogger.error('ERROR: Only single GPU supported for NCF') exit(1) available_gpus = torch.cuda.device_count() for dev_id in args.gpus: if dev_id >= available_gpus: msglogger.error( 'ERROR: GPU device ID {0} requested, but only {1} devices available' .format(dev_id, available_gpus)) exit(1) # Set default device in case the first one on the list != 0 torch.cuda.set_device(args.gpus[0]) # Save configuration to file config = {k: v for k, v in args.__dict__.items()} config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp()) config['local_timestamp'] = str(datetime.now()) run_dir = msglogger.logdir msglogger.info("Saving config and results to {}".format(run_dir)) if not os.path.exists(run_dir) and run_dir != '': os.makedirs(run_dir) utils.save_config(config, run_dir) # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() t1 = time.time() # Load Data training = not (args.eval or args.qe_calibration or args.activation_histograms) msglogger.info('Loading data') if training: train_dataset = CFTrainDataset( os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples) train_dataloader = torch.utils.data.DataLoader( dataset=train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items else: train_dataset = None train_dataloader = None nb_users, nb_items = (138493, 26744) test_ratings = load_test_ratings( os.path.join(args.data, TEST_RATINGS_FILENAME)) # noqa: E501 test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME)) msglogger.info( 'Load data done [%.1f s]. #user=%d, #item=%d, #train=%s, #test=%d' % (time.time() - t1, nb_users, nb_items, str(train_dataset.mat.nnz) if training else 'N/A', len(test_ratings))) # Create model model = NeuMF(nb_users, nb_items, mf_dim=args.factors, mf_reg=0., mlp_layer_sizes=args.layers, mlp_layer_regs=[0. 
for i in args.layers], split_final=args.split_final) if use_cuda: model = model.cuda() msglogger.info(model) msglogger.info("{} parameters".format(utils.count_parameters(model))) # Save model text description with open(os.path.join(run_dir, 'model.txt'), 'w') as file: file.write(str(model)) compression_scheduler = None start_epoch = 0 optimizer = None if args.load: if training: model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint( model, args.load) if args.reset_optimizer: start_epoch = 0 optimizer = None else: model = apputils.load_lean_checkpoint(model, args.load) # Add loss to graph criterion = nn.BCEWithLogitsLoss() if use_cuda: criterion = criterion.cuda() if training and optimizer is None: optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) msglogger.info('Optimizer Type: %s', type(optimizer)) msglogger.info('Optimizer Args: %s', optimizer.defaults) if args.compress: compression_scheduler = distiller.file_config(model, optimizer, args.compress) model.cuda() # Create files for tracking training valid_results_file = os.path.join(run_dir, 'valid_results.csv') if args.qe_calibration or args.activation_histograms: calib = { 'portion': args.qe_calibration, 'desc_str': 'quantization calibration stats', 'collect_func': partial(distiller.data_loggers.collect_quant_stats, inplace_runtime_check=True, disable_inplace_attrs=True) } hists = { 'portion': args.activation_histograms, 'desc_str': 'activation histograms', 'collect_func': partial(distiller.data_loggers.collect_histograms, activation_stats=None, nbins=2048, save_hist_imgs=True) } d = calib if args.qe_calibration else hists distiller.utils.assign_layer_fq_names(model) num_users = int(np.floor(len(test_ratings) * d['portion'])) msglogger.info( "Generating {} based on {:.1%} of the test-set ({} users)".format( d['desc_str'], d['portion'], num_users)) test_fn = partial(val_epoch, ratings=test_ratings, negs=test_negs, K=args.topk, use_cuda=use_cuda, processes=args.processes, num_users=num_users) d['collect_func'](model=model, test_fn=test_fn, save_dir=run_dir, classes=None) return 0 if args.eval: if args.quantize_eval and args.qe_calibration is None: model.cpu() quantizer = quantization.PostTrainLinearQuantizer.from_args( model, args) dummy_input = (torch.tensor([1]), torch.tensor([1]), torch.tensor([True], dtype=torch.bool)) quantizer.prepare_model(dummy_input) model.cuda() distiller.utils.assign_layer_fq_names(model) if args.eval_fp16: model = model.half() # Calculate initial Hit Ratio and NDCG begin = time.time() hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk, use_cuda=use_cuda, processes=args.processes) val_time = time.time() - begin hit_rate = np.mean(hits) msglogger.info( 'Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, val_time = {val_time:.2f}' .format(K=args.topk, hit_rate=hit_rate, ndcg=np.mean(ndcgs), val_time=val_time)) hit_rate = 0 if args.quantize_eval: checkpoint_name = 'quantized' apputils.save_checkpoint(0, 'NCF', model, optimizer=None, extras={'quantized_hr@10': hit_rate}, name='_'.join([args.name, 'quantized']) if args.name else checkpoint_name, dir=msglogger.logdir) return 0 total_samples = len(train_dataloader.sampler) steps_per_epoch = math.ceil(total_samples / args.batch_size) best_hit_rate = 0 best_epoch = 0 for epoch in range(start_epoch, args.epochs): msglogger.info('') model.train() losses = utils.AverageMeter() begin = time.time() if compression_scheduler: compression_scheduler.on_epoch_begin(epoch, optimizer) loader = 
tqdm.tqdm(train_dataloader) for batch_index, (user, item, label) in enumerate(loader): user = torch.autograd.Variable(user, requires_grad=False) item = torch.autograd.Variable(item, requires_grad=False) label = torch.autograd.Variable(label, requires_grad=False) if use_cuda: user = user.cuda(async=True) item = item.cuda(async=True) label = label.cuda(async=True) if compression_scheduler: compression_scheduler.on_minibatch_begin( epoch, batch_index, steps_per_epoch, optimizer) outputs = model(user, item, torch.tensor([False], dtype=torch.bool)) loss = criterion(outputs, label) if compression_scheduler: compression_scheduler.before_backward_pass( epoch, batch_index, steps_per_epoch, loss, optimizer, return_loss_components=False) losses.update(loss.data.item(), user.size(0)) optimizer.zero_grad() loss.backward() optimizer.step() if compression_scheduler: compression_scheduler.on_minibatch_end(epoch, batch_index, steps_per_epoch, optimizer) # Save stats to file description = ( 'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format( epoch, loss=losses)) loader.set_description(description) steps_completed = batch_index + 1 if steps_completed % args.log_freq == 0: stats_dict = OrderedDict() stats_dict['Loss'] = losses.avg stats = ('Performance/Training/', stats_dict) params = model.named_parameters( ) if args.log_params_histograms else None distiller.log_training_progress(stats, params, epoch, steps_completed, steps_per_epoch, args.log_freq, [tflogger]) tflogger.log_model_buffers(model, ['tracked_min', 'tracked_max'], 'Quant/Train/Acts/TrackedMinMax', epoch, steps_completed, steps_per_epoch, args.log_freq) train_time = time.time() - begin begin = time.time() hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk, use_cuda=use_cuda, output=valid_results_file, epoch=epoch, processes=args.processes) val_time = time.time() - begin if compression_scheduler: compression_scheduler.on_epoch_end(epoch, optimizer) hit_rate = np.mean(hits) mean_ndcgs = np.mean(ndcgs) stats_dict = OrderedDict() stats_dict['HR@{0}'.format(args.topk)] = hit_rate stats_dict['NDCG@{0}'.format(args.topk)] = mean_ndcgs stats = ('Performance/Validation/', stats_dict) distiller.log_training_progress(stats, None, epoch, steps_completed=0, total_steps=1, log_freq=1, loggers=[tflogger]) msglogger.info( 'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, AvgTrainLoss = {loss.avg:.4f}, ' 'train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format( epoch=epoch, K=args.topk, hit_rate=hit_rate, ndcg=mean_ndcgs, loss=losses, train_time=train_time, val_time=val_time)) is_best = False if hit_rate > best_hit_rate: best_hit_rate = hit_rate is_best = True best_epoch = epoch extras = { 'current_hr@10': hit_rate, 'best_hr@10': best_hit_rate, 'best_epoch': best_epoch } apputils.save_checkpoint(epoch, 'NCF', model, optimizer, compression_scheduler, extras, is_best, dir=run_dir) if args.threshold is not None: if np.mean(hits) >= args.threshold: msglogger.info("Hit threshold of {}".format(args.threshold)) break
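# Illustrative sketch (not from the original script): the Distiller integration above
# calls scheduler hooks in a fixed order around each step: on_epoch_begin,
# on_minibatch_begin, forward, before_backward_pass, backward, step,
# on_minibatch_end, then on_epoch_end after validation. The dummy scheduler below only
# mirrors that ordering; it is a stand-in, not Distiller's CompressionScheduler API.
import torch

class DummyScheduler:
    def on_epoch_begin(self, epoch, optimizer=None): pass
    def on_minibatch_begin(self, epoch, batch, steps_per_epoch, optimizer=None): pass
    def before_backward_pass(self, epoch, batch, steps_per_epoch, loss, optimizer=None): return loss
    def on_minibatch_end(self, epoch, batch, steps_per_epoch, optimizer=None): pass
    def on_epoch_end(self, epoch, optimizer=None): pass

_model = torch.nn.Linear(4, 1)
_opt = torch.optim.Adam(_model.parameters())
_sched = DummyScheduler()
_data = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(3)]

for _epoch in range(1):
    _sched.on_epoch_begin(_epoch, _opt)
    for _i, (_x, _y) in enumerate(_data):
        _sched.on_minibatch_begin(_epoch, _i, len(_data), _opt)
        _loss = torch.nn.functional.mse_loss(_model(_x), _y)
        _loss = _sched.before_backward_pass(_epoch, _i, len(_data), _loss, _opt)
        _opt.zero_grad()
        _loss.backward()
        _opt.step()
        _sched.on_minibatch_end(_epoch, _i, len(_data), _opt)
    _sched.on_epoch_end(_epoch, _opt)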
def main():
    args = parse_args()
    args.distributed, args.world_size = init_distributed(args.local_rank)

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}.{}".format(config['timestamp'], args.local_rank)
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # more like a load-trigger timer now
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL, value=args.valid_negative)
    # The default of np.random.choice is replace=True; pytorch random_() also samples with replacement
    mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    #===========================================================================
    #== The clock starts on loading the preprocessed data. =====================
    #===========================================================================
    mlperf_log.ncf_print(key=mlperf_log.RUN_START)
    run_start_time = time.time()

    # load the preprocessed (not converted) data; the test set is kept separate
    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:{}'.format(args.local_rank)))

    # get input data
    # get dims
    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1
    train_users = train_ratings[:, 0]
    train_items = train_ratings[:, 1]
    del nb_maxs, train_ratings
    mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(train_users))

    # produce things that do not change between epochs
    # mask for filtering duplicates against the real (positive) sample
    # note: test data is removed before creating the mask, same as the reference
    mat = torch.cuda.ByteTensor(nb_users, nb_items).fill_(1)
    mat[train_users, train_items] = 0

    # create labels
    train_label = torch.ones_like(train_users, dtype=torch.float32)
    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
    neg_label = neg_label.repeat(args.negative_samples)
    train_label = torch.cat((train_label, neg_label))
    del neg_label
    if args.fp16:
        train_label = train_label.half()

    # produce validation negative samples on GPU
    all_test_users = test_ratings.shape[0]
    test_users = test_ratings[:, 0]
    test_pos = test_ratings[:, 1].reshape(-1, 1)
    test_negs = generate_neg(test_users, mat, nb_items, args.valid_negative, True)[1]

    # create items with the real sample in the last position
    test_users = test_users.reshape(-1, 1).repeat(1, 1 + args.valid_negative)
    test_items = torch.cat((test_negs.reshape(-1, args.valid_negative), test_pos), dim=1)
    del test_ratings, test_negs

    # generate the dup mask and real indices to match the reference's handling of duplicates exactly
    # here we need a sort that is stable (keeps the order of duplicates)
    # this is a version that works on integers
    sorted_items, indices = torch.sort(test_items)                                # [1,1,1,2], [3,1,0,2]
    sum_item_indices = sorted_items.float() + indices.float() / len(indices[0])   # [1.75,1.25,1.0,2.5]
    indices_order = torch.sort(sum_item_indices)[1]                               # [2,1,0,3]
    stable_indices = torch.gather(indices, 1, indices_order)                      # [0,1,3,2]
    # produce -1 mask
    dup_mask = (sorted_items[:, 0:-1] == sorted_items[:, 1:])
    dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask), dim=1)
    dup_mask = torch.gather(dup_mask, 1, stable_indices.sort()[1])
    # produce the real sample indices to check against the top-k later
    sorted_items, indices = (test_items != test_pos).sort()
    sum_item_indices = sorted_items.float() + indices.float() / len(indices[0])
    indices_order = torch.sort(sum_item_indices)[1]
    stable_indices = torch.gather(indices, 1, indices_order)
    real_indices = stable_indices[:, 0]
    del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos

    if args.distributed:
        test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
        test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
        dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
        real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_ORDER)  # we shuffle later with randperm

    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d'
          % (time.time() - run_start_time, nb_users, nb_items, len(train_users), nb_users))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors, mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    if args.fp16:
        model = model.half()
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    if args.fp16:
        fp_optimizer = Fp16Optimizer(model, args.loss_scale)
        params = fp_optimizer.fp32_params
    else:
        params = model.parameters()
    # optimizer = torch.optim.Adam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    # optimizer = AdamOpt(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    optimizer = FusedAdam(params, lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps,
                          eps_inside_sqrt=False)
    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid a copy to host

    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=args.beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=args.beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=args.eps)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    if args.distributed:
        model = DDP(model)
        local_batch = args.batch_size // int(os.environ['WORLD_SIZE'])
    else:
        local_batch = args.batch_size
    traced_criterion = torch.jit.trace(criterion.forward,
                                       (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    test_x = test_users.view(-1).split(args.valid_batch_size)
    test_y = test_items.view(-1).split(args.valid_batch_size)
    hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk,
                         samples_per_user=test_items.size(1),
                         num_user=all_test_users, distributed=args.distributed)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'
          .format(K=args.topk, hit_rate=hr, ndcg=ndcg))

    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG, value=args.negative_samples)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)
        begin = time.time()

        # prepare data for the epoch
        neg_users, neg_items = generate_neg(train_users, mat, nb_items, args.negative_samples)
        epoch_users = torch.cat((train_users, neg_users))
        epoch_items = torch.cat((train_items, neg_items))
        del neg_users, neg_items

        # shuffle the prepared data and split it into batches
        epoch_indices = torch.randperm(len(epoch_users), device='cuda:{}'.format(args.local_rank))
        epoch_users = epoch_users[epoch_indices]
        epoch_items = epoch_items[epoch_indices]
        epoch_label = train_label[epoch_indices]

        if args.distributed:
            epoch_users = torch.chunk(epoch_users, args.world_size)[args.local_rank]
            epoch_items = torch.chunk(epoch_items, args.world_size)[args.local_rank]
            epoch_label = torch.chunk(epoch_label, args.world_size)[args.local_rank]

        epoch_users_list = epoch_users.split(local_batch)
        epoch_items_list = epoch_items.split(local_batch)
        epoch_label_list = epoch_label.split(local_batch)

        # only print the progress bar on rank 0
        num_batches = (len(epoch_indices) + args.batch_size - 1) // args.batch_size
        if args.local_rank == 0:
            qbar = tqdm.tqdm(range(num_batches))
        else:
            qbar = range(num_batches)
        # handle the extremely rare case where the last batch is smaller than the number of workers
        if len(epoch_users_list) < num_batches:
            print("epoch_size % batch_size < number of workers!")
            exit(1)

        for i in qbar:
            # select input from the prepared data
            user = epoch_users_list[i]
            item = epoch_items_list[i]
            label = epoch_label_list[i].view(-1, 1)

            for p in model.parameters():
                p.grad = None
            outputs = model(user, item)
            loss = traced_criterion(outputs, label).float()
            loss = torch.mean(loss.view(-1), 0)
            if args.fp16:
                fp_optimizer.step(loss, optimizer)
            else:
                loss.backward()
                optimizer.step()

        del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
        train_time = time.time() - begin
        begin = time.time()

        mlperf_log.ncf_print(key=mlperf_log.EVAL_START)
        hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk,
                             samples_per_user=test_items.size(1),
                             num_user=all_test_users, output=valid_results_file,
                             epoch=epoch, distributed=args.distributed)
        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'
              .format(epoch=epoch, K=args.topk, hit_rate=hr, ndcg=ndcg,
                      train_time=train_time, val_time=val_time))

        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY, value={"epoch": epoch, "value": hr})
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    run_stop_time = time.time()
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
    # easy way of tracking the mlperf score
    if success:
        print("mlperf_score", run_stop_time - run_start_time)
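# --- Hedged illustration (not part of the script above) ---------------------------
# The eval-set construction above fakes a *stable* sort for integer item ids by adding
# index/len as a fractional tie-breaker before a second torch.sort. The tiny standalone
# sketch below reproduces that trick and the resulting dup_mask on made-up values that
# match the inline comments ([1,1,2,1] for a single user); it is for understanding only
# and is not code used by the training script.
import torch

test_items = torch.tensor([[1, 1, 2, 1]])                       # one user, illustrative item ids
sorted_items, indices = torch.sort(test_items)                  # ties may come back in any order
tie_break = sorted_items.float() + indices.float() / test_items.size(1)
indices_order = torch.sort(tie_break)[1]
stable_indices = torch.gather(indices, 1, indices_order)        # original positions, now in stable order

dup_mask = (sorted_items[:, :-1] == sorted_items[:, 1:])        # adjacent equal values are duplicates
dup_mask = torch.cat((torch.zeros(1, 1, dtype=torch.bool), dup_mask), dim=1)
dup_mask = torch.gather(dup_mask, 1, stable_indices.sort()[1])  # back to the original item order
print(dup_mask)  # tensor([[False, True, False, True]]): repeats of an item after its first occurrence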
def main():
    log_hardware()
    args = parse_args()
    args.distributed, args.world_size = init_distributed(args.local_rank)
    log_args(args)

    main_start_time = time.time()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    # Save configuration to file
    timestamp = "{:.0f}".format(datetime.utcnow().timestamp())
    run_dir = "./run/neumf/{}.{}".format(timestamp, args.local_rank)
    print("Saving results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)

    # more like a load-trigger timer now
    LOGGER.log(key=tags.PREPROC_HP_NUM_EVAL, value=args.valid_negative)
    # The default of np.random.choice is replace=True; pytorch random_() also samples with replacement
    LOGGER.log(key=tags.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
    LOGGER.log(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)
    LOGGER.log(key=tags.INPUT_STEP_EVAL_NEG_GEN)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    LOGGER.log(key=tags.RUN_START)
    run_start_time = time.time()

    # load the preprocessed (not converted) data; the test set is kept separate
    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:{}'.format(args.local_rank)))

    # get input data
    # get dims
    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1
    train_users = train_ratings[:, 0]
    train_items = train_ratings[:, 1]
    del nb_maxs, train_ratings
    LOGGER.log(key=tags.INPUT_SIZE, value=len(train_users))

    # produce things that do not change between epochs
    # mask for filtering duplicates against the real (positive) sample
    # note: test data is removed before creating the mask, same as the reference
    mat = torch.cuda.ByteTensor(nb_users, nb_items).fill_(1)
    mat[train_users, train_items] = 0

    # create labels
    train_label = torch.ones_like(train_users, dtype=torch.float32)
    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
    neg_label = neg_label.repeat(args.negative_samples)
    train_label = torch.cat((train_label, neg_label))
    del neg_label
    if args.fp16:
        train_label = train_label.half()

    # produce validation negative samples on GPU
    all_test_users = test_ratings.shape[0]
    test_users = test_ratings[:, 0]
    test_pos = test_ratings[:, 1].reshape(-1, 1)
    test_negs = generate_neg(test_users, mat, nb_items, args.valid_negative, True)[1]

    # create items with the real sample in the last position
    test_users = test_users.reshape(-1, 1).repeat(1, 1 + args.valid_negative)
    test_items = torch.cat((test_negs.reshape(-1, args.valid_negative), test_pos), dim=1)
    del test_ratings, test_negs

    # generate the dup mask and real indices to match the reference's handling of duplicates exactly
    # here we need a sort that is stable (keeps the order of duplicates)
    # this is a version that works on integers
    sorted_items, indices = torch.sort(test_items)                                # [1,1,1,2], [3,1,0,2]
    sum_item_indices = sorted_items.float() + indices.float() / len(indices[0])   # [1.75,1.25,1.0,2.5]
    indices_order = torch.sort(sum_item_indices)[1]                               # [2,1,0,3]
    stable_indices = torch.gather(indices, 1, indices_order)                      # [0,1,3,2]
    # produce -1 mask
    dup_mask = (sorted_items[:, 0:-1] == sorted_items[:, 1:])
    dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask), dim=1)
    dup_mask = torch.gather(dup_mask, 1, stable_indices.sort()[1])
    # produce the real sample indices to check against the top-k later
    sorted_items, indices = (test_items != test_pos).sort()
    sum_item_indices = sorted_items.float() + indices.float() / len(indices[0])
    indices_order = torch.sort(sum_item_indices)[1]
    stable_indices = torch.gather(indices, 1, indices_order)
    real_indices = stable_indices[:, 0]
    del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos

    if args.distributed:
        test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
        test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
        dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
        real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=args.batch_size)
    LOGGER.log(key=tags.INPUT_ORDER)  # we shuffle later with randperm

    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d'
          % (time.time() - run_start_time, nb_users, nb_items, len(train_users), nb_users))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors, mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers],
                  dropout=args.dropout)

    if args.fp16:
        model = model.half()

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    if args.fp16:
        fp_optimizer = Fp16Optimizer(model, args.loss_scale)
        params = fp_optimizer.fp32_params
    else:
        params = model.parameters()

    optimizer = FusedAdam(params, lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps,
                          eps_inside_sqrt=False)
    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid a copy to host

    LOGGER.log(key=tags.OPT_LR, value=args.learning_rate)
    LOGGER.log(key=tags.OPT_NAME, value="Adam")
    LOGGER.log(key=tags.OPT_HP_ADAM_BETA1, value=args.beta1)
    LOGGER.log(key=tags.OPT_HP_ADAM_BETA2, value=args.beta2)
    LOGGER.log(key=tags.OPT_HP_ADAM_EPSILON, value=args.eps)
    LOGGER.log(key=tags.MODEL_HP_LOSS_FN, value=tags.VALUE_BCE)

    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.distributed:
        model = DDP(model)
        local_batch = args.batch_size // int(os.environ['WORLD_SIZE'])
    else:
        local_batch = args.batch_size
    traced_criterion = torch.jit.trace(criterion.forward,
                                       (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    train_users_per_worker = len(train_label) / int(os.environ['WORLD_SIZE'])
    train_users_begin = int(train_users_per_worker * args.local_rank)
    train_users_end = int(train_users_per_worker * (args.local_rank + 1))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    test_x = test_users.view(-1).split(args.valid_batch_size)
    test_y = test_items.view(-1).split(args.valid_batch_size)

    if args.mode == 'test':
        state_dict = torch.load(args.checkpoint_path)
        model.load_state_dict(state_dict)

    begin = time.time()
    LOGGER.log(key=tags.EVAL_START, value=-1)
    hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk,
                         samples_per_user=test_items.size(1),
                         num_user=all_test_users, distributed=args.distributed)
    val_time = time.time() - begin
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, valid_time: {val_time:.4f}'
          .format(K=args.topk, hit_rate=hr, ndcg=ndcg, val_time=val_time))
    LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": -1, "value": hr})
    LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
    LOGGER.log(key=tags.EVAL_STOP, value=-1)

    if args.mode == 'test':
        return

    success = False
    max_hr = 0
    LOGGER.log(key=tags.TRAIN_LOOP)
    train_throughputs = []
    eval_throughputs = []

    for epoch in range(args.epochs):
        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)
        LOGGER.log(key=tags.INPUT_HP_NUM_NEG, value=args.negative_samples)
        LOGGER.log(key=tags.INPUT_STEP_TRAIN_NEG_GEN)

        begin = time.time()

        # prepare data for the epoch
        neg_users, neg_items = generate_neg(train_users, mat, nb_items, args.negative_samples)
        epoch_users = torch.cat((train_users, neg_users))
        epoch_items = torch.cat((train_items, neg_items))
        del neg_users, neg_items

        # shuffle the prepared data and split it into batches
        epoch_indices = torch.randperm(train_users_end - train_users_begin,
                                       device='cuda:{}'.format(args.local_rank))
        epoch_indices += train_users_begin

        epoch_users = epoch_users[epoch_indices]
        epoch_items = epoch_items[epoch_indices]
        epoch_label = train_label[epoch_indices]

        epoch_users_list = epoch_users.split(local_batch)
        epoch_items_list = epoch_items.split(local_batch)
        epoch_label_list = epoch_label.split(local_batch)

        num_batches = len(epoch_users_list)
        # handle the extremely rare case where the last batch is smaller than the number of workers
        if len(epoch_users) % args.batch_size < args.world_size:
            print("epoch_size % batch_size < number of workers!")
            exit(1)

        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users_list[batch_idx]
                item = epoch_items_list[batch_idx]
                label = epoch_label_list[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.fp16:
                    fp_optimizer.backward(loss)
                else:
                    loss.backward()

            if args.fp16:
                fp_optimizer.step(optimizer)
            else:
                optimizer.step()

            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_users) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)
        LOGGER.log(key='train_throughput', value=train_throughput)
        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        LOGGER.log(key=tags.EVAL_START, value=epoch)

        hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk,
                             samples_per_user=test_items.size(1),
                             num_user=all_test_users, output=valid_results_file,
                             epoch=epoch, distributed=args.distributed)

        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'
              .format(epoch=epoch, K=args.topk, hit_rate=hr, ndcg=ndcg,
                      train_time=train_time, val_time=val_time))

        LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": epoch, "value": hr})
        LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        eval_size = all_test_users * test_items.size(1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)
        LOGGER.log(key='eval_throughput', value=eval_throughput)

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            print("New best hr! Saving the model to: ", args.checkpoint_path)
            torch.save(model.state_dict(), args.checkpoint_path)

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    LOGGER.log(key='best_train_throughput', value=max(train_throughputs))
    LOGGER.log(key='best_eval_throughput', value=max(eval_throughputs))
    LOGGER.log(key='best_accuracy', value=max_hr)
    LOGGER.log(key='time_to_target', value=time.time() - main_start_time)

    LOGGER.log(key=tags.RUN_STOP, value={"success": success})
    LOGGER.log(key=tags.RUN_FINAL)
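# --- Hedged illustration (not part of the script above) ---------------------------
# The fp16 path above relies on an Fp16Optimizer helper: backward() runs on a scaled
# loss against the fp16 model, and step() moves the (unscaled) gradients onto fp32
# master weights before the real optimizer step. The class below is a minimal sketch
# of that pattern with a *static* loss scale; it is illustrative only and is not the
# actual Fp16Optimizer implementation used by the script.
import torch


class NaiveFp16Optimizer:
    def __init__(self, model, loss_scale=8192.0):
        self.model = model
        self.loss_scale = loss_scale
        # fp32 master copies of the fp16 parameters; the wrapped optimizer owns these
        self.fp32_params = [p.detach().clone().float().requires_grad_(True)
                            for p in model.parameters()]

    def backward(self, loss):
        # scale the loss so small fp16 gradients do not underflow
        (loss * self.loss_scale).backward()

    def step(self, optimizer):
        # unscale fp16 grads into the fp32 master params, step, then copy weights back
        for master, p in zip(self.fp32_params, self.model.parameters()):
            master.grad = p.grad.detach().float() / self.loss_scale
        optimizer.step()
        for master, p in zip(self.fp32_params, self.model.parameters()):
            p.data.copy_(master.data)
            master.grad = None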
def main():
    args = parse_args()
    init_distributed(args)

    if args.local_rank == 0:
        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                           filename=args.log_path),
                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
    else:
        dllogger.init(backends=[])

    dllogger.metadata('train_throughput', {"name": 'train_throughput', 'format': ":.3e"})
    dllogger.metadata('hr@10', {"name": 'hr@10', 'format': ":.5f"})
    dllogger.metadata('train_epoch_time', {"name": 'train_epoch_time', 'format': ":.3f"})
    dllogger.metadata('validation_epoch_time', {"name": 'validation_epoch_time', 'format': ":.3f"})
    dllogger.metadata('eval_throughput', {"name": 'eval_throughput', 'format': ":.3e"})

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        torch.manual_seed(args.seed)

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir:
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    feature_spec_path = os.path.join(args.data, args.feature_spec_file)
    feature_spec = FeatureSpec.from_yaml(feature_spec_path)
    trainset = dataloading.TorchTensorDataset(feature_spec, mapping_name='train', args=args)
    testset = dataloading.TorchTensorDataset(feature_spec, mapping_name='test', args=args)
    train_loader = dataloading.TrainDataloader(trainset, args)
    test_loader = dataloading.TestDataLoader(testset, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0]
    item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0]
    label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0]
    model = NeuMF(nb_users=feature_spec.feature_spec[user_feature_name]['cardinality'],
                  nb_items=feature_spec.feature_spec[item_feature_name]['cardinality'],
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps)
    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid a copy to host

    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                          keep_batchnorm_fp32=False, loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(criterion.forward,
                                       (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model, test_loader, args.topk, distributed=args.distributed)
        val_time = time.time() - start
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time
        dllogger.log(step=tuple(), data={'best_eval_throughput': eval_throughput, 'hr@10': hr})
        return

    # max_hr should always be overridden if hr > 0. It is theoretically possible for the hit rate
    # to be zero in the first epoch, which would otherwise leave this referring to an
    # uninitialized variable.
    max_hr = 0
    best_epoch = 0
    best_model_timestamp = time.time()
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):
        begin = time.time()
        batch_dict_list = train_loader.get_epoch_data()
        num_batches = len(batch_dict_list)

        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                batch_dict = batch_dict_list[batch_idx]

                user_features = batch_dict[USER_CHANNEL_NAME]
                item_features = batch_dict[ITEM_CHANNEL_NAME]
                user_batch = user_features[user_feature_name]
                item_batch = item_features[item_feature_name]

                label_features = batch_dict[LABEL_CHANNEL_NAME]
                label_batch = label_features[label_feature_name]

                outputs = model(user_batch, item_batch)
                loss = traced_criterion(outputs, label_batch.view(-1, 1)).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            optimizer.step()
            for p in model.parameters():
                p.grad = None

        del batch_dict_list
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = train_loader.length_after_augmentation
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model, test_loader, args.topk, distributed=args.distributed)

        val_time = time.time() - begin
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch,),
                     data={'train_throughput': train_throughput,
                           'hr@10': hr,
                           'train_epoch_time': train_time,
                           'validation_epoch_time': val_time,
                           'eval_throughput': eval_throughput})

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        dllogger.log(data={'best_train_throughput': max(train_throughputs),
                           'best_eval_throughput': max(eval_throughputs),
                           'mean_train_throughput': np.mean(train_throughputs),
                           'mean_eval_throughput': np.mean(eval_throughputs),
                           'best_accuracy': max_hr,
                           'best_epoch': best_epoch,
                           'time_to_target': time.time() - main_start_time,
                           'time_to_best_model': best_model_timestamp - main_start_time},
                     step=tuple())
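# --- Hedged illustration (not part of the script above) ---------------------------
# The variant above only touches two parts of the FeatureSpec object: channel_spec
# (channel name -> list of feature names) and feature_spec (feature name -> metadata
# such as 'cardinality'). The YAML below is a guess at the *minimal* shape such a file
# could take; the channel names, feature names and cardinalities are placeholders, not
# the project's actual constants or dataset sizes.
import yaml

example_feature_spec = """
channel_spec:
  user_ch: [user]
  item_ch: [item]
  label_ch: [label]
feature_spec:
  user:
    cardinality: 1000
  item:
    cardinality: 500
  label: {}
"""

spec = yaml.safe_load(example_feature_spec)
user_feature_name = spec['channel_spec']['user_ch'][0]
item_feature_name = spec['channel_spec']['item_ch'][0]
print(spec['feature_spec'][user_feature_name]['cardinality'],
      spec['feature_spec'][item_feature_name]['cardinality'])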
def main():
    from grace_dl.dist.helper import timer, volume, tensor_bits

    args = parse_args()
    init_distributed(args)
    if args.weak_scaling:
        args.batch_size *= args.world_size
    init_wandb(args)
    init_grace(args)

    if args.local_rank == 0:
        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                           filename=args.log_path),
                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir:
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    train_ratings = torch.load(args.data + '/train_ratings.pt', map_location=torch.device('cuda:0'))
    test_ratings = torch.load(args.data + '/test_ratings.pt', map_location=torch.device('cuda:0'))
    test_negs = torch.load(args.data + '/test_negatives.pt', map_location=torch.device('cuda:0'))

    valid_negative = test_negs.shape[1]
    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1
    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps)
    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid a copy to host

    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                          keep_batchnorm_fp32=False, loss_scale='dynamic')

    # if args.distributed:
    #     model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(criterion.forward,
                                       (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    if args.local_rank == 0:
        print(model)
        print("{} parameters".format(utils.count_parameters(model)))
        # [print(parameter) for parameter in model.parameters()]

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users, distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        dllogger.log(step=tuple(), data={'best_eval_throughput': eval_throughput, 'hr@10': hr})
        return

    max_hr = 0
    best_epoch = 0
    train_throughputs, eval_throughputs = [], []

    # broadcast model states from rank 0 to the other workers !!! This is important!
    [torch.distributed.broadcast(p.data, src=0) for p in model.parameters()]

    # if args.local_rank == 0:
    #     save_initial_state_path = os.path.join(args.checkpoint_dir, 'model_init.pth')
    #     print("Saving the model to: ", save_initial_state_path)
    #     torch.save(model.state_dict(), save_initial_state_path)

    for epoch in range(args.epochs):
        begin = time.time()
        train_time = 0

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(train_ratings, nb_items, args)
        num_batches = len(epoch_users)

        for i in range(num_batches // args.grads_accumulated):
            batch_start = time.time()
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            # check grad sparsity
            if args.sparsity_check:
                total_nonzero = 0
                total_numel = 0
                for index, (name, p) in enumerate(model.named_parameters()):
                    sparsity = 1.0 - torch.sum(p.grad.data.abs() > 0).float() / p.grad.data.numel()
                    total_nonzero += torch.sum(p.grad.data.abs() > 0).float()
                    total_numel += p.grad.data.numel()
                    if args.local_rank == 0:
                        wandb.log({f"{name}(sparsity)(numel={p.grad.data.numel()})": sparsity}, commit=False)
                if args.local_rank == 0:
                    wandb.log({f"total_sparsity(numel={total_numel})": 1 - total_nonzero / total_numel}, commit=True)

            # apply grace just before optimizer.step()
            torch.cuda.synchronize()
            comm_start = time.time()
            for index, (name, p) in enumerate(model.named_parameters()):
                new_grad = args.grc.step(p.grad.data, name)
                p.grad.data = new_grad
            torch.cuda.synchronize()
            timer['comm'] = time.time() - comm_start
            # [torch.distributed.all_reduce(p.grad.data) for p in model.parameters()]
            # for param in model.parameters():
            #     dist.all_reduce(param.grad.data)
            #     param.grad.data /= float(args.world_size)

            optimizer.step()
            for p in model.parameters():
                p.grad = None

            if args.throughput:
                torch.cuda.synchronize()

            if args.log_time and args.local_rank == 0:
                timer['batch_time'] = time.time() - batch_start
                timer['computation'] = timer['batch_time'] - timer['comm']
                print("Timer:", timer, '\n')
                timer['en/decoding'] = 0
                timer['batch_time'] = 0
                timer['computation'] = 0
                timer['comm'] = 0

            if args.log_volume and args.local_rank == 0:
                ratio = volume['compress'] / volume['nocompress']
                volume['ratio_acc'].append(ratio)
                avg_ratio = sum(volume['ratio_acc']) / len(volume['ratio_acc'])
                print(f"Data volume:: compress {volume['compress']} no_compress {volume['nocompress']} "
                      f"ratio {ratio:.4f} avg_ratio {avg_ratio:.4f}")
                volume['compress'] = 0
                volume['nocompress'] = 0

            batch_throughput = args.batch_size / (time.time() - batch_start)  # global throughput
            train_time += time.time() - batch_start

            if (args.throughput or args.eval_at_every_batch) and args.local_rank == 0:
                print(f"Train :: Epoch [{epoch}/{args.epochs}] \t Batch [{i}/{num_batches}] \t "
                      f"Time {time.time()-batch_start:.5f} \t Throughput {batch_throughput:.2f}")
            if args.throughput and i == 3:
                break
            if args.local_rank == 0:
                print(f"Train :: Epoch [{epoch}/{args.epochs}] \t Batch [{i}/{num_batches}] \t "
                      f"Time {time.time()-batch_start:.5f} \t Throughput {batch_throughput:.2f}")
            if args.eval_at_every_batch:
                hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
                                     samples_per_user=valid_negative + 1,
                                     num_user=all_test_users, epoch=epoch,
                                     distributed=args.distributed)
                if args.local_rank == 0:
                    wandb.log({"eval/hr@10": hr})

        del epoch_users, epoch_items, epoch_label
        # train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        if args.throughput:
            train_throughput = batch_throughput
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users, epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch,),
                     data={'train_throughput': train_throughput,
                           'hr@10': hr,
                           'train_epoch_time': train_time,
                           'validation_epoch_time': val_time,
                           'eval_throughput': eval_throughput})

        if args.local_rank == 0:
            wandb.log({"train_epoch_time": train_time,
                       'validation_epoch_time': val_time,
                       'eval_throughput': eval_throughput,
                       'train_throughput': train_throughput},
                      commit=False)
            if not args.eval_at_every_batch:
                wandb.log({"eval/hr@10": hr}, commit=False)
            wandb.log({"epoch": epoch})

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

        if args.throughput:
            break

    if args.local_rank == 0:
        dllogger.log(data={'best_train_throughput': max(train_throughputs),
                           'best_eval_throughput': max(eval_throughputs),
                           'mean_train_throughput': np.mean(train_throughputs),
                           'mean_eval_throughput': np.mean(eval_throughputs),
                           'best_accuracy': max_hr,
                           'best_epoch': best_epoch,
                           'time_to_target': time.time() - main_start_time,
                           'time_to_best_model': best_model_timestamp - main_start_time},
                     step=tuple())
        wandb.log({'best_train_throughput': max(train_throughputs),
                   'best_eval_throughput': max(eval_throughputs),
                   'mean_train_throughput': np.mean(train_throughputs),
                   'mean_eval_throughput': np.mean(eval_throughputs),
                   'best_accuracy': max_hr,
                   'best_epoch': best_epoch,
                   'time_to_target': time.time() - main_start_time,
                   'time_to_best_model': best_model_timestamp - main_start_time})
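# --- Hedged illustration (not part of the script above) ---------------------------
# In the variant above, args.grc.step(grad, name) replaces the usual DDP all-reduce:
# each worker hands in its local gradient and receives the gradient to apply, with
# compression and the cross-worker reduction hidden behind that call. The class below
# is a trivial "no compression" stand-in (a plain all-reduce average) to make that
# contract concrete; it is not the grace_dl API.
import torch
import torch.distributed as dist


class AllReduceGRC:
    """Minimal grc-like object: step(grad, name) -> reduced gradient."""

    def __init__(self, world_size):
        self.world_size = world_size

    def step(self, grad, name):
        # `name` is unused here; real compressors key per-tensor state
        # (e.g. error-feedback buffers) on it.
        reduced = grad.clone()
        dist.all_reduce(reduced)  # requires torch.distributed to be initialized
        return reduced / self.world_size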
def main(args, ml_logger):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # Create model
    model = NeuMF(2197225, 855776,
                  mf_dim=64, mf_reg=0.,
                  mlp_layer_sizes=[256, 256, 128, 64],
                  mlp_layer_regs=[0. for i in [256, 256, 128, 64]])
    print(model)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        model.device = torch.device('cuda:{}'.format(0))

    if args.load_ckp:
        ckp = torch.load(args.load_ckp)
        model.load_state_dict(ckp)

    all_embedding = [n for n, m in model.named_modules() if isinstance(m, nn.Embedding)]
    all_linear = [n for n, m in model.named_modules() if isinstance(m, nn.Linear)]
    all_relu = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU)]
    all_relu6 = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)]
    layers = all_relu + all_relu6 + all_linear + all_embedding
    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Linear: ParameterModuleWrapperPost,
                           nn.Embedding: ActivationModuleWrapperPost}
    mq = ModelQuantizer(model, args, layers, replacement_factory)
    # mq.log_quantizer_state(ml_logger, -1)

    test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user = data_loader(args.data)
    data = NcfData(test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user)
    cal_data = CalibrationSet('ml-20mx16x32/cal_set').cuda()
    cal_data.split(batch_size=10000)

    criterion = nn.BCEWithLogitsLoss(reduction='mean')
    criterion = criterion.cuda()

    print("init_method: {}, qtype {}".format(args.init_method, args.qtype))

    # evaluate once to initialize dynamic clipping
    loss = evaluate_calibration(model, cal_data, criterion)
    print("Initial loss: {:.4f}".format(loss))
    # get clipping values
    init = get_clipping(mq)

    # evaluate
    hr, ndcg = validate(model, data)
    ml_logger.log_metric('HR init', hr, step='auto')

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        it = next(_iter)
        loss = run_inference_on_calibration(x, model, mq, cal_data, criterion)
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss))

    res = opt.minimize(lambda scales: run_inference_on_calibration(scales, model, mq, cal_data, criterion),
                       np.array(init), method=args.min_method,
                       options=min_options, callback=local_search_callback)
    print(res)

    scales = res.x
    set_clipping(mq, scales, model.device)

    # evaluate
    hr, ndcg = validate(model, data)
    ml_logger.log_metric('HR Powell', hr, step='auto')
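# --- Hedged illustration (not part of the script above) ---------------------------
# The quantization-tuning main() above treats the per-layer clipping values as a flat
# vector and lets a derivative-free optimizer (e.g. Powell) minimize a calibration
# loss evaluated with those clippings applied. The snippet below keeps only that
# optimization plumbing; calibration_loss() is a made-up stand-in for
# run_inference_on_calibration(), and all numbers are illustrative.
import numpy as np
import scipy.optimize as opt

init_scales = np.array([1.0, 1.0, 1.0])      # e.g. one clipping value per quantized tensor
pretend_optimum = np.array([0.7, 1.3, 0.9])  # fake target, for illustration only


def calibration_loss(scales):
    # the real code would push `scales` into the quantizers and run the calibration set
    return float(np.sum((scales - pretend_optimum) ** 2))


res = opt.minimize(calibration_loss, init_scales, method='Powell',
                   options={'maxiter': 100})
print(res.x)  # optimized clipping scales, close to pretend_optimum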