def inference_benchmark_nongraphed(model, data_loader, num_batches=100):
    """Benchmark eager-mode (no CUDA graph) inference and print the AUC.

    Each step is timed from before the host-to-device copy of the inputs
    until ``torch.cuda.synchronize()`` returns after the forward pass, so the
    reported latency includes the input transfer.

    Args:
        model: Model to benchmark; called as ``model(numerical, categorical)``.
        data_loader: Iterable yielding
            ``(numerical_features, categorical_features, click)`` batches.
        num_batches (int): Maximum number of batches to run, warmup steps
            included.

    Returns:
        list: Latencies, as measured with ``time()``, of every step whose
        index is at least ``FLAGS.benchmark_warmup_steps``. Empty when no
        step ran past the warmup phase.
    """
    model.eval()
    base_device = FLAGS.base_device
    latencies = []

    y_true = []
    y_score = []

    with torch.no_grad():
        for step, (numerical_features, categorical_features, click) in enumerate(data_loader):
            # `>=` (not `>`): steps are 0-based, so this runs exactly
            # `num_batches` batches instead of `num_batches + 1`.
            if step >= num_batches:
                break

            step_start_time = time()

            numerical_features = numerical_features.to(base_device)
            if FLAGS.amp:
                numerical_features = numerical_features.half()

            categorical_features = categorical_features.to(device=base_device, dtype=torch.int64)

            inference_result = model(numerical_features, categorical_features).squeeze()
            # Wait for the GPU so the wall-clock delta covers the real work.
            torch.cuda.synchronize()
            step_time = time() - step_start_time

            # Warmup steps are executed but excluded from latency and AUC.
            if step >= FLAGS.benchmark_warmup_steps:
                latencies.append(step_time)
                y_true.append(click)
                y_score.append(inference_result.reshape([-1]).clone())

        # torch.cat() rejects an empty list; this happens when every executed
        # step was a warmup step or the loader yielded nothing.
        if not y_score:
            print('auc: not computed (no batches were measured after warmup)')
            return latencies

        y_true = torch.cat(y_true)
        y_score = torch.sigmoid(torch.cat(y_score)).float()
        auc = utils.roc_auc_score(y_true, y_score)
        print('auc: ', auc)

    return latencies
def evaluate(model, loss_fn, data_loader):
    """Run a full evaluation pass of a DLRM model.

    The model is put into eval mode for the pass and switched back to train
    mode before returning.

    Args:
        model (dlrm): Model to evaluate; called as
            ``model(numerical_features, categorical_features)``.
        loss_fn (torch.nn.Module): Loss function, applied to
            ``(output, click)``.
        data_loader (torch.utils.data.DataLoader): Yields
            ``(numerical_features, categorical_features, click)`` batches.

    Returns:
        tuple: ``(global average loss, AUC, average step time)`` as reported
        by the metric logger and ``utils.roc_auc_score``.
    """
    model.eval()

    steps_per_epoch = len(data_loader)
    use_prefetcher = is_data_prefetching_enabled()
    print_freq = FLAGS.print_freq

    metric_logger = utils.MetricLogger(delimiter=" ")
    for meter_name in ('loss', 'step_time'):
        metric_logger.add_meter(
            meter_name, utils.SmoothedValue(window_size=1, fmt='{avg:.4f}'))

    # The side stream is only needed when batches are prefetched.
    data_stream = torch.cuda.Stream() if use_prefetcher else None

    with torch.no_grad():
        labels = []
        scores = []

        timer = utils.StepTimer()
        timer.click()

        input_pipeline = iter(data_loader)
        if use_prefetcher:
            input_pipeline = prefetcher(input_pipeline, data_stream)

        for step, batch in enumerate(input_pipeline):
            numerical_features, categorical_features, click = batch

            if FLAGS.amp:
                numerical_features = numerical_features.half()

            if use_prefetcher:
                # Synchronize with the device before consuming a prefetched batch.
                torch.cuda.synchronize()

            output = model(numerical_features, categorical_features).squeeze()
            loss = loss_fn(output, click)

            labels.append(click)
            scores.append(output)

            # .item() is read before the timer click, so its cost is part of
            # the measured step.
            loss_value = loss.item()
            timer.click()

            if timer.measured is None:
                continue

            metric_logger.update(loss=loss_value, step_time=timer.measured)
            if step > 0 and step % print_freq == 0:
                metric_logger.print(
                    header=f"Test: [{step}/{steps_per_epoch}]")

        all_labels = torch.cat(labels)
        all_scores = torch.cat(scores)

        before_auc_timestamp = time()
        auc = utils.roc_auc_score(y_true=all_labels, y_score=all_scores)
        print(f'AUC computation took: {time() - before_auc_timestamp:.2f} [s]')

    model.train()

    return metric_logger.loss.global_avg, auc, metric_logger.step_time.avg
def _pad_rows(features, num_rows, make_filler):
    """Append ``num_rows`` filler rows to ``features``; pass ``None`` through."""
    if features is None:
        return features
    filler = make_filler(
        num_rows, features.shape[1],
        device=features.device, dtype=features.dtype)
    return torch.cat((features, filler), dim=0)


def dist_evaluate(model, data_loader):
    """Evaluate a distributed DLRM model and compute AUC on the main process.

    Every rank runs the model on each batch, the per-rank outputs are
    all-gathered, and the gathered scores and labels of all batches are kept
    so that AUC can be computed once at the end. The model is switched back
    to train mode before returning.

    Args:
        model (DistDLRM): Model to evaluate; called as
            ``model(numerical_features, categorical_features, batch_sizes_per_gpu)``.
        data_loader (torch.utils.data.DataLoader): Yields
            ``(numerical_features, categorical_features, click)`` batches.

    Returns:
        The AUC from ``utils.roc_auc_score`` on the main process, ``None`` on
        every other process.
    """
    model.eval()

    device = FLAGS.base_device
    world_size = dist.get_world_size()

    # Every rank gets the same share; the effective batch is the rounded-down total.
    per_rank_share = FLAGS.test_batch_size // world_size
    batch_sizes_per_gpu = [per_rank_share] * world_size
    test_batch_size = sum(batch_sizes_per_gpu)

    if FLAGS.test_batch_size != test_batch_size:
        print(f"Rounded test_batch_size to {test_batch_size}")
    print(f"Batch sizes per GPU {batch_sizes_per_gpu}")

    # The test batch size can be large, so scale the default print frequency
    # with it to make sure progress is still printed.
    default_print_freq = max(524288 * 100 // test_batch_size, 1)
    print_freq = FLAGS.print_freq if FLAGS.print_freq is not None else default_print_freq

    steps_per_epoch = len(data_loader)
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'step_time', utils.SmoothedValue(window_size=1, fmt='{avg:.4f}'))

    with torch.no_grad():
        timer = utils.StepTimer()

        # ROC could be computed per batch and AUC derived globally, but that
        # code is not available here, so all outputs and labels are collected
        # and AUC is computed once. Label/score naming follows sklearn.
        labels = []
        scores = []

        data_stream = torch.cuda.Stream()
        batch_iter = prefetcher(iter(data_loader), data_stream)

        timer.click()

        for step in range(steps_per_epoch):
            numerical_features, categorical_features, click = next(batch_iter)
            torch.cuda.synchronize()

            actual_rows = click.shape[0]
            is_partial_batch = actual_rows != test_batch_size

            if is_partial_batch:
                # Last batch: pad the inputs up to the full test batch size;
                # the padded rows are sliced off after the gather below.
                logging.warning("Pad the last test batch of size %d to %d", actual_rows, test_batch_size)
                padding_size = test_batch_size - actual_rows
                numerical_features = _pad_rows(numerical_features, padding_size, torch.empty)
                categorical_features = _pad_rows(categorical_features, padding_size, torch.ones)

            output = model(numerical_features, categorical_features, batch_sizes_per_gpu).squeeze()

            # Gather every rank's slice into one buffer of test_batch_size scores.
            output_receive_buffer = torch.empty(test_batch_size, device=device)
            receive_slices = list(output_receive_buffer.split(batch_sizes_per_gpu))
            torch.distributed.all_gather(receive_slices, output)

            if is_partial_batch:
                output_receive_buffer = output_receive_buffer[:actual_rows]

            if FLAGS.auc_device == "CPU":
                click = click.cpu()
                output_receive_buffer = output_receive_buffer.cpu()

            labels.append(click)
            scores.append(output_receive_buffer)

            timer.click()

            if timer.measured is None:
                continue

            metric_logger.update(step_time=timer.measured)
            if step > 0 and step % print_freq == 0:
                metric_logger.print(
                    header=f"Test: [{step}/{steps_per_epoch}]")

        # Only the main process computes AUC; the others report None.
        auc = (
            utils.roc_auc_score(
                torch.cat(labels),
                torch.sigmoid(torch.cat(scores).float()))
            if is_main_process()
            else None
        )

        torch.distributed.barrier()

    model.train()

    return auc
def inference_benchmark_graphed(model, data_loader, num_batches=100):
    """Benchmark inference through a captured CUDA graph and print the AUC.

    The first batch of ``data_loader`` provides the static input buffers.
    The forward pass is warmed up on a side stream, captured into a
    ``torch.cuda.CUDAGraph``, and then replayed for each benchmarked batch
    after copying that batch into the static buffers. Each step is timed
    from before the host-to-device copy until the replay has finished.

    NOTE(review): every replayed batch is copied into buffers shaped like
    the first batch, so a differently sized batch (e.g. a short final batch)
    presumably cannot be replayed -- confirm the loader drops or never
    reaches it.

    Args:
        model: Model to benchmark; called as ``model(numerical, categorical)``.
        data_loader: Iterable yielding
            ``(numerical_features, categorical_features, click)`` batches.
        num_batches (int): Maximum number of batches to replay, warmup steps
            included.

    Returns:
        list: Latencies, as measured with ``time()``, of every step whose
        index is at least ``FLAGS.benchmark_warmup_steps``. Empty when no
        step ran past the warmup phase.
    """
    model.eval()
    base_device = FLAGS.base_device
    latencies = []

    data_iter = iter(data_loader)
    numerical, categorical, _ = next(data_iter)

    # Static buffers: the captured graph always reads its inputs from these.
    static_numerical = numerical.to(base_device)
    static_categorical = categorical.to(device=base_device, dtype=torch.int64)

    # Warmup and capture run without autograd, like the replay loop below and
    # the non-graphed benchmark; inference needs no gradient bookkeeping.
    with torch.no_grad():
        # Warmup before capture, on a side stream.
        s = torch.cuda.Stream()
        s.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(s):
            for _ in range(10):
                if FLAGS.amp:
                    numerical = static_numerical.half()
                else:
                    numerical = static_numerical
                inference_result = model(numerical, static_categorical).squeeze()

        torch.cuda.synchronize()

        # Graph capture; `inference_result` becomes the graph's static output.
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            if FLAGS.amp:
                numerical = static_numerical.half()
            else:
                numerical = static_numerical
            inference_result = model(numerical, static_categorical).squeeze()

    torch.cuda.synchronize()

    # Inference
    y_true = []
    y_score = []

    with torch.no_grad():
        for step, (numerical_features, categorical_features, click) in enumerate(data_loader):
            # `>=` (not `>`): steps are 0-based, so this runs exactly
            # `num_batches` batches instead of `num_batches + 1`.
            if step >= num_batches:
                break

            # Drain pending GPU work so it is not charged to this step.
            torch.cuda.synchronize()
            step_start_time = time()

            numerical_features = numerical_features.to(base_device)
            categorical_features = categorical_features.to(device=base_device, dtype=torch.int64)

            # Refresh the static inputs in place, then rerun the captured work.
            static_categorical.copy_(categorical_features)
            static_numerical.copy_(numerical_features)
            graph.replay()
            torch.cuda.synchronize()
            step_time = time() - step_start_time

            # Warmup steps are executed but excluded from latency and AUC.
            if step >= FLAGS.benchmark_warmup_steps:
                latencies.append(step_time)
                y_true.append(click)
                # Clone: the static output is overwritten by the next replay.
                y_score.append(inference_result.reshape([-1]).clone())

        # torch.cat() rejects an empty list; this happens when every executed
        # step was a warmup step or the loader yielded nothing.
        if not y_score:
            print('auc: not computed (no batches were measured after warmup)')
            return latencies

        y_true = torch.cat(y_true)
        y_score = torch.sigmoid(torch.cat(y_score)).float()
        auc = utils.roc_auc_score(y_true, y_score)
        print('auc: ', auc)

    return latencies