def _test_conv_bf16_base(self, dim):
    """Exercise the bf16 mkldnn convolution path for Conv1d/2d/3d.

    On CPUs with bf16 support, compares the bf16 mkldnn result against the
    fp32 mkldnn result within loose tolerances; otherwise asserts that the
    bf16 conversion/run raises the expected RuntimeError.
    """
    conv_classes = {1: torch.nn.Conv1d, 2: torch.nn.Conv2d, 3: torch.nn.Conv3d}
    spatial_shapes = {1: (224,), 2: (224, 224), 3: (55, 55, 55)}
    for bias, dilation, groups in itertools.product([True, False], [1, 2], [1, 4]):
        batch = torch.randint(3, 10, (1,)).item()
        out_channels = torch.randint(1, 3, (1,)).item() * groups
        in_channels = torch.randint(1, 3, (1,)).item() * groups
        x = torch.randn((batch, in_channels) + spatial_shapes[dim],
                        dtype=torch.float32)
        conv = conv_classes[dim](in_channels=in_channels,
                                 out_channels=out_channels,
                                 kernel_size=3,
                                 stride=2,
                                 padding=1,
                                 dilation=dilation,
                                 bias=bias,
                                 groups=groups).float()
        x_bf16 = x.bfloat16()
        if has_bf16_support():
            mkldnn_conv = mkldnn_utils.to_mkldnn(copy.deepcopy(conv))
            mkldnn_conv_bf16 = mkldnn_utils.to_mkldnn(copy.deepcopy(conv),
                                                      torch.bfloat16)
            y = mkldnn_conv(x.to_mkldnn()).to_dense()
            y_bf16 = mkldnn_conv_bf16(x_bf16.to_mkldnn()).to_dense(torch.float32)
            # bf16 accumulates more rounding error than fp32: loose tolerances.
            self.assertEqual(y, y_bf16, atol=1e-1, rtol=1e-3)
        else:
            msg = r"bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"
            with self.assertRaisesRegex(RuntimeError, msg):
                mkldnn_conv_bf16 = mkldnn_utils.to_mkldnn(copy.deepcopy(conv),
                                                          torch.bfloat16)
                y_bf16 = mkldnn_conv_bf16(x_bf16.to_mkldnn()).to_dense(torch.float32)
def main_4():
    """Benchmark resnet101 forward passes: plain CPU tensors vs MKL-DNN.

    Prints the torch build config, then the wall time of one forward pass
    after four warm-up passes, first with dense tensors and then with the
    model/input converted to MKL-DNN layout.
    """
    # check MKL-DNN
    print(*torch.__config__.show().split('\n'), sep='\n')
    print('=======')
    orig_model = models.resnet101(False)
    # Bug fix: the original timed the CPU pass in train mode but the MKL-DNN
    # pass in eval mode, which is not a like-for-like comparison; put the
    # model in eval mode before both measurements.
    orig_model.eval()
    input1 = torch.rand(100, 3, 224, 224)
    with torch.no_grad():
        # warm-up runs so both measurements see a steady state
        for _ in range(4):
            orig_model(input1)
        start_time = timeit.default_timer()
        orig_model(input1)
        run_time = timeit.default_timer() - start_time
        print('Python (CPU): {}'.format(run_time))
        mkldnn_model = mkldnn_utils.to_mkldnn(orig_model)
        input1 = input1.to_mkldnn()
        for _ in range(4):
            mkldnn_model(input1)
        start_time = timeit.default_timer()
        mkldnn_model(input1)
        run_time = timeit.default_timer() - start_time
        print('Python (MKL-DNN): {}'.format(run_time))
def test_0_dimension_tensor(self):
    """mkldnn ops on tensors that contain a zero-sized dimension."""
    x = torch.rand([20, 20, 1, 1], dtype=torch.float)
    y = torch.rand([20, 20, 0, 1], dtype=torch.float)
    # unary ops work without modification
    self.assertEqual(torch.relu(y), torch.relu(y.to_mkldnn()).to_dense())
    # binary elementwise ops against the zero-sized operand
    self.assertEqual(x * y, (x.to_mkldnn() * y.to_mkldnn()).to_dense())
    self.assertEqual(x + y, (x.to_mkldnn() + y.to_mkldnn()).to_dense())
    # with autograd enabled, a 0-dim operand must raise
    x.requires_grad_(True)
    y.requires_grad_(True)
    with self.assertRaisesRegex(RuntimeError, "0-dimension Tensor in training"):
        x.to_mkldnn() + y.to_mkldnn()
    # mismatched shapes must raise as well
    with self.assertRaisesRegex(RuntimeError, "must match"):
        torch.rand([5]).to_mkldnn() + torch.rand([0]).to_mkldnn()
    # conv over an empty batch
    C = 7
    m = torch.nn.Conv2d(C, C, 3)
    x = torch.randn(0, C, C, 8, dtype=torch.float)
    self.assertEqual(m(x), mkldnn_utils.to_mkldnn(m)(x))
def setup_data_and_model(self):
    """Convert training batches to the target device/layout/precision and
    configure the network for the selected backend."""
    x_train, y_train = self.load_data()
    convert_args = (
        self.device,
        self.params["tensor_layout"],
        self.params["problem"]["precision"],
    )
    self.x_train = [set_batch_device_precision(batch, *convert_args)
                    for batch in x_train]
    self.y_train = [set_batch_device_precision(batch, *convert_args)
                    for batch in y_train]
    if self.params["problem"]["precision"] == "FP16":
        self.net.half()
    backend = self.params["backend"]
    if backend == "DNNL":
        torch.backends.mkldnn.enabled = True
        # eval mode so this does not fail when DNNL does not support training
        self.net.eval()
        if self.params["tensor_layout"] == "DNNL":
            self.net = mkldnn_utils.to_mkldnn(self.net)
        else:
            logger.warning("Using DNNL backend without DNNL tensors")
    elif backend == "native":
        torch.backends.mkldnn.enabled = False
        assert self.params["tensor_layout"] == "native"
    else:
        raise RuntimeError("Unknown backend")
def _test_conv_base(self, dim):
    """Inference conv parity between aten and the mkldnn-converted module,
    plus serialization/tracing round-trips, for Conv1d/2d/3d."""
    conv_classes = {1: torch.nn.Conv1d, 2: torch.nn.Conv2d, 3: torch.nn.Conv3d}
    spatial_shapes = {1: (224,), 2: (224, 224), 3: (55, 55, 55)}
    for bias, dilation, groups in itertools.product([True, False], [1, 2], [1, 4]):
        N = torch.randint(3, 10, (1,)).item()
        M = torch.randint(1, 3, (1,)).item() * groups
        C = torch.randint(1, 3, (1,)).item() * groups
        x = torch.randn((N, C) + spatial_shapes[dim], dtype=torch.float32)
        conv = conv_classes[dim](in_channels=C,
                                 out_channels=M,
                                 kernel_size=3,
                                 stride=2,
                                 padding=1,
                                 dilation=dilation,
                                 bias=bias,
                                 groups=groups).float()
        mkldnn_conv = mkldnn_utils.to_mkldnn(copy.deepcopy(conv))
        # reference result with the mkldnn fast path disabled
        with torch.backends.mkldnn.flags(enabled=False):
            y_aten = conv(x)
        y_mkldnn = mkldnn_conv(x.to_mkldnn()).to_dense()
        self.assertEqual(y_aten, y_mkldnn)
        self._test_serialization(mkldnn_conv, (x.to_mkldnn(),))
        self._test_tracing(mkldnn_conv, (x.to_mkldnn(),))
def _test_batch_norm_base(self, dim, channels, input):
    """Inference BatchNorm parity between dense and mkldnn modules,
    plus serialization/tracing round-trips."""
    bn_classes = {2: torch.nn.BatchNorm2d, 3: torch.nn.BatchNorm3d}
    bn = bn_classes[dim](channels).float().train(False)
    mkldnn_bn = mkldnn_utils.to_mkldnn(copy.deepcopy(bn))
    self.assertEqual(bn(input), mkldnn_bn(input.to_mkldnn()).to_dense())
    self._test_serialization(mkldnn_bn, (input.to_mkldnn(),))
    self._test_tracing(mkldnn_bn, (input.to_mkldnn(),))
def _test_imagenet_model(self, model):
    """Whole-model output parity between a dense model and its
    mkldnn-converted deep copy on a single 224x224 input."""
    model = model.train(False).float()
    mkldnn_model = mkldnn_utils.to_mkldnn(copy.deepcopy(model))
    x = torch.randn(1, 3, 224, 224, dtype=torch.float32)
    with torch.no_grad():
        self.assertEqual(model(x), mkldnn_model(x.to_mkldnn()).to_dense())
def test_linear(self):
    """Linear parity between dense and mkldnn, with and without bias."""
    in_features = torch.randint(3, 10, (1,)).item()
    out_features = torch.randint(3, 100, (1,)).item()
    x = torch.randn(3, in_features, dtype=torch.float32) * 10
    for bias in [True, False]:
        # Bug fix: `bias` was never forwarded to the constructor, so the
        # loop tested the default (bias=True) twice and the bias=False
        # path was never exercised.
        linear = torch.nn.Linear(in_features, out_features, bias=bias).float()
        mkldnn_linear = mkldnn_utils.to_mkldnn(copy.deepcopy(linear))
        self.assertEqual(linear(x), mkldnn_linear(x.to_mkldnn()).to_dense())
def test_linear_bf16(self):
    """bf16 mkldnn Linear: numeric parity when the CPU supports bf16,
    the documented RuntimeError otherwise."""
    in_features = torch.randint(3, 10, (1,)).item()
    out_features = torch.randint(3, 100, (1,)).item()
    x = torch.randn(3, in_features, dtype=torch.float32) * 10
    x_bf16 = x.bfloat16()
    for bias in [True, False]:
        linear = torch.nn.Linear(in_features, out_features, bias=bias).float()
        mkldnn_linear = mkldnn_utils.to_mkldnn(copy.deepcopy(linear))
        if has_bf16_support():
            # Bug fix: the bf16 conversion was previously done before the
            # has_bf16_support() branch; on CPUs without bf16 support the
            # conversion itself raises, so the else branch was never reached
            # (compare _test_conv_bf16_base, which builds it inside the
            # raises check).
            mkldnn_linear_bf16 = mkldnn_utils.to_mkldnn(copy.deepcopy(linear),
                                                        torch.bfloat16)
            y = mkldnn_linear(x.to_mkldnn()).to_dense()
            y_bf16 = mkldnn_linear_bf16(x_bf16.to_mkldnn()).to_dense(torch.float32)
            self.assertEqual(y, y_bf16, atol=1e-1, rtol=1e-3)
        else:
            # Match the error substring shared by conversion and forward.
            msg = r"bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"
            with self.assertRaisesRegex(RuntimeError, msg):
                mkldnn_linear_bf16 = mkldnn_utils.to_mkldnn(copy.deepcopy(linear),
                                                            torch.bfloat16)
                mkldnn_linear_bf16(x_bf16.to_mkldnn())
def test_batch_norm2d(self):
    """Inference BatchNorm2d parity between dense and mkldnn modules."""
    N = torch.randint(3, 10, (1,)).item()
    C = torch.randint(3, 100, (1,)).item()
    x = torch.randn(N, C, 35, 45, dtype=torch.float32) * 10
    for train in [False]:  # TODO: support training
        bn = torch.nn.BatchNorm2d(C).float().train(train)
        mkldnn_bn = mkldnn_utils.to_mkldnn(copy.deepcopy(bn))
        self.assertEqual(bn(x), mkldnn_bn(x.to_mkldnn()).to_dense())
def _test_conv_base(self, dim):
    """Conv parity between aten and mkldnn for Conv1d/2d/3d, covering both
    inference (with serialization/tracing checks) and training (with
    input/weight/bias gradient checks; conv1d training is skipped)."""
    conv_classes = {
        1: torch.nn.Conv1d,
        2: torch.nn.Conv2d,
        3: torch.nn.Conv3d
    }
    spatial_shapes = {1: (224,), 2: (224, 224), 3: (55, 55, 55)}
    all_options = itertools.product([True, False], [True, False], [1, 2], [1, 4])
    for train, bias, dilation, groups in all_options:
        N = torch.randint(3, 10, (1,)).item()
        M = torch.randint(1, 3, (1,)).item() * groups
        C = torch.randint(1, 3, (1,)).item() * groups
        x = torch.randn((N, C) + spatial_shapes[dim], dtype=torch.float32)
        conv = conv_classes[dim](in_channels=C,
                                 out_channels=M,
                                 kernel_size=3,
                                 stride=2,
                                 padding=1,
                                 dilation=dilation,
                                 bias=bias,
                                 groups=groups).float()
        x1 = x.clone()
        x2 = x.clone().to_mkldnn()
        if not train:
            mkldnn_conv = mkldnn_utils.to_mkldnn(copy.deepcopy(conv))
        elif dim != 1:  # TODO: enable conv1d training.
            x1.requires_grad_()
            x2.requires_grad_()
            mkldnn_conv = copy.deepcopy(conv)
        # reference computation with the mkldnn fast path disabled
        with torch.backends.mkldnn.flags(enabled=False):
            y_aten = conv(x1)
            if train and dim != 1:
                y_aten.sum().backward()
        if not train or dim != 1:
            y_mkldnn = mkldnn_conv(x2).to_dense()
            self.assertEqual(y_aten, y_mkldnn)
        if not train:
            self._test_serialization(mkldnn_conv, (x.to_mkldnn(),))
            self._test_tracing(mkldnn_conv, (x.to_mkldnn(),))
        elif dim != 1:
            y_mkldnn.sum().backward()
            self.assertTrue(x2.grad.is_mkldnn)
            self.assertEqual(x1.grad, x2.grad.to_dense())
            self.assertEqual(conv.weight.grad,
                             mkldnn_conv.weight.grad,
                             atol=1e-3,
                             rtol=1e-3)
            if bias:
                self.assertEqual(conv.bias.grad, mkldnn_conv.bias.grad)
def network(self, x):
    """Convert imgs to torch, run the network model, and return numpy arrays."""
    X = self._to_device(x)
    if self.torch:
        self.net.eval()
        if self.mkldnn:
            self.net = mkldnn_utils.to_mkldnn(self.net)
    y, style = self.net(X)
    if self.mkldnn:
        # NOTE(review): .to(torch_CPU) does not undo the mkldnn module
        # conversion performed above — confirm the net is still usable
        # (and not re-converted) on subsequent calls.
        self.net.to(torch_CPU)
    y = self._from_device(y)
    style = self._from_device(style)
    return y, style
def _test_batch_norm_bf16_base(self, dim, channels, input):
    """bf16 mkldnn BatchNorm: parity with the fp32 mkldnn result when the
    CPU supports bf16, the documented RuntimeError otherwise."""
    bn_module = {2: torch.nn.BatchNorm2d, 3: torch.nn.BatchNorm3d}
    x_bf16 = input.bfloat16()
    # TODO: support training
    for train in [False]:
        bn = bn_module[dim](channels).float().train(train)
        mkldnn_bn = mkldnn_utils.to_mkldnn(copy.deepcopy(bn))
        if has_bf16_support():
            # Bug fix: the original compared bn(input) against bn(input) —
            # both operands went through the dense fp32 module, so mkldnn_bn
            # and x_bf16 were unused and the bf16 path was never exercised.
            y = mkldnn_bn(input.to_mkldnn()).to_dense()
            y_bf16 = mkldnn_bn(x_bf16.to_mkldnn()).to_dense(torch.float)
            self.assertEqual(y, y_bf16, atol=1e-1, rtol=1e-3)
        else:
            msg = "mkldnn_batch_norm: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"
            self.assertRaisesRegex(RuntimeError, msg,
                                   lambda: bn(x_bf16.to_mkldnn()))
def test_reshape_blocked_format(self):
    """reshape() on a blocked-format mkldnn tensor matches reshaping the
    equivalent plain-format dense tensor."""
    # construct an mkldnn blocked tensor with mkldnn conv2d
    C = 7
    m = mkldnn_utils.to_mkldnn(torch.nn.Conv2d(C, C, 3))
    x = torch.randn(1, C, 8, 8).to_mkldnn()
    y_block = m(x)                # mkldnn tensor w/ blocked format
    y_plain = y_block.to_dense()  # aten tensor w/ plain format
    self.assertEqual(y_plain.reshape(C, -1),
                     y_block.reshape(C, -1).to_dense())
def pytorch_benchmark(net, width, height, number_iter, input_folder,
                      need_output=False, output_folder='', task_type='',
                      batch_size=1):
    """Time MKL-DNN inference of `net` over images from `input_folder`.

    Returns (per-batch inference times, total wall time).  When
    `need_output` is true and batch_size == 1, also prints the prediction
    for each image (the .npy save is currently disabled).
    """
    filenames = os.listdir(input_folder)
    inference_times = []
    # round the iteration count up to whole batches
    number_iter = (number_iter + batch_size - 1) // batch_size
    images, counts = load_images(width, height, input_folder,
                                 number_iter * batch_size)
    # prepack the weights into mkldnn layout once, up front
    net = mkldnn_utils.to_mkldnn(net)
    t0_total = time()
    for i in range(number_iter):
        # wrap around the image list when number_iter exceeds its length
        a = (i * batch_size) % len(images)
        b = (((i + 1) * batch_size - 1) % len(images)) + 1
        blob = images[a:b].to_mkldnn()
        t0 = time()
        output = net(blob)
        t1 = time()
        output = output.to_dense()
        # Idiom fix: `need_output == True` replaced with plain truthiness.
        if need_output and batch_size == 1:
            # Generate output name
            output_filename = str(
                os.path.splitext(os.path.basename(filenames[i]))[0]) + '.npy'
            output_filename = os.path.join(os.path.dirname(output_folder),
                                           output_filename)
            # Save output
            print(output.shape)
            print(np.argmax(np.array(output)[0]))
            # np.savetxt(output_filename, output)
        inference_times.append(t1 - t0)
    t1_total = time()
    inference_total_time = t1_total - t0_total
    return inference_times, inference_total_time
def main():
    """Train TestModel, then run MKL-DNN-accelerated inference each epoch.

    The following 3 components are required to perform training:
    1. model: instantiate the model class
    2. optim: optimization function to update topology parameters during training
    3. crite: criterion function to minimize loss
    """
    model = TestModel()
    optim = torch.optim.SGD(model.parameters(), lr=0.01)
    crite = nn.MSELoss(reduction='sum')
    # 1. Instantiate the Dataset class defined before
    # 2. Use torch.utils.data.DataLoader to load data from the Dataset instance
    train_data = TestDataset()
    trainLoader = DataLoader(train_data, batch_size=BS_TRAIN)
    test_data = TestDataset(train=False)
    testLoader = DataLoader(test_data, batch_size=BS_TEST)
    # model.train()/model.eval() switch between training and inference mode.
    # Bug fix: range(0, EPOCHNUM - 1) silently dropped the last epoch;
    # run the full EPOCHNUM epochs.
    for i in range(EPOCHNUM):
        model.train()
        for batch_index, (data, y_ans) in enumerate(trainLoader):
            # 1. Clear parameters of the optimization function
            # 2. Forward-propagation  3. Loss via the criterion
            # 4. Gradients via backward()  5. Parameter update
            optim.zero_grad()
            y = model(data)
            loss = crite(y, y_ans)
            loss.backward()
            optim.step()
        model.eval()
        # To accelerate with MKL-DNN in imperative mode, insert explicit
        # format conversions (tensor.to_mkldnn()/to_dense()) on the boundary
        # of a sequence of MKL-DNN operations, and prepack the model's weight
        # with mkldnn.to_mkldnn(model) to save the weight format conversion
        # overhead (fixes the mojibake "model▒~@~Ys" in the original note).
        # JIT-mode MKL-DNN is under development, see
        # https://github.com/pytorch/pytorch/issues/23657 — hence imperative
        # mode here.
        model_mkldnn = mkldnn.to_mkldnn(model)
        # NOTE(review): trainLoader batches unpack as (data, y_ans); verify
        # TestDataset(train=False) really yields bare `data` tensors here.
        for batch_index, data in enumerate(testLoader):
            y = model_mkldnn(data.to_mkldnn())
def test_conv2d(self):
    """Conv2d parity between dense and mkldnn across groups/bias options."""
    for groups in [1, 4]:
        N = torch.randint(3, 10, (1,)).item()
        C = torch.randint(1, 3, (1,)).item() * groups
        M = torch.randint(1, 3, (1,)).item() * groups
        x = torch.randn(N, C, 224, 224, dtype=torch.float32) * 100
        for bias in [True, False]:
            conv2d = torch.nn.Conv2d(in_channels=C,
                                     out_channels=M,
                                     kernel_size=3,
                                     stride=2,
                                     padding=1,
                                     bias=bias,
                                     groups=groups).float()
            mkldnn_conv2d = mkldnn_utils.to_mkldnn(copy.deepcopy(conv2d))
            self.assertEqual(conv2d(x),
                             mkldnn_conv2d(x.to_mkldnn()).to_dense())
def network(self, x, return_conv=False):
    """Convert imgs to torch, run the network model, and return numpy arrays;
    optionally concatenate the conv features onto y."""
    X = self._to_device(x)
    if self.torch:
        self.net.eval()
        if self.mkldnn:
            self.net = mkldnn_utils.to_mkldnn(self.net)
    y, style, conv = self.net(X)
    if self.mkldnn:
        # NOTE(review): .to(torch_CPU) does not undo the mkldnn module
        # conversion performed above — confirm intent.
        self.net.to(torch_CPU)
    y = self._from_device(y)
    style = self._from_device(style)
    if return_conv:
        conv = self._from_device(conv)
        y = np.concatenate((y, conv), axis=1)
    return y, style
def test_conv3d(self):
    """Conv3d parity between aten (mkldnn disabled) and the mkldnn module,
    plus serialization/tracing round-trips."""
    for groups in [1, 4]:
        N = torch.randint(3, 10, (1,)).item()
        C = torch.randint(1, 3, (1,)).item() * groups
        M = torch.randint(1, 3, (1,)).item() * groups
        x = torch.randn(N, C, 55, 55, 55, dtype=torch.float32)
        for bias in [True, False]:
            conv3d = torch.nn.Conv3d(in_channels=C,
                                     out_channels=M,
                                     kernel_size=3,
                                     stride=2,
                                     padding=1,
                                     bias=bias,
                                     groups=groups).float()
            mkldnn_conv3d = mkldnn_utils.to_mkldnn(copy.deepcopy(conv3d))
            # reference result with the mkldnn fast path disabled
            with torch.backends.mkldnn.flags(enabled=False):
                y_aten = conv3d(x)
            y_mkldnn = mkldnn_conv3d(x.to_mkldnn()).to_dense()
            self.assertEqual(y_aten, y_mkldnn)
            self._test_serialization(mkldnn_conv3d, (x.to_mkldnn(),))
            self._test_tracing(mkldnn_conv3d, (x.to_mkldnn(),))
def test_conv2d(self):
    """Conv2d parity between aten (mkldnn disabled) and the mkldnn module
    across groups/bias/dilation, plus serialization/tracing round-trips."""
    for groups, bias, dilation in itertools.product([1, 4], [True, False], [1, 2]):
        N = torch.randint(3, 10, (1,)).item()
        C = torch.randint(1, 3, (1,)).item() * groups
        M = torch.randint(1, 3, (1,)).item() * groups
        x = torch.randn(N, C, 224, 224, dtype=torch.float32)
        conv2d = torch.nn.Conv2d(in_channels=C,
                                 out_channels=M,
                                 kernel_size=3,
                                 stride=2,
                                 padding=1,
                                 dilation=dilation,
                                 bias=bias,
                                 groups=groups).float()
        mkldnn_conv2d = mkldnn_utils.to_mkldnn(copy.deepcopy(conv2d))
        # reference result with the mkldnn fast path disabled
        with torch.backends.mkldnn.flags(enabled=False):
            y_aten = conv2d(x)
        y_mkldnn = mkldnn_conv2d(x.to_mkldnn()).to_dense()
        self.assertEqual(y_aten, y_mkldnn)
        self._test_serialization(mkldnn_conv2d, (x.to_mkldnn(),))
        self._test_tracing(mkldnn_conv2d, (x.to_mkldnn(),))
def benchmark():
    """Benchmark forward/backward/update times for the selected convnet
    architectures on CPU (optionally mkldnn) or CUDA, printing a per-arch
    timing summary."""
    # benchmark settings
    parser = argparse.ArgumentParser(description='PyTorch Convnet Benchmark')
    parser.add_argument('--arch', action='store', default='all',
                        choices=archs_list + ['all'],
                        help='model name can be specified. all is default.')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disable CUDA')
    parser.add_argument('--mkldnn', action='store_true', default=False,
                        help='use mkldnn weight cache')
    parser.add_argument('--inference', action='store_true', default=False,
                        help='run inference only')
    parser.add_argument('--single-batch-size', action='store_true', default=False,
                        help='single batch size')
    parser.add_argument('--print-iteration-time', action='store_true', default=False,
                        help='print iteration time')
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    # support one or all models
    arch_dict = {args.arch: archs[args.arch]} if args.arch in archs_list else archs
    if args.cuda:
        import torch.backends.cudnn as cudnn
        cudnn.benchmark = True
        cudnn.deterministic = True
        kernel = 'cudnn'
        p = subprocess.check_output('nvidia-smi --query-gpu=name --format=csv',
                                    shell=True)
        device_name = str(p).split('\\n')[1]
    else:
        kernel = 'nn'
        p = subprocess.check_output(
            'cat /proc/cpuinfo | grep name | head -n 1', shell=True)
        device_name = str(p).split(':')[1][:-3]
    print('Running on device: %s' % (device_name))
    print('Running on torch: %s' % (torch.__version__))
    print('Running on torchvision: %s\n' % (torchvision.__version__))

    def _time():
        # synchronize first so GPU timings measure completed work
        if args.cuda:
            torch.cuda.synchronize()
        return time.time()

    for arch, sizes in arch_dict.items():
        if args.mkldnn and arch != 'resnet50' and arch != 'resnext101_32x8d':
            continue
        if arch == 'unet3d':
            batch_size, c, d, h, w = sizes[0], sizes[1], sizes[2], sizes[3], sizes[4]
            batch_size = 1 if args.single_batch_size else batch_size
            print('ModelType: %s, Kernels: %s Input shape: %dx%dx%dx%dx%d' %
                  (arch, kernel, batch_size, c, d, h, w))
            data = torch.randn(batch_size, c, d, h, w)
        else:
            batch_size, c, h, w = sizes[0], sizes[1], sizes[2], sizes[3]
            # Bug fix: `arch is 'resnet50'` compared string *identity*, which
            # depends on interpreter interning; use equality.
            batch_size = 64 if arch == 'resnet50' and args.inference else batch_size
            batch_size = 1 if args.single_batch_size else batch_size
            print('ModelType: %s, Kernels: %s Input shape: %dx%dx%dx%d' %
                  (arch, kernel, batch_size, c, h, w))
            data = torch.randn(batch_size, c, h, w)
        target = torch.arange(1, batch_size + 1).long()
        net = models.__dict__[arch]()  # no need to load pre-trained weights for dummy data
        optimizer = optim.SGD(net.parameters(), lr=0.01)
        criterion = nn.CrossEntropyLoss()
        if args.cuda:
            data, target = data.cuda(), target.cuda()
            net.cuda()
            criterion = criterion.cuda()
        if args.mkldnn:
            data = data.to_mkldnn()
            if args.inference:
                net = mkldnn_utils.to_mkldnn(net)
        if args.inference:
            net.eval()
        else:
            net.train()
        net.aux_logits = False
        # dry runs to reach steady state before timing
        for i in range(nDryRuns):
            optimizer.zero_grad()   # zero the gradient buffers
            output = net(data)
            if not args.inference:
                if args.mkldnn:
                    output = output.to_dense()
                loss = output.sum() / 1e6 if 'unet' in arch else criterion(output, target)
                loss.backward()
                optimizer.step()    # Does the update
        time_fwd, time_bwd, time_upt = 0, 0, 0
        for i in range(steps):
            optimizer.zero_grad()   # zero the gradient buffers
            t1 = _time()
            output = net(data)
            t2 = _time()
            if not args.inference:
                if args.mkldnn:
                    output = output.to_dense()
                loss = output.sum() / 1e6 if 'unet' in arch else criterion(output, target)
                loss.backward()
                t3 = _time()
                optimizer.step()    # Does the update
                t4 = _time()
            time_fwd = time_fwd + (t2 - t1)
            if args.print_iteration_time:
                print("%-30s %d: %10.2f ms" %
                      ('forward iteration', i, (t2 - t1) * 1000))
            if not args.inference:
                time_bwd = time_bwd + (t3 - t2)
                time_upt = time_upt + (t4 - t3)
        time_fwd_avg = time_fwd / steps * 1000
        time_bwd_avg = time_bwd / steps * 1000
        time_upt_avg = time_upt / steps * 1000
        # update not included!
        time_total = time_fwd_avg + time_bwd_avg
        print("%-30s %10s %10.2f (ms) %10.2f (imgs/s)" %
              (kernel, ':forward:', time_fwd_avg, batch_size * 1000 / time_fwd_avg))
        print("%-30s %10s %10.2f (ms)" % (kernel, ':backward:', time_bwd_avg))
        print("%-30s %10s %10.2f (ms)" % (kernel, ':update:', time_upt_avg))
        print("%-30s %10s %10.2f (ms) %10.2f (imgs/s)" %
              (kernel, ':total:', time_total, batch_size * 1000 / time_total))
def validate(val_loader, model, criterion, args, is_INT8=False, is_calibration=False):
    """Run evaluation (or INT8 calibration) over val_loader and return top-1 avg.

    Two paths: `args.dummy` times a single synthetic batch repeatedly;
    otherwise real batches from val_loader are used.  Iteration/warmup counts
    differ for calibration runs.
    """
    if is_calibration:
        iterations = args.iter_calib
        warmup = 0
    else:
        iterations = args.iterations
        warmup = args.warmup_iterations
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5],
                             prefix='Test: ')
    # switch to evaluate mode
    model.eval()
    # Prepack weights into mkldnn layout (fp32 or bf16) for CPU evaluation.
    if args.evaluate and args.mkldnn and not args.cuda and not is_INT8:
        if args.bf16:
            model = mkldnn_utils.to_mkldnn(model, torch.bfloat16)
        else:
            model = mkldnn_utils.to_mkldnn(model)
        # TODO using mkldnn weight cache
    if args.dummy:
        # Synthetic input path: one fixed batch reused every iteration.
        images = torch.randn(args.batch_size, 3, 224, 224)
        target = torch.arange(1, args.batch_size + 1).long()
        if not is_INT8:
            if args.gpu is not None and args.cuda:
                images = images.cuda(args.gpu, non_blocking=True)
            if args.cuda:
                target = target.cuda(args.gpu, non_blocking=True)
            if args.bf16 and not args.cuda:
                images = images.to_mkldnn(torch.bfloat16)
            elif args.mkldnn and not args.cuda:
                images = images.to_mkldnn()
            if args.ipex:
                images = images.to(device='dpcpp:0')
        number_iter = len(val_loader)
        with torch.no_grad():
            for i in range(number_iter):
                # run until iterations+warmup batches are done (0 == unlimited)
                if not args.evaluate or iterations == 0 or i < iterations + warmup:
                    if i >= warmup:
                        end = time.time()
                    # compute output
                    output = model(images)
                    # measure elapsed time
                    if i >= warmup:
                        batch_time.update(time.time() - end)
                    if args.mkldnn and not args.cuda and not is_INT8:
                        output = output.to_dense()
                    loss = criterion(output, target)
                    # measure accuracy and record loss
                    acc1, acc5 = accuracy(output, target, topk=(1, 5))
                    losses.update(loss.item(), images.size(0))
                    top1.update(acc1[0], images.size(0))
                    top5.update(acc5[0], images.size(0))
                    if i % args.print_freq == 0:
                        progress.display(i)
                elif i == iterations + warmup:
                    break
            if args.profile:
                # print("export profiling file to {}".format(torch.backends.quantized.engine + "_result.json"))
                with torch.autograd.profiler.profile() as prof:
                    output = model(images)
                prof.export_chrome_trace(torch.backends.quantized.engine + "_result.json")
                table_res = prof.key_averages().table(sort_by="cpu_time_total")
                print(table_res)
                save_profile_result(
                    torch.backends.quantized.engine + "_result_average.xlsx",
                    table_res)
        # TODO: this should also be done with the ProgressMeter
        if args.evaluate:
            batch_size = val_loader.batch_size
            latency = batch_time.avg / batch_size * 1000
            perf = batch_size / batch_time.avg
            print('inference latency %3.0f ms' % latency)
            print('inference performance %3.0f fps' % perf)
            print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
                top1=top1, top5=top5))
    else:
        # Real-data path: same measurement logic, per-batch device/layout
        # conversions done inside the loop.
        with torch.no_grad():
            for i, (images, target) in enumerate(val_loader):
                if not args.evaluate or iterations == 0 or i < iterations + warmup:
                    if i >= warmup:
                        end = time.time()
                    if not is_INT8:
                        if args.gpu is not None and args.cuda:
                            images = images.cuda(args.gpu, non_blocking=True)
                        if args.cuda:
                            target = target.cuda(args.gpu, non_blocking=True)
                        if args.bf16 and not args.cuda:
                            images = images.to_mkldnn(torch.bfloat16)
                        elif args.mkldnn and not args.cuda:
                            images = images.to_mkldnn()
                        if args.ipex:
                            images = images.to(device='dpcpp:0')
                    # compute output
                    output = model(images)
                    # measure elapsed time
                    if i >= warmup:
                        batch_time.update(time.time() - end)
                    if args.mkldnn and not args.cuda and not is_INT8:
                        output = output.to_dense()
                    loss = criterion(output, target)
                    # measure accuracy and record loss
                    acc1, acc5 = accuracy(output, target, topk=(1, 5))
                    losses.update(loss.item(), images.size(0))
                    top1.update(acc1[0], images.size(0))
                    top5.update(acc5[0], images.size(0))
                    if i % args.print_freq == 0:
                        progress.display(i)
                elif i == iterations + warmup:
                    break
            if args.profile:
                # print("export profiling file to {}".format(torch.backends.quantized.engine + "_result.json"))
                with torch.autograd.profiler.profile() as prof:
                    output = model(images)
                prof.export_chrome_trace(torch.backends.quantized.engine + "_result.json")
                table_res = prof.key_averages().table(sort_by="cpu_time_total")
                print(table_res)
                save_profile_result(
                    torch.backends.quantized.engine + "_result_average.xlsx",
                    table_res)
        # TODO: this should also be done with the ProgressMeter
        if args.evaluate:
            batch_size = val_loader.batch_size
            latency = batch_time.avg / batch_size * 1000
            perf = batch_size / batch_time.avg
            print('inference latency %3.0f ms' % latency)
            print('inference performance %3.0f fps' % perf)
            print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
                top1=top1, top5=top5))
    return top1.avg
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    """Evaluate (or calibrate) a SQuAD QA model; returns (results, perf).

    When `calibration` is true, only ~5% of the dataset is run and no
    predictions/metrics are computed (results is None).
    """
    dataset, examples, features = load_and_cache_examples(args, tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)
    dataset_cached = "./dataset_cached"
    if not os.path.exists(dataset_cached):
        os.makedirs(dataset_cached)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)
    # calibration budget: ~5% of the dataset, rounded up to whole batches
    calibation_iteration = int(
        (len(dataset) * 0.05 + args.eval_batch_size - 1) / args.eval_batch_size)
    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(dataset))
    print(" Batch size = %d" % args.eval_batch_size)
    if args.mkldnn_eval:
        from torch.utils import mkldnn as mkldnn_utils
        model = mkldnn_utils.to_mkldnn(model)
        print(model)
    all_results = []
    evalTime = 0
    nb_eval_steps = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        if calibration and nb_eval_steps >= calibation_iteration:
            break
        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
            if args.model_type != 'distilbert':
                # XLM don't use segment_ids
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
            example_indices = batch[3]
            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            # only time the steps after the warmup window
            if nb_eval_steps >= args.warmup:
                start_time = timeit.default_timer()
            outputs = model(**inputs)
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            output = [to_list(output[i]) for output in outputs]
            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]
                result = SquadResult(unique_id, start_logits, end_logits,
                                     start_top_index=start_top_index,
                                     end_top_index=end_top_index,
                                     cls_logits=cls_logits)
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)
        if nb_eval_steps >= args.warmup:
            evalTime += (timeit.default_timer() - start_time)
        nb_eval_steps += 1
        if args.iter > 0 and nb_eval_steps >= (args.warmup + args.iter):
            break
    if nb_eval_steps >= args.warmup:
        perf = (nb_eval_steps - args.warmup) * args.eval_batch_size / evalTime
        if args.eval_batch_size == 1:
            print('Latency: %.3f ms' % (evalTime /
                                        (nb_eval_steps - args.warmup) * 1000))
        print("Evaluation done in total %f secs (Throughput: %f samples/sec)" %
              (evalTime, perf))
    else:
        logger.info(
            "*****no performance, please check dataset length and warmup number *****"
        )
    # Compute predictions
    output_prediction_file = os.path.join(dataset_cached,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        dataset_cached, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            dataset_cached, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None
    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ['xlnet', 'xlm']:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top
        predictions = compute_predictions_log_probs(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, start_n_top, end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    elif not calibration and args.iter == 0:
        predictions = compute_predictions_logits(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case, output_prediction_file,
            output_nbest_file, output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)
    # Compute the F1 and exact scores.
    if not calibration and args.iter == 0:
        results = squad_evaluate(examples, predictions)
        # report the first accuracy-like key present in the metrics dict
        bert_task_acc_keys = ['best_f1', 'f1', 'mcc', 'spearmanr', 'acc']
        for key in bert_task_acc_keys:
            if key in results.keys():
                acc = results[key]
                break
        print("Accuracy: %.5f" % acc)
    else:
        results = None
    return results, perf
def inference(model, dataloader, datatype, args):
    """Run detection inference (optionally IPEX-optimized or JIT-traced),
    collect a profiler trace at the end of warmup, then time a full pass
    and report COCO bbox/segm AP, latency, and throughput."""
    batch_time = AverageMeter('Time', ':6.3f')
    batch_size = args.batch_size
    warmup_iters = args.warmup_iterations
    max_iters = args.max_iterations if dataloader is None else len(dataloader)
    model.eval()
    coco = get_coco_api_from_dataset(dataloader.dataset)
    iou_types = ["bbox"]
    iou_types.append("segm")
    coco_evaluator = CocoEvaluator(coco, iou_types)
    if args.ipex:
        import intel_extension_for_pytorch as ipex
        model = model.to(memory_format=torch.channels_last)
        model = ipex.optimize(model,
                              dtype=datatype,
                              level="O1",
                              conv_bn_folding=False,
                              replace_dropout_with_identity=False)
        model.backbone = ipex.optimize(model.backbone,
                                       dtype=datatype,
                                       level="O1")
    else:
        if args.jit:
            model = model.to(memory_format=torch.channels_last)
        else:
            from torch.utils import mkldnn as mkldnn_utils
            model = mkldnn_utils.to_mkldnn(model, dtype=datatype)
    if args.jit:
        # trace and freeze only the backbone on a fixed-size dummy input
        x = torch.randn(batch_size, 3, 1200, 1200).to(memory_format=torch.channels_last)
        if args.precision == "bf16":
            with torch.cpu.amp.autocast(), torch.no_grad():
                model.backbone = torch.jit.trace(model.backbone, x, strict=False)
            model.backbone = torch.jit.freeze(model.backbone)
        else:
            with torch.no_grad():
                model.backbone = torch.jit.trace(model.backbone, x, strict=False)
            model.backbone = torch.jit.freeze(model.backbone)
    with torch.no_grad():
        if dataloader is None:
            print(
                "Models for detection tasks need to use real dataset. You need to specify coco dataset. "
            )
            exit(1)
        else:
            # Warmup pass; the iteration at i == warmup_iters also records a
            # profiler trace (model_inference).
            for i, batch in enumerate(dataloader):
                images = batch[0]
                if not args.ipex and not args.jit:
                    images = list(img.to(datatype) for img in images)
                if args.ipex and args.precision == "bf16":
                    with torch.cpu.amp.autocast():
                        if i == warmup_iters:
                            with profile(
                                    activities=[ProfilerActivity.CPU],
                                    record_shapes=True
                            ) as prof, record_function("model_inference"):
                                output = model(images)
                        else:
                            output = model(images)
                else:
                    if i == warmup_iters:
                        with profile(
                                activities=[ProfilerActivity.CPU],
                                record_shapes=True) as prof, record_function(
                                    "model_inference"):
                            output = model(images)
                    else:
                        output = model(images)
                if i > warmup_iters:
                    break
            # Timed + evaluated pass over the dataloader.
            for i, batch in enumerate(dataloader):
                images = batch[0]
                end = time.time()
                if not args.ipex and not args.jit:
                    images = list(img.to(datatype) for img in images)
                if args.ipex and args.precision == "bf16":
                    with torch.cpu.amp.autocast():
                        output = model(images)
                else:
                    output = model(images)
                batch_time.update(time.time() - end)
                # cast predictions back to fp32 for COCO evaluation
                output = [{k: v.to(torch.float32) for k, v in t.items()}
                          for t in output]
                res = {
                    target["image_id"].item(): output
                    for target, output in zip(batch[1], output)
                }
                coco_evaluator.update(res)
                if max_iters != -1 and i >= max_iters:
                    break
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1))
    latency = batch_time.avg / batch_size * 1000
    perf = batch_size / batch_time.avg
    coco_evaluator.synchronize_between_processes()
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    print("Bbox AP: {:.5f} ".format(coco_evaluator.coco_eval['bbox'].stats[0]))
    print("Segm AP: {:.5f} ".format(coco_evaluator.coco_eval['segm'].stats[0]))
    print('Latency: %.3f ms' % latency)
    print("Throughput: {:.3f} fps".format(perf))
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate a GLUE model, optionally via MKLDNN and/or a cached JIT trace.

    Handles MNLI double evaluation (matched / mismatched). When
    args.cached_weights is set, the model is traced once on the first batch,
    saved to 'jit_model.pt', reloaded, and called positionally; otherwise it
    is called with keyword inputs. Batches after the 50th are timed to report
    a steady-state batches/sec figure.

    Returns:
        dict of metric results accumulated across eval tasks.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        if args.mkldnn:
            from torch.utils import mkldnn as mkldnn_utils
            model = mkldnn_utils.to_mkldnn(model)
            print(model)
        if args.cached_weights:
            # Trace on the first batch only, then reload the saved module.
            for batch in tqdm(eval_dataloader, desc="Tracing"):
                model.eval()
                batch = tuple(t.to(args.device) for t in batch)
                with torch.no_grad():
                    # inputs = {'input_ids': batch[0],
                    #           'attention_mask': batch[1],
                    #           'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                    #           'labels': batch[3]}
                    # traced = torch.jit.trace(model, **inputs, check_trace=False)
                    # NOTE(review): positional order here is (input_ids,
                    # token_type_ids, attention_mask, labels) — it must match
                    # the model's forward signature; verify against the model.
                    traced = torch.jit.trace(model, (batch[0], batch[2], batch[1], batch[3]), check_trace=False)
                    script = traced.save('jit_model.pt')
                break
            model = torch.jit.load('jit_model.pt')
        total_time = 0
        num = 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                          'labels': batch[3]}
                t0 = time.time()
                if args.cached_weights:
                    # traced module takes positional args (same order as trace)
                    outputs = model(batch[0], batch[2], batch[1], batch[3])
                else:
                    outputs = model(**inputs)
                # skip the first 50 batches when accumulating timing (warmup)
                if num > 50:
                    total_time += time.time() - t0
                num += 1
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
        # steady-state throughput over the batches that were actually timed
        print('{} batch/s'.format((num - 50) / total_time))
        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    return results
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point for (optionally distributed) ImageNet training.

    Builds the model, wraps it for DDP / DataParallel / single-GPU / CPU as
    requested, optionally converts model and optimizer to MKLDNN, then runs
    the epoch loop (train + validate + checkpoint).

    Args:
        gpu: GPU index assigned to this process (None for CPU).
        ngpus_per_node: GPUs per node, used to derive the global rank and to
            split batch size / workers in multiprocessing-distributed mode.
        args: parsed command-line namespace.
    """
    global best_acc1
    args.gpu = gpu
    if args.gpu is not None:
        # Fix: this message was accidentally printed twice in the original.
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            print(os.environ["RANK"])
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        print("args.dist_backend {}".format(args.dist_backend))
        print("args.dist_url {}".format(args.dist_url))
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            if use_gpu:
                model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            if use_gpu:
                print("create DistributedDataParallel")
                model = torch.nn.parallel.DistributedDataParallel(model)
            else:
                model.cpu()
                # NOTE(review): the message says "DistributedDataParallelCPU"
                # but the modern DistributedDataParallel class is used — that
                # is the supported replacement since that class was removed.
                print("create DistributedDataParallelCPU")
                model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)
    # define loss function (criterion) and optimizer
    if use_gpu:
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    else:
        criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    # support mkldnn
    if (args.mkldnn and not args.cuda):
        print("##############mkldnn##############")
        model = mkldnn_utils.to_mkldnn(model)
        optimizer_util.to_mkldnn(optimizer)
        print("using mkldnn model\n")
    if use_gpu:
        cudnn.benchmark = True
    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)
    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        # train for one epoch
        print("run epoch '{}'".format(epoch))
        train(train_loader, model, criterion, optimizer, epoch, args)
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)
        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        if (args.rank == 0):
            if (args.mkldnn and not args.cuda):
                # MKLDNN tensors cannot be serialized directly: convert back
                # to dense for checkpointing, then re-convert afterwards.
                model = mkldnn_utils.to_dense(model)
                optimizer_util.to_dense(optimizer)
            print("#################save#################")
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
            if (args.mkldnn and not args.cuda):
                model = mkldnn_utils.to_mkldnn(model)
                optimizer_util.to_mkldnn(optimizer)
def main(args):
    """Benchmark one torchvision model's forward+backward step throughput.

    Selects a device (CUDA / IPEX / CPU), optionally converts the model to
    MKLDNN, then times `benchmark_step` with `timeit` over warmup and
    measurement iterations and logs images/sec.

    Fix: the per-iteration elapsed variable was named `time`, shadowing the
    stdlib `time` module name; renamed to `elapsed`.
    """
    if args.ipex:
        import intel_pytorch_extension as ipex
        if args.fp16:
            ipex.enable_auto_mixed_precision(mixed_dtype=torch.bfloat16)
    use_amp = False
    if not args.no_cuda and torch.cuda.is_available():
        device = torch.device('cuda')
        if args.fp16:
            use_amp = True
    elif args.ipex:
        device = ipex.DEVICE
    else:
        device = torch.device('cpu')
    log('Using PyTorch version: %s, Device: %s' % (torch.__version__, device))
    log(torch.__config__.show())
    cudnn.benchmark = True
    # Set up standard model.
    log('Initializing %s model...' % args.model)
    model = getattr(models, args.model)()
    model = model.to(device)
    if args.multi_gpu and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
        log('Using %d GPUs with torch.nn.DataParallel' % torch.cuda.device_count())
    if args.mkldnn:
        model = mkldnn_utils.to_mkldnn(model)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    imsize = 224
    if args.model == 'inception_v3':
        imsize = 299

    def benchmark_step():
        """One synthetic train step: forward, loss, backward, optimizer step."""
        # data, target = next(iter(loader))
        data = torch.randn(args.batch_size, 3, imsize, imsize)
        target = torch.LongTensor(args.batch_size).random_() % 1000
        if args.mkldnn:
            data = data.to_mkldnn()
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(data)
            if args.mkldnn:
                # loss/backward require dense tensors
                output = output.to_dense()
            if args.model == 'inception_v3':
                loss = F.cross_entropy(output.logits, target)
            else:
                loss = F.cross_entropy(output, target)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    log('Model: %s' % args.model)
    log('Batch size: %d' % args.batch_size)
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)
    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        elapsed = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / elapsed
        log('Iter #%d: %.1f img/sec' % (x, img_sec))
        img_secs.append(img_sec)
    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Total img/sec %.1f +-%.1f' % (img_sec_mean, img_sec_conf))
def main():
    """GLUE fine-tuning / evaluation / quantization driver.

    Parses command-line arguments, sets up (distributed) devices and logging,
    loads model/tokenizer, then dispatches to training, evaluation, iLiT
    tuning, INT8 calibration, or INT8 inference as requested.

    Fixes: help-string typos ("Rul evaluation" -> "Run evaluation",
    "Weight deay" -> "Weight decay"), error-message typo ("calibrantion
    befor" -> "calibration before"), and `acc` being unbound in
    `eval_func_for_ilit` when no known metric key is present.

    Returns:
        dict of evaluation results keyed by metric (+ checkpoint step suffix).
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument(
        "--model_type", default=None, type=str, required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path", default=None, type=str, required=True,
        help="Path to pre-trained model or shortcut name selected in the list;" + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name", default=None, type=str, required=True,
        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument(
        "--config_name", default="", type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name", default="", type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir", default="", type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action='store_true',
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case", action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps", default=-1, type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints", action='store_true',
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument("--mkldnn_eval", action='store_true',
                        help="evaluation with MKLDNN")
    parser.add_argument("--mkldnn_train", action='store_true',
                        help="training with MKLDNN")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache', action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16', action='store_true',
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level', type=str, default='O1',
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="For distant debugging.")
    parser.add_argument("--do_fp32_inference", action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument("--do_calibration", action='store_true',
                        help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference", action='store_true',
                        help="Whether to run int8 inference.")
    parser.add_argument("--do_bf16", action='store_true',
                        help="run bf16 evaluation / training.")
    parser.add_argument("--tune", action='store_true',
                        help="run ilit to tune int8 acc.")
    parser.add_argument("--warmup", type=int, default=2,
                        help="warmup for performance")

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # quantization paths need the fused mixed-QKV attention variant
    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        bf16=args.do_bf16,
        mkldnn_train=args.mkldnn_train,
        cache_dir=args.cache_dir if args.cache_dir else None)
    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name,
                                                tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            logger.info("Evaluate:" + args.task_name)
            if args.mkldnn_eval or args.do_fp32_inference or args.do_bf16:
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                # NOTE(review): `evaluate` defined later in this file returns
                # (results, perf); confirm which `evaluate` is in scope here.
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + '_{}'.format(global_step), v)
                              for k, v in result.items())
                results.update(result)
            if args.tune:

                def eval_func_for_ilit(model):
                    """Accuracy callback handed to iLiT's tuner."""
                    result, perf = evaluate(args, model, tokenizer, prefix=prefix)
                    bert_task_acc_keys = [
                        'acc_and_f1', 'f1', 'mcc', 'spearmanr', 'acc'
                    ]
                    # Fix: default so `acc` is defined even when no known
                    # metric key is present in the result dict.
                    acc = 0
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(
                                key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                eval_task_names = (
                    "mnli", "mnli-mm") if args.task_name == "mnli" else (
                        args.task_name, )
                for eval_task in eval_task_names:
                    eval_dataset = load_and_cache_examples(args, eval_task,
                                                           tokenizer,
                                                           evaluate=True)
                    args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                        1, args.n_gpu)
                    # multi-gpu eval
                    if args.n_gpu > 1:
                        model = torch.nn.DataParallel(model)
                    if args.mkldnn_eval:
                        from torch.utils import mkldnn as mkldnn_utils
                        model = mkldnn_utils.to_mkldnn(model)
                        print(model)
                    import ilit
                    tuner = ilit.Tuner("./conf.yaml")
                    if eval_task != "squad":
                        eval_task = 'classifier'
                    eval_dataset = tuner.dataset('bert',
                                                 dataset=eval_dataset,
                                                 task=eval_task)
                    test_dataloader = tuner.dataloader(
                        eval_dataset, batch_size=args.eval_batch_size)
                    tuner.tune(model, test_dataloader,
                               eval_func=eval_func_for_ilit)
                exit(0)
            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    # this layer is known to hurt accuracy when quantized
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model, layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                result, _ = evaluate(args, model, tokenizer,
                                     prefix=global_step, calibration=True)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                print(model)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)
            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model, layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.error(
                        "please do calibration before running int8 inference")
                    return
                prepare(model, inplace=True)
                convert(model, inplace=True)
                model_bin_file = os.path.join(quantized_model_path,
                                              "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)
    return results
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    """Evaluate a GLUE model, optionally in calibration mode for quantization.

    In calibration mode the batch size is fixed at 16 and the loop stops
    after ~5% of the dataset (enough samples to calibrate observers).
    Steady-state throughput is measured over batches after `args.warmup`.

    Fixes:
      * `perf` could be unbound at the return when the loop never reached
        `args.warmup` steps — it is now initialized to 0.0.
      * a duplicated `args.eval_batch_size` assignment that clobbered the
        calibration batch size of 16 has been removed.
      * "perfformance" log-message typos corrected.

    Returns:
        (results, perf): metric dict and measured samples/sec (0.0 if the
        warmup threshold was never reached).
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    results = {}
    perf = 0.0  # fix: defined even when the warmup threshold is never reached
    for eval_task in eval_task_names:
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer,
                                               evaluate=True)
        if calibration:
            # small fixed batch size for observer calibration
            args.eval_batch_size = 16
        else:
            args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        # calibrate on roughly 5% of the dataset (rounded up to whole batches)
        calibation_iteration = int(
            (len(eval_dataset) * 0.05 + args.eval_batch_size - 1) /
            args.eval_batch_size)
        # multi-gpu eval
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        if args.mkldnn_eval:
            from torch.utils import mkldnn as mkldnn_utils
            model = mkldnn_utils.to_mkldnn(model)
            print(model)
        import timeit
        total_time = 0.0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            if calibration and nb_eval_steps >= calibation_iteration:
                break
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args.model_type in [
                        'bert', 'xlnet'
                    ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                # only time batches past the warmup threshold
                if nb_eval_steps >= args.warmup:
                    start = timeit.default_timer()
                outputs = model(**inputs)
                if nb_eval_steps >= args.warmup:
                    total_time += (timeit.default_timer() - start)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if args.do_bf16:
                # bf16 tensors must be upcast to float before .numpy()
                if preds is None:
                    preds = logits.detach().cpu().to(torch.float).numpy()
                    out_label_ids = inputs['labels'].detach().cpu().to(
                        torch.float).numpy()
                else:
                    preds = np.append(preds,
                                      logits.detach().cpu().to(
                                          torch.float).numpy(),
                                      axis=0)
                    out_label_ids = np.append(
                        out_label_ids,
                        inputs['labels'].detach().cpu().to(
                            torch.float).numpy(),
                        axis=0)
            else:
                if preds is None:
                    preds = logits.detach().cpu().numpy()
                    out_label_ids = inputs['labels'].detach().cpu().numpy()
                else:
                    preds = np.append(preds,
                                      logits.detach().cpu().numpy(),
                                      axis=0)
                    out_label_ids = np.append(
                        out_label_ids,
                        inputs['labels'].detach().cpu().numpy(),
                        axis=0)
        if nb_eval_steps >= args.warmup:
            perf = (len(eval_dataloader) -
                    args.warmup) * args.eval_batch_size / total_time
            logger.info("***** performance {} samples/s *****".format(perf))
        else:
            logger.info(
                "*****no performance, please check dataset length and warmup number *****"
            )
        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
    return results, perf
def main():
    """Train or evaluate a ResNeXt3D video model on UCF-101.

    Selects CUDA / MKLDNN / native CPU backend, builds datasets, model, loss,
    optimizer and meters from the ResNeXt3D config, and (for MKLDNN) swaps
    the head's fully-convolutional linear layer for an MKLDNN-friendly one.

    Fix: the non-evaluate MKLDNN branch assigned the undefined name
    `mkldnn_head_fc` (NameError) — it now uses `mkldnn_head_fcl`.
    """
    args = parser.parse_args()
    print(args)
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda and args.mkldnn:
        assert False, "We can not runing this work on GPU backend and MKLDNN backend \
              please set one backend.\n"
    if args.cuda:
        print("Using GPU backend to do this work.\n")
    elif args.mkldnn:
        print("Using MKLDNN backend to do this work.\n")
    else:
        print("Using native CPU backend to do this work.\n")
    # set it to the folder where video files are saved
    video_dir = args.video_dir + "/UCF-101"
    # set it to the folder where dataset splitting files are saved
    splits_dir = args.video_dir + "/ucfTrainTestlist"
    # set it to the file path for saving the metadata
    metadata_file = args.video_dir + "/metadata.pth"
    resnext3d_configs = model_config.ResNeXt3D_Config(video_dir, splits_dir,
                                                      metadata_file,
                                                      args.num_epochs)
    resnext3d_configs.setUp()
    datasets = {}
    dataset_train_configs = resnext3d_configs.dataset_configs["train"]
    dataset_test_configs = resnext3d_configs.dataset_configs["test"]
    dataset_train_configs["batchsize_per_replica"] = args.batch_size_train
    # For testing, batchsize per replica should be equal to clips_per_video
    dataset_test_configs["batchsize_per_replica"] = args.batch_size_eval
    dataset_test_configs["clips_per_video"] = args.batch_size_eval
    datasets["train"] = build_dataset(dataset_train_configs)
    datasets["test"] = build_dataset(dataset_test_configs)
    model = build_model(resnext3d_configs.model_configs)
    meters = build_meters(resnext3d_configs.meters_configs)
    loss = build_loss({"name": "CrossEntropyLoss"})
    optimizer = build_optimizer(resnext3d_configs.optimizer_configs)
    # there some ops are not supported by MKLDNN, so convert input to CPU tensor
    if args.mkldnn:
        heads_configs = resnext3d_configs.model_configs['heads'][0]
        in_plane = heads_configs['in_plane']
        num_classes = heads_configs['num_classes']
        act_func = heads_configs['activation_func']
        mkldnn_head_fcl = MkldnnFullyConvolutionalLinear(in_plane,
                                                         num_classes,
                                                         act_func)
        if args.evaluate:
            model = model.eval()
            model = mkldnn_utils.to_mkldnn(model)
            model._heads['pathway0-stage4-block2']['default_head'].head_fcl = mkldnn_head_fcl.eval()
        else:
            # fix: was the undefined name `mkldnn_head_fc`
            model._heads['pathway0-stage4-block2']['default_head'].head_fcl = mkldnn_head_fcl
    # print(model)
    if args.evaluate:
        validata(datasets, model, loss, meters, args)
        return
    train(datasets, model, loss, optimizer, meters, args)