class CPUUnaryBench(Benchmark):
    """Benchmark the functions listed in ``TORCH_ONLY_FUNCTIONS`` with Torch.

    Both hooks forward to the module-level ``_setupRun`` / ``_benchmark``
    helpers, so this class only contributes the argument grid.
    """

    # The base mapping goes through ``_common_arg`` before ``utils.grid``;
    # presumably that merges in the shared size/layout axes -- confirm
    # against the helper's definition.
    args = utils.grid(
        _common_arg(
            dict(function=TORCH_ONLY_FUNCTIONS, framework=("Torch",))
        )
    )

    def setupRun(self, state, arg):
        """Prepare ``state`` for one grid entry via the shared helper."""
        _setupRun(self, state, arg)

    def benchmark(self, state, arg):
        """Run the measured operation via the shared helper."""
        _benchmark(self, state, arg)
class NumpyComparison(Benchmark):
    """Benchmark ``ALL_UNARY_FUNCTIONS`` under both Torch and NumPy.

    Both hooks forward to the module-level ``_setupRun`` / ``_benchmark``
    helpers, so this class only contributes the argument grid.
    """

    # The base mapping goes through ``_common_arg`` before ``utils.grid``;
    # presumably that merges in the shared size/layout axes -- confirm
    # against the helper's definition.
    args = utils.grid(
        _common_arg(
            dict(function=ALL_UNARY_FUNCTIONS, framework=("Torch", "NumPy"))
        )
    )

    def setupRun(self, state, arg):
        """Prepare ``state`` for one grid entry via the shared helper."""
        _setupRun(self, state, arg)

    def benchmark(self, state, arg):
        """Run the measured operation via the shared helper."""
        _benchmark(self, state, arg)
class CPUUnaryBench(Benchmark):
    """Benchmark the functions listed in ``TORCH_ONLY_FUNCTIONS`` with Torch.

    Both hooks forward to the module-level ``_setupRun`` / ``_benchmark``
    helpers; this class contributes the argument grid and the user
    counters.
    """

    # The base mapping goes through ``_common_arg`` before ``utils.grid``;
    # presumably that merges in the shared size/layout axes -- confirm
    # against the helper's definition.
    args = utils.grid(
        _common_arg(
            dict(function=TORCH_ONLY_FUNCTIONS, framework=("Torch",))
        )
    )

    # Each counter starts out as a 30-character blank string.
    user_counters = dict.fromkeys(("sizes", "strides"), " " * 30)

    def setupRun(self, state, arg):
        """Prepare ``state`` for one grid entry via the shared helper."""
        _setupRun(self, state, arg)

    def benchmark(self, state, arg):
        """Run the measured operation via the shared helper."""
        _benchmark(self, state, arg)
class NumpyUnaryComparison(Benchmark):
    """Benchmark ``ALL_UNARY_FUNCTIONS`` under both Torch and NumPy.

    Both hooks forward to the module-level ``_setupRun`` / ``_benchmark``
    helpers; this class contributes the argument grid and the user
    counters.
    """

    # The base mapping goes through ``_common_arg`` before ``utils.grid``;
    # presumably that merges in the shared size/layout axes -- confirm
    # against the helper's definition.
    args = utils.grid(
        _common_arg(
            dict(function=ALL_UNARY_FUNCTIONS, framework=("Torch", "NumPy"))
        )
    )

    # Each counter starts out as a 30-character blank string.
    user_counters = dict.fromkeys(("sizes", "strides"), " " * 30)

    def setupRun(self, state, arg):
        """Prepare ``state`` for one grid entry via the shared helper."""
        _setupRun(self, state, arg)

    def benchmark(self, state, arg):
        """Run the measured operation via the shared helper."""
        _benchmark(self, state, arg)
class NumpyReduceComparison(Benchmark):
    """Benchmark reduction functions from ``ALL_REDUCE_FUNCTIONS``.

    Each grid entry carries ``dims = (ndim, reduce_dim)``; ``reduce_dim`` is
    either ``None`` (reduce over the whole tensor) or the index of the
    dimension to reduce along.  ``function`` is indexed as
    ``(torch_name, numpy_name)``.
    """

    # NB: NumPy doesn't parallelize its reductions
    args = utils.grid({
        "dims": ((3, None), (3, 2), (3, 1), (3, 0)),
        "mag": (6, ),
        "cont": (False, True),
        "trans": (False, True),
        "dtype": (torch.float, ),
        "function": ALL_REDUCE_FUNCTIONS,
        "framework": ("Torch", ),  # "NumPy"),
    })

    user_counters = {"shape": 10 * " "}

    def _benchmark(self, state, arg):
        """Run one reduction for the framework selected by ``arg``.

        The reduction is taken along ``arg.dims[1]`` when that is not
        ``None``; otherwise the whole tensor is reduced.
        """
        reduce_dim = arg.dims[1]
        # Compare against None explicitly: dimension 0 is a valid reduction
        # axis but is falsy, so a plain truthiness test would silently turn
        # the (3, 0) grid entry into a full reduction.
        if arg.framework == "Torch":
            torch_fn = getattr(torch, arg.function[0])
            if reduce_dim is not None:
                torch_fn(state.torch_tensor, reduce_dim, out=state.output)
            else:
                torch_fn(state.torch_tensor)
        else:
            numpy_fn = getattr(np, arg.function[1])
            if reduce_dim is not None:
                numpy_fn(state.numpy_tensor, axis=reduce_dim,
                         out=state.output)
            else:
                numpy_fn(state.numpy_tensor)

    def setupRun(self, state, arg):
        """Build the input tensor for one grid entry and do a warm-up run.

        Stores the tensor, its printed shape and a ``None`` output
        placeholder on ``state``.  For the NumPy framework the tensor is
        also exposed as ``state.numpy_tensor`` via ``Tensor.numpy()``.
        """
        size_ = int(math.pow(10, arg.mag))
        tv = make_tensor(size_, arg.dtype, arg.cont, arg.dims[0], arg.trans)
        state.shape = str(tv.size())
        state.torch_tensor = tv
        state.output = None
        if arg.framework == "NumPy":
            if arg.dtype == torch.float:
                state.numpy_tensor = state.torch_tensor.numpy()
                assert state.numpy_tensor.dtype == np.float32
            if arg.dtype == torch.double:
                state.numpy_tensor = state.torch_tensor.numpy()
                assert state.numpy_tensor.dtype == np.float64
        # Warm-up: execute the measured operation once before timing starts.
        self._benchmark(state, arg)

    def benchmark(self, state, arg):
        """Run the measured reduction once."""
        self._benchmark(state, arg)
class CPULSTMBench(Benchmark):
    """Benchmark a single-layer ``nn.LSTM`` forward (and optional backward).

    Every entry of ``sizes`` is ``[N, T, D, H]``: batch size, sentence
    length, embedding size and hidden size.  The grid crosses those sizes
    with ``train`` in ``(True, False)``.
    """

    sizes = [
        [64, 15, 500, 500],
        [64, 20, 500, 500],
        [64, 25, 500, 500],
        [64, 30, 500, 500],
        [64, 35, 500, 500],
        [64, 40, 500, 500],
        [64, 45, 500, 500],
        [64, 50, 500, 500],
        [16, 25, 512, 512],
        [32, 25, 512, 512],
        [64, 25, 512, 512],
        [128, 25, 512, 512],
        [16, 25, 1024, 1024],
        [32, 25, 1024, 1024],
        [64, 25, 1024, 1024],
        [128, 25, 1024, 1024],
        [16, 25, 2048, 2048],
        [32, 25, 2048, 2048],
        [64, 25, 2048, 2048],
        [128, 25, 2048, 2048],
        [16, 25, 4096, 4096],
        [32, 25, 4096, 4096],
        [64, 25, 4096, 4096],
        [128, 25, 4096, 4096],
    ]

    args = utils.grid({"size": sizes, "train": (True, False)})

    user_counters = {
        "duration": 0,
        "gflops": 10 * " ",
        "GFLOPS": 10 * " ",
        "SPS": 10 * " ",
    }

    def setupRun(self, state, arg):
        """Create the LSTM, its inputs and the timing accumulators.

        One forward pass is executed here, before any timed iteration.
        When ``arg.train`` is set, an L1 loss and random targets are
        prepared as well.
        """
        size = arg.size
        N = size[0]  # batch size
        T = size[1]  # sentence length
        D = size[2]  # embedding size
        H = size[3]  # hidden size
        state.N, state.T, state.D, state.H = N, T, D, H

        state.rnn = nn.LSTM(D, H, 1)
        state.input = Variable(torch.randn(T, N, D))
        state.h0 = Variable(torch.randn(1, N, H))
        state.c0 = Variable(torch.randn(1, N, H))
        state.output, state.hn = state.rnn(state.input, (state.h0, state.c0))

        if arg.train:
            state.loss_fn = torch.nn.L1Loss()
            # The loss compares targets with the LSTM output, whose last
            # dimension is the hidden size H, not the input size D.  The two
            # coincide for every entry in ``sizes``, so this is identical
            # for the current grid but stays valid if D != H is ever added.
            state.targets = Variable(torch.randn(T, N, H))

        state.num_iter = 0
        state.elapsed = 0

    def benchmark(self, state, arg):
        """Time one forward pass (plus loss and backward when training)."""
        start = time.time()
        state.output, state.hn = state.rnn(state.input, (state.h0, state.c0))
        if arg.train:
            loss = state.loss_fn(state.output, state.targets)
            loss.backward()
        state.elapsed += time.time() - start
        state.num_iter += 1

    def teardownRun(self, state, arg):
        """Turn the accumulated timings into the formatted user counters."""
        dura = (state.elapsed) / state.num_iter  # time of ONE iteration
        N, T, D, H = state.N, state.T, state.D, state.H
        gflops = T * 4 * (N * H * D * 2 + N * H * H * 2) / 1e9
        GFLOPS = gflops / dura  # giga floating-point operations per second
        SPS = N / dura  # number of processed sentences per second

        state.duration = "{:.4f}".format(dura)
        state.gflops = "{:.4f}".format(gflops)
        state.GFLOPS = "{:.4f}".format(GFLOPS)
        state.SPS = "{:.4f}".format(SPS)
class CPUConvnets(Benchmark):
    """Time forward, backward and optimizer-update phases of convnets.

    The grid pairs each architecture name with an input size
    ``(batch, channels, height, width)`` and crosses that with
    ``single_batch_size`` and ``inference``.  Phase averages are reported
    through the user counters.
    """

    args = utils.grid({
        ("arch", "size"): (
            ("alexnet", (128, 3, 224, 224)),
            ("vgg11", (64, 3, 224, 224)),
            ("inception_v3", (128, 3, 299, 299)),
            ("resnet50", (128, 3, 224, 224)),
            ("squeezenet1_0", (128, 3, 224, 224)),
            ("densenet121", (32, 3, 224, 224)),
            # ("mobilenet_v2", (128, 3, 224, 224)),
        ),
        "single_batch_size": (True, False),
        "inference": (True, False),
    })

    user_counters = dict.fromkeys(
        ("time_fwd_avg", "time_bwd_avg", "time_upt_avg", "time_total"), 0
    )

    def setupRun(self, state, arg):
        """Build the model, optimizer, loss and random inputs for one run.

        The batch dimension collapses to 1 when ``arg.single_batch_size`` is
        set.  The network is switched to eval mode for every configuration,
        including the non-inference ones.
        """
        arch, sizes = arg[("arch", "size")]
        full_batch, channels, height, width = sizes[:4]
        batch_size = 1 if arg.single_batch_size else full_batch

        # Random inputs are drawn before the model is constructed.
        images = torch.randn(batch_size, channels, height, width)
        labels = torch.arange(1, batch_size + 1).long()

        # no need to load pre-trained weights for dummy data
        build_model = models.__dict__[arch]
        state.net = build_model()
        state.optimizer = optim.SGD(state.net.parameters(), lr=0.01)
        state.criterion = nn.CrossEntropyLoss()
        state.net.eval()

        state.data = Variable(images)
        state.target = Variable(labels)

        state.steps = 0
        state.time_fwd = 0
        state.time_bwd = 0
        state.time_upt = 0

    def benchmark(self, state, arg):
        """Run one iteration and accumulate per-phase wall-clock time.

        The forward pass is always timed; loss/backward and the optimizer
        step are executed and timed only when ``arg.inference`` is false.
        """
        training = not arg.inference

        state.optimizer.zero_grad()  # zero the gradient buffers

        fwd_begin = time.time()
        output = state.net(state.data)
        fwd_end = time.time()

        if training:
            loss = state.criterion(output, state.target)
            loss.backward()
            bwd_end = time.time()
            state.optimizer.step()  # Does the update
            upt_end = time.time()

        state.time_fwd += fwd_end - fwd_begin
        if training:
            state.time_bwd += bwd_end - fwd_end
            state.time_upt += upt_end - bwd_end
        state.steps += 1

    def teardownRun(self, state, arg):
        """Convert accumulated seconds into per-step millisecond counters."""
        step_count = state.steps

        def per_step_ms(total_seconds):
            return total_seconds / step_count * 1000

        fwd_ms = per_step_ms(state.time_fwd)
        bwd_ms = per_step_ms(state.time_bwd)
        upt_ms = per_step_ms(state.time_upt)
        # update not included!
        total_ms = fwd_ms + bwd_ms

        fmt = "{:2.3f}"
        state.time_fwd_avg = fmt.format(fwd_ms)
        state.time_bwd_avg = fmt.format(bwd_ms)
        state.time_upt_avg = fmt.format(upt_ms)
        state.time_total = fmt.format(total_ms)