def test_timerit_verbose():
    """
    Check how much text Timerit prints at each verbosity level.

    Verbosity 0 must print nothing; for the other levels the number of
    newlines and the number of times the label appears are pinned.
    """
    from xdoctest.utils import CaptureStdout

    def _captured_output(verbosity):
        # Run a trivial labelled timing and return everything it printed.
        with CaptureStdout() as cap:
            ub.Timerit(3, label='foo', verbose=verbosity).call(lambda: None)
        return cap.text

    assert _captured_output(0) == ''

    # (verbosity, expected newline count, expected label occurrences)
    expectations = [
        (1, 2, 1),
        (2, 3, 2),
        (3, 4, 2),
        (4, 4, 2),
    ]
    for verbosity, n_newlines, n_labels in expectations:
        text = _captured_output(verbosity)
        assert text.count('\n') == n_newlines
        assert text.count('foo') == n_labels
def _benchmark_focal_loss():
    """
    Benchmark ``nll_focal_loss`` against ``F.nll_loss`` on each device.

    First the forward pass, then the forward + backward pass, is timed for
    ``F.nll_loss`` and for ``nll_focal_loss`` with ``focus`` 0 and 2. Timings
    are printed by the ``ub.Timerit`` objects; nothing is returned.

    Note:
        The device list is hard-coded to ``cuda0`` and ``cpu`` and
        ``torch.cuda.synchronize`` is called in every timed block, so this
        presumably requires a CUDA-capable machine -- TODO confirm.
    """
    import ubelt as ub
    import torch.nn.functional as F
    import netharn as nh

    B, C = 16, 37
    DIMS = (128, 128)
    dim = 1
    inputs = torch.rand(B, C, *DIMS)
    inputs.requires_grad = True
    log_probs = F.log_softmax(inputs, dim=dim)
    targets = (torch.rand(B, *DIMS) * C).long()

    ti = ub.Timerit(20, bestof=3, verbose=1, unit='us')

    devices = [
        nh.XPU.coerce('cuda0'),
        nh.XPU.coerce('cpu'),
    ]

    #
    # Forward
    for xpu in devices:
        log_probs = xpu.move(log_probs)
        targets = xpu.move(targets)
        print(' --- FORWARD ---')
        print('\n\n--- xpu = {!r} ---\n'.format(xpu))
        for timer in ti.reset('F.nll_loss'):
            with timer:
                loss1 = F.nll_loss(log_probs, targets, reduction='none')
                torch.cuda.synchronize()
        for timer in ti.reset('nll_focal_loss(focus=0)'):
            with timer:
                loss2 = nll_focal_loss(log_probs, targets, focus=0, dim=dim)
                torch.cuda.synchronize()
        for timer in ti.reset('nll_focal_loss(focus=2)'):
            with timer:
                loss3 = nll_focal_loss(log_probs, targets, focus=2, dim=dim)
                torch.cuda.synchronize()

    #
    # Backward
    ti = ub.Timerit(5, bestof=1, verbose=1, unit='ms')
    log_probs = F.log_softmax(inputs, dim=dim)
    for xpu in devices:
        # Move the operands to the device being timed; otherwise every
        # backward measurement would run wherever the tensors were last
        # placed, regardless of the device printed below.
        dev_log_probs = xpu.move(log_probs)
        dev_targets = xpu.move(targets)
        print(' --- BACKWARD ---')
        print('\n\n--- xpu = {!r} ---\n'.format(xpu))
        for timer in ti.reset('F.nll_loss'):
            with timer:
                loss1 = F.nll_loss(dev_log_probs, dev_targets,
                                   reduction='none')
                # retain_graph lets the same graph be reused on every
                # timing iteration.
                loss1.mean().backward(retain_graph=True)
                torch.cuda.synchronize()
        for timer in ti.reset('nll_focal_loss(focus=0)'):
            with timer:
                loss2 = nll_focal_loss(dev_log_probs, dev_targets,
                                       focus=0.0, dim=dim)
                loss2.mean().backward(retain_graph=True)
                torch.cuda.synchronize()
        for timer in ti.reset('nll_focal_loss(focus=2)'):
            with timer:
                loss3 = nll_focal_loss(dev_log_probs, dev_targets,
                                       focus=2.0, dim=dim)
                loss3.mean().backward(retain_graph=True)
                torch.cuda.synchronize()
def compare_loss_speed():
    """
    Time the netharn region loss (in its 'c' and 'py' iou modes) against the
    lightnet region loss on one random batch, then print the three losses.

    python ~/code/netharn/netharn/models/yolo2/light_region_loss.py compare_loss_speed

    Example:
        >>> compare_loss_speed()
    """
    from netharn.models.yolo2.light_yolo import Yolo
    import netharn.models.yolo2.light_region_loss
    import lightnet.network
    import ubelt as ub
    import netharn as nh

    torch.random.manual_seed(0)
    network = Yolo(num_classes=2, conf_thresh=4e-2)

    netharn_loss = netharn.models.yolo2.light_region_loss.RegionLoss(
        num_classes=network.num_classes, anchors=network.anchors)
    lightnet_loss = lightnet.network.RegionLoss(
        num_classes=network.num_classes, anchors=network.anchors)

    # Win, Hin = 416, 416
    Win, Hin = 96, 96

    # ----- More targets -----
    rng = util.ensure_rng(0)
    bsize = 4

    def _random_item_targets(n_boxes):
        # Draw the integer column first and the boxes second: both consume
        # the same rng, so the draw order determines the generated data.
        first_col = rng.randint(0, network.num_classes, n_boxes)[:, None]
        box_cols = util.Boxes.random(n_boxes, scale=1.0, rng=rng).data
        return torch.FloatTensor(np.hstack([first_col, box_cols]))

    # Make a random semi-realistic set of groundtruth items
    n_targets = [rng.randint(0, 10) for _ in range(bsize)]
    target_list = [_random_item_targets(nT) for nT in n_targets]
    target = nh.data.collate.padded_collate(target_list)

    im_data = torch.randn(len(target), 3, Hin, Win)
    output = network.forward(im_data)

    netharn_loss.iou_mode = 'c'
    for timer in ub.Timerit(100, bestof=10, label='cython_ious'):
        with timer:
            loss_cy = float(netharn_loss(output, target))

    netharn_loss.iou_mode = 'py'
    for timer in ub.Timerit(100, bestof=10, label='python_ious'):
        with timer:
            loss_py = float(netharn_loss(output, target))

    for timer in ub.Timerit(100, bestof=10, label='original'):
        with timer:
            loss_orig = float(lightnet_loss(output, target))

    print('loss_cy = {!r}'.format(loss_cy))
    print('loss_py = {!r}'.format(loss_py))
    print('loss_orig = {!r}'.format(loss_orig))
def test_timerit_default_verbosity():
    """
    With verbosity left at its default, Timerit must print nothing when the
    label is empty and must print something when a label is given.
    """
    from xdoctest.utils import CaptureStdout

    def _run_with_label(label):
        # Time a no-op with the given label and return the captured stdout.
        with CaptureStdout() as cap:
            ub.Timerit(10, label).call(lambda: None)
        return cap.text

    quiet_text = _run_with_label('')
    assert quiet_text == '', 'should be quiet by default when label is not given'

    loud_text = _run_with_label('alabel')
    assert loud_text != '', 'should be verbose by default when label is given'
def benchmark_alphablend_impls():
    """
    Time and profile the alpha-blending implementations in
    ``kwimage.im_alphablend`` on small random float images.

    Ignore:
        from kwimage.imutil.im_alphablend import *
    """
    from kwimage.im_alphablend import (
        overlay_alpha_images,
        _prep_rgb_alpha,
        _alpha_blend_inplace,
        _alpha_blend_simple,
        _alpha_blend_numexpr1,
        _alpha_blend_numexpr2,
    )
    import kwimage
    import xdev
    import ubelt as ub

    H = W = 32
    rng = np.random.RandomState(0)
    dtype = np.float32
    # dtype = np.float64

    # Draw order (rgb1, rgb2, alpha1, alpha2) fixes the generated data.
    rgb1 = rng.rand(H, W, 3).astype(dtype)
    rgb2 = rng.rand(H, W, 3).astype(dtype)
    alpha1 = rng.rand(H, W).astype(dtype)
    alpha2 = rng.rand(H, W).astype(dtype)

    # If num is set too low it may seem like simple beats inplace, but that is
    # actually not the case. inplace is slightly faster as expected.
    ti = ub.Timerit(num=2000, bestof=100, unit='us', verbose=1)
    blend_impls = [
        ('simple', _alpha_blend_simple),
        ('inplace', _alpha_blend_inplace),
        ('numexpr1', _alpha_blend_numexpr1),
        ('numexpr2', _alpha_blend_numexpr2),
    ]
    for label, blend in blend_impls:
        ti.reset(label=label).call(
            lambda: blend(rgb1, alpha1, rgb2, alpha2))

    # It looks like the simple algorithm is winning ATM
    for impl in ['inplace', 'simple']:
        ub.Timerit(label=impl, unit='us').call(
            lambda: overlay_alpha_images(rgb1, rgb2, impl=impl))

    # Profile the high level entry point, then each building block.
    _ = xdev.profile_now(overlay_alpha_images)(rgb1, rgb2, impl='simple')
    _ = xdev.profile_now(overlay_alpha_images)(rgb1, rgb2, impl='inplace')

    _ = xdev.profile_now(kwimage.ensure_float01)(rgb1)
    _ = xdev.profile_now(_prep_rgb_alpha)(rgb1)
    _ = xdev.profile_now(_prep_rgb_alpha)(rgb2)

    blend_args = (rgb1, alpha1, rgb2, alpha2)
    _ = xdev.profile_now(_alpha_blend_simple)(*blend_args)
    _ = xdev.profile_now(_alpha_blend_inplace)(*blend_args)
    _ = xdev.profile_now(_alpha_blend_numexpr1)(*blend_args)
    _  # NOQA
def bench_platform_test():
    """
    Time ``sys.platform == name`` against ``sys.platform.startswith(name)``.

    This is textbook premature optimization, but I was curious

    Results:
        Timed best=477.768 ns, mean=709.491 ± 128.4 ns for == win32
        Timed best=585.802 ns, mean=864.347 ± 191.0 ns for startswith(win32)
        Timed best=494.998 ns, mean=771.782 ± 135.9 ns for == linux
        Timed best=592.787 ns, mean=933.651 ± 177.2 ns for startswith(linux)
    """
    import sys
    import ubelt as ub

    timerit = ub.Timerit(10000, bestof=100, verbose=1, unit='ns')

    # The timed expressions are written out literally (no lambda / helper)
    # so that only the comparison itself is measured.
    for clock in timerit.reset('== win32'):
        with clock:
            sys.platform == 'win32'

    for clock in timerit.reset('startswith(win32)'):
        with clock:
            sys.platform.startswith('win32')

    for clock in timerit.reset('== linux'):
        with clock:
            sys.platform == 'linux'

    for clock in timerit.reset('startswith(linux)'):
        with clock:
            sys.platform.startswith('linux')
def bench_typecheck():
    """
    Time four ways of testing whether an array's ``dtype.kind`` is ``'i'``
    or ``'u'``: membership in a list, a set, a tuple, and two ``==`` tests.
    """
    import numpy as np
    import ubelt as ub

    # One small array per dtype kind exercised by the checks below.
    kind_to_dtype = [
        ('u', np.uint32),
        ('i', np.int32),
        ('f', np.float32),
    ]
    datas = {
        kind: np.random.rand(10).astype(dtype)
        for kind, dtype in kind_to_dtype
    }

    timerit = ub.Timerit(50000, bestof=200, label='time')

    for clock in timerit.reset('in list'):
        with clock:
            for arr in datas.values():
                arr.dtype.kind in ['i', 'u']

    for clock in timerit.reset('in set'):
        with clock:
            for arr in datas.values():
                arr.dtype.kind in {'i', 'u'}

    for clock in timerit.reset('in tuple'):
        with clock:
            for arr in datas.values():
                arr.dtype.kind in ('i', 'u')

    for clock in timerit.reset('two =='):
        with clock:
            for arr in datas.values():
                arr.dtype.kind == 'i' or arr.dtype.kind == 'u'
def bench_imread(fpath=None):
    """
    Benchmark several image-reading backends on a single file.

    Each backend is timed with ``ub.Timerit``. The pixel sum of the last
    image each backend returned is collected in a local dict keyed by the
    backend label; the function itself returns nothing.

    Args:
        fpath (str | None): path of the image to read. When None, the
            kwimage ``'airport'`` test image is used so the benchmark does
            not depend on files that only exist on one developer machine.
            Pass an explicit path (e.g. a ptif or COG file) to benchmark
            other formats.
    """
    import ubelt as ub
    import kwimage
    import cv2
    # skimage may additionally need:
    #     pip install tifffile
    #     pip install imagecodecs
    import skimage.io

    if fpath is None:
        # Other candidates that have been used with this benchmark:
        # fpath = ub.grabdata('http://www.topcoder.com/contest/problem/UrbanMapper3D/JAX_Tile_043_DTM.tif')
        # A color-table geotiff
        # https://download.osgeo.org/geotiff/samples/
        # fpath = ub.grabdata('https://download.osgeo.org/geotiff/samples/usgs/c41078a1.tif')
        fpath = kwimage.grab_test_image_fpath('airport')

    ti = ub.Timerit(100, bestof=5, verbose=2)
    results = {}

    # (label, reader) pairs, benchmarked in this order.
    readers = [
        ('gdal-v1', _read_gdal_v1),
        ('gdal-v2', _read_gdal_v2),
        ('rasterio', _read_rasterio),
        ('skimage', skimage.io.imread),
        ('kwimage', kwimage.imread),
        ('cv2', cv2.imread),
        ('pil', _read_pil),
    ]
    for label, reader in readers:
        for timer in ti.reset(label):
            with timer:
                image = reader(fpath)
        # Record a cheap summary of what the backend produced.
        results[label] = image.sum()
def benchmark_region_loss():
    """
    Time the forward pass of ``light_region_loss.RegionLoss`` on a tiny
    hand-written batch and print the resulting loss.

    CommandLine:
        python ~/code/netharn/netharn/models/yolo2/light_region_loss.py benchmark_region_loss --profile

    Benchmark:
        >>> benchmark_region_loss()
    """
    from netharn.models.yolo2.light_yolo import Yolo
    import ubelt

    torch.random.manual_seed(0)
    network = Yolo(num_classes=2, conf_thresh=4e-2)
    criterion = light_region_loss.RegionLoss(
        num_classes=network.num_classes, anchors=network.anchors)
    Win, Hin = 96, 96

    # true boxes for each item in the batch
    # each box encodes class, center, width, and height
    # coordinates are normalized in the range 0 to 1
    # items in each batch are padded with dummy boxes with class_id=-1
    batch_item1 = [
        [0, 0.50, 0.50, 1.00, 1.00],
        [1, 0.32, 0.42, 0.22, 0.12],
    ]
    # batch item 2 has no objects, note the pad!
    batch_item2 = [
        [-1, 0, 0, 0, 0],
        [-1, 0, 0, 0, 0],
    ]
    target = torch.FloatTensor([batch_item1, batch_item2])

    im_data = torch.randn(len(target), 3, Hin, Win)
    output = network.forward(im_data)

    timerit = ubelt.Timerit(250, bestof=10, label='time')
    for timer in timerit:
        with timer:
            loss = float(criterion.forward(output, target))
    print('loss = {!r}'.format(loss))
def bench_sort_dictionary():
    """
    Time sorting histogram keys by count with ``operator.itemgetter``
    versus an equivalent ``lambda`` key function.

    CommandLine:
        xdoctest -m ~/code/ubelt/dev/bench_dict_hist.py bench_sort_dictionary

    Results:
        Timed best=25.484 µs, mean=25.701 ± 0.1 µs for itemgetter
        Timed best=28.810 µs, mean=29.138 ± 0.3 µs for lambda
    """
    import operator as op
    import random
    import string
    import ubelt as ub

    # Build a character histogram from 5000 seeded random draws.
    rng = random.Random(0)
    hist_ = ub.ddict(lambda: 0)
    for char in [rng.choice(string.printable) for _ in range(5000)]:
        hist_[char] += 1

    timerit = ub.Timerit(1000, bestof=10, verbose=1)

    for clock in timerit.reset('itemgetter'):
        with clock:
            # WINNER
            getval = op.itemgetter(1)
            key_order = [
                key for (key, value) in sorted(hist_.items(), key=getval)
            ]

    for clock in timerit.reset('lambda'):
        with clock:
            key_order = [
                key for (key, value) in
                sorted(hist_.items(), key=lambda x: x[1])
            ]

    del key_order
def bench_isinstance_vs_attr():
    """
    Time ``isinstance`` checks against ``getattr`` lookups for one instance
    of each of ``Base1``, ``Base2`` and ``Derived2``.

    The minimum time of every measurement is collected per instance and
    printed as a pandas table (scaled by 1e9) when pandas is importable.
    """
    import ubelt as ub

    instances = {
        'base1': Base1(),
        'base2': Base2(),
        'derived2': Derived2(),
    }

    timerit = ub.Timerit(100000, bestof=500, verbose=1, unit='us')

    data = ub.AutoDict()

    for selfname, self in instances.items():
        print(ub.color_text('--- SELF = {} ---'.format(selfname), 'blue'))

        minima = {}
        data[selfname] = minima

        for clock in timerit.reset('isinstance(self, Base1)'):
            with clock:
                isinstance(self, Base1)
        minima[timerit.label] = timerit.min()

        for clock in timerit.reset('isinstance(self, Base2)'):
            with clock:
                isinstance(self, Base2)
        minima[timerit.label] = timerit.min()

        for clock in timerit.reset('isinstance(self, Derived2)'):
            with clock:
                isinstance(self, Derived2)
        minima[timerit.label] = timerit.min()

        for clock in timerit.reset('getattr(self, "class_attr1", False)'):
            with clock:
                getattr(self, 'class_attr1', False)
        minima[timerit.label] = timerit.min()

        for clock in timerit.reset('getattr(self, "attr1", False)'):
            with clock:
                getattr(self, 'attr1', False)
        minima[timerit.label] = timerit.min()

    try:
        import pandas as pd
        df = pd.DataFrame(data) * 1e9
        try:
            # Optional pretty printer; fall back to the plain frame.
            from kwil.util.util_pandas import _to_string_monkey
            print(_to_string_monkey(df, key='minima'))
        except Exception:
            print(df)
    except ImportError:
        print('no pandas')
        print(ub.repr2(data, nl=2, precision=4))
def test_disk_io(self):
    """
    Time ``self.write()`` and a full pass over ``self.read()``, then store
    the average write time, average read time and size in MB in the shared
    ``record`` table under ``self.name()`` and print the table.
    """
    key = self.name()
    print('\n# --- {} ---'.format(key))

    write_label = '{} write time'.format(self.ext)
    write_ti = ub.Timerit(N_ITERS, label=write_label)
    for clock in write_ti:
        with clock:
            self.write()

    read_label = '{} read time'.format(self.ext)
    read_ti = ub.Timerit(N_ITERS, label=read_label)
    for clock in read_ti:
        with clock:
            # Exhaust the reader so that every item is actually loaded.
            for _ in self.read():
                pass

    record.loc[key, 'write'] = write_ti.ave_secs
    record.loc[key, 'read'] = read_ti.ave_secs
    record.loc[key, 'MB'] = self.n_bytes() * 1e-6

    print('\n')
    print(record)
    print('-------')
def _benchmark():
    """
    Time sha1, sha256 and sha512 over a range of input sizes.

    Two tables of minimum times (hash algorithm x input size) are printed:
    the first times constructing the hasher and feeding it the data, the
    second times only the ``hexdigest`` call.

    On 64-bit processors sha512 may be faster than sha256

    References:
        https://crypto.stackexchange.com/questions/26336/sha512-faster-than-sha256
    """
    import pandas as pd

    algos = ['sha1', 'sha256', 'sha512']
    sizes = [1, 10, 100, 1000, 10000, 100000]

    # Pass 1: time hasher construction + update.
    result = ub.AutoOrderedDict()
    for n in ub.ProgIter(sizes, desc='time'):
        # The payload only depends on n, so build it once per size.
        data = b'8' * n
        # for key in hashlib.algorithms_guaranteed:
        for key in algos:
            hashtype = _rectify_hasher(key)
            t1 = ub.Timerit(100, bestof=10, label=key, verbose=0)
            for timer in t1:
                with timer:
                    hasher = hashtype()
                    hasher.update(data)
            result[key][n] = t1.min()
    print(pd.DataFrame(result))

    # Pass 2: time only the digest computation.
    result = ub.AutoOrderedDict()
    for n in ub.ProgIter(sizes, desc='time'):
        data = b'8' * n
        # for key in hashlib.algorithms_guaranteed:
        for key in algos:
            hashtype = _rectify_hasher(key)
            t1 = ub.Timerit(100, bestof=10, label=key, verbose=0)
            for timer in t1:
                # Untimed setup: a fresh hasher for every measurement.
                hasher = hashtype()
                hasher.update(data)
                with timer:
                    hasher.hexdigest()
            result[key][n] = t1.min()
    print(pd.DataFrame(result))
def benchmark_write(xxd, proto):
    """
    Time pickling the module-level ``data`` to, and loading it back from,
    a file inside ``dpaths[xxd]`` using pickle protocol ``proto``.

    Args:
        xxd: key into the module-level ``dpaths`` mapping.
        proto: pickle protocol passed to ``pickle.dump``; also used in the
            file name.

    Returns:
        dict: ``proto``, ``xxd``, ``write_time`` and ``read_time``, where
        each time is the ``ellapsed`` attribute of the last timer yielded
        by the corresponding loop.
    """
    fpath = join(dpaths[xxd], 'test_{}.pkl'.format(proto))
    args = {'proto': proto, 'xxd': xxd}

    for clock in ub.Timerit(10, label='save {}'.format(args)):
        # Untimed: create and remove a junk file at the target path so that
        # each timed dump starts with no file present.
        ub.delete(fpath)
        ub.writeto(fpath, 'junkdata')
        ub.delete(fpath)
        with clock, open(fpath, 'wb') as fobj:
            pickle.dump(data, fobj, protocol=proto)

    result = dict(args)
    result['write_time'] = clock.ellapsed

    for clock in ub.Timerit(10, label='read {}'.format(args)):
        with clock, open(fpath, 'rb') as fobj:
            pickle.load(fobj)

    result['read_time'] = clock.ellapsed
    return result
def bench_closures():
    """
    Is it faster to use a closure or pass in the variables explicitly?

    Times a nested function that reads its operands from the enclosing
    scope against one that receives them as arguments, then prints the
    Timerit rankings, consistency and the average rank of each variant.

    Command line options (read with ``ub.argval``):
        --s: side length of the square random complex matrices (default 1)
        --verbose: verbosity passed to Timerit (default 1)
    """
    import ubelt as ub
    import numpy as np

    # Test a nested func with vs without a closure
    def rand_complex(*shape):
        # ``np.complex`` was a deprecated alias of the builtin ``complex``
        # and no longer exists in current NumPy; use the builtin directly.
        real = np.random.rand(*shape).astype(complex)
        imag = np.random.rand(*shape).astype(complex) * 1j
        return real + imag

    s = int(ub.argval('--s', default='1'))
    mat1 = rand_complex(s, s)
    mat2 = rand_complex(s, s)
    N = 1000
    offset = 100

    def nested_closure():
        mat3 = mat1 @ mat2
        for i in range(N):
            mat3 += i + offset

    def nested_explicit(mat1, mat2, N, offset):
        mat3 = mat1 @ mat2
        for i in range(N):
            mat3 += i + offset

    ti = ub.Timerit(2 ** 11, bestof=2 ** 8,
                    verbose=int(ub.argval('--verbose', default='1')))

    for timer in ti.reset('nested_explicit'):
        with timer:
            nested_explicit(mat1, mat2, N, offset)

    for timer in ti.reset('nested_closure'):
        with timer:
            nested_closure()

    print('rankings = {}'.format(ub.repr2(ti.rankings, precision=9, nl=2)))
    print('consistency = {}'.format(ub.repr2(ti.consistency, precision=9, nl=2)))

    # For every ranking metric, record the position each label achieved,
    # then average those positions per label.
    positions = ub.ddict(list)
    for label_to_value in ti.rankings.values():
        for pos, label in enumerate(ub.argsort(label_to_value), start=0):
            positions[label].append(pos)
    average_position = ub.map_vals(lambda x: sum(x) / len(x), positions)
    print('average_position = {}'.format(ub.repr2(average_position)))
def bench_dict_hist():
    """
    Time several ways of producing a histogram ordered by count, and of
    producing just the count-ordered keys.

    Every measurement has its own label so that the timings printed by
    Timerit can be told apart and none of them overwrites another.
    """
    import operator as op
    import ubelt as ub
    import random
    import string

    # Build a character histogram from 5000 seeded random draws.
    rng = random.Random(0)
    items = [rng.choice(string.printable) for _ in range(5000)]
    hist_ = ub.ddict(lambda: 0)
    for item in items:
        hist_[item] += 1

    OrderedDict = ub.odict

    ti = ub.Timerit(1000, bestof=10, verbose=2)

    # --- ordered histogram ---
    for timer in ti.reset('dict_subset_iter'):
        with timer:
            getval = op.itemgetter(1)
            key_order = (key for (key, value) in sorted(hist_.items(), key=getval))
            hist = ub.dict_subset(hist_, key_order)

    for timer in ti.reset('dict_subset_list'):
        with timer:
            getval = op.itemgetter(1)
            key_order = [key for (key, value) in sorted(hist_.items(), key=getval)]
            hist = ub.dict_subset(hist_, key_order)

    for timer in ti.reset('direct_itemgetter'):
        with timer:
            # WINNER
            getval = op.itemgetter(1)
            hist = OrderedDict([
                (key, value)
                for (key, value) in sorted(hist_.items(), key=getval)
            ])

    # -----------------
    # --- ordered keys only ---
    for timer in ti.reset('itemgetter'):
        with timer:
            # WINNER
            getval = op.itemgetter(1)
            key_order = [key for (key, value) in sorted(hist_.items(), key=getval)]

    for timer in ti.reset('lambda'):
        with timer:
            key_order = [key for (key, value) in sorted(hist_.items(), key=lambda x: x[1])]

    # The results only exist to be timed.
    del hist
    del key_order
def test_imresize_multi_channel():
    """
    Test that imresize works with multiple channels in various configurations

    Every combination of resize keyword arguments and image variation is
    resized once; the first failure is reported and re-raised.
    """
    image_basis = {
        'dims': [(32, 32), (37, 41), (53, 31)],
        'channels': [None, 1, 3, 4, 20, 1024],
        'dtype': ['uint8', 'float32', 'float64'],
    }
    resize_kw_basis = {
        'dsize': [(10, 10), (60, 60)],
        'interpolation': ['area', 'linear', 'cubic', 'nearest']
    }
    resize_kw_list = list(basis_product(resize_kw_basis))

    success = []
    failures = []

    timerit = ub.Timerit(1, bestof=1, verbose=1)

    for imgkw, img in image_variations(image_basis):
        for resize_kw in resize_kw_list:
            params = {'resize_kw': resize_kw, 'imgkw': imgkw}
            try:
                label = ub.repr2(params, nl=0, nobr=True, si=1, sv=1,
                                 kvsep='=', itemsep='')
                for clock in timerit.reset(label):
                    with clock:
                        kwimage.imresize(img, **resize_kw)
            except Exception:
                failures.append(label)
                print('FAILED = {!r}'.format(label))
                raise
            else:
                success.append(label)

    print('n_pass = {}'.format(len(success)))
    print('n_fail = {}'.format(len(failures)))
    print('failures = {}'.format(ub.repr2(failures, nl=1)))

    rankings_text = ub.repr2(timerit.rankings, nl=2, align=':', precision=6)
    print('ti.rankings = {}'.format(rankings_text))
def benchmark_all_mask_conversions():
    """
    Time ``Mask.toformat`` for every ordered pair of formats listed in
    ``MaskFormat.cannonical``, starting from one random 256 x 256 mask.
    """
    import kwimage
    import ubelt as ub
    from kwimage.structs.mask import MaskFormat  # NOQA

    base_mask = kwimage.Mask.random(shape=(256, 256))
    timerit = ub.Timerit(1000, bestof=50, verbose=1, unit='us')

    for src_format in MaskFormat.cannonical:
        print('--- {} ---'.format(src_format))
        # Convert to the source format once, outside the timed region.
        src_mask = base_mask.toformat(src_format)
        for dst_format in MaskFormat.cannonical:
            label = '{} -> {}'.format(src_format, dst_format)
            for clock in timerit.reset(label):
                with clock:
                    src_mask.toformat(dst_format)
def bench_dict_hist():
    """
    Time three ways of building a histogram ordered by count.

    CommandLine:
        xdoctest -m ~/code/ubelt/dev/bench_dict_hist.py bench_dict_hist

    Results:
        Timed best=48.330 µs, mean=49.437 ± 1.0 µs for dict_subset_iter
        Timed best=59.392 µs, mean=63.395 ± 11.9 µs for dict_subset_list
        Timed best=47.203 µs, mean=47.632 ± 0.2 µs for direct_itemgetter
    """
    import operator as op
    import random
    import string
    import ubelt as ub

    # Build a character histogram from 5000 seeded random draws.
    rng = random.Random(0)
    hist_ = ub.ddict(lambda: 0)
    for char in [rng.choice(string.printable) for _ in range(5000)]:
        hist_[char] += 1

    OrderedDict = ub.odict

    timerit = ub.Timerit(1000, bestof=10, verbose=1)

    for clock in timerit.reset('dict_subset_iter'):
        with clock:
            getval = op.itemgetter(1)
            key_order = (
                key for (key, value) in sorted(hist_.items(), key=getval)
            )
            hist = ub.dict_subset(hist_, key_order)

    for clock in timerit.reset('dict_subset_list'):
        with clock:
            getval = op.itemgetter(1)
            key_order = [
                key for (key, value) in sorted(hist_.items(), key=getval)
            ]
            hist = ub.dict_subset(hist_, key_order)

    for clock in timerit.reset('direct_itemgetter'):
        with clock:
            # WINNER
            getval = op.itemgetter(1)
            hist = OrderedDict([
                (key, value)
                for (key, value) in sorted(hist_.items(), key=getval)
            ])

    del hist
def test_startup_time():
    """
    Time ``<python> <option> -c "import argparse"`` for several hard-coded
    interpreter paths and interpreter options.
    """
    import ubelt as ub

    interpreters = [
        '/usr/bin/python2',
        '/usr/bin/python3',
        # '/home/joncrall/venv3.6/bin/python3',
        '/home/joncrall/.local/conda/envs/py36/bin/python',
    ]
    interpreter_options = ['', '-B', '-S', '-OO']
    script_args = ' -c "import argparse"'

    for exe in interpreters:
        print('\n==========\nexe = {!r}'.format(exe))
        for opts in interpreter_options:
            command = '{} {}{}'.format(exe, opts, script_args)
            timerit = ub.Timerit(50, bestof=5, label=repr(command), verbose=1)
            for clock in timerit:
                with clock:
                    ub.cmd(command)
def main():
    """
    Time ``take`` reached through three different names (a directly
    imported function, ``util_list.take`` and ``ub.take``) and print the
    Timerit rankings, consistency and average rank of each.
    """
    import random
    from math import e
    import ubelt as ub
    from ubelt import util_list
    from ubelt.util_list import take

    #
    # Data
    N = 100
    array = [random.random() for _ in range(N)]
    n_indices = int(N // e)
    indices = [random.randint(0, N - 1) for _ in range(n_indices)]

    timerit = ub.Timerit(2 ** 11, bestof=2 ** 8, verbose=1)

    for clock in timerit.reset('take'):
        with clock:
            list(take(array, indices))

    for clock in timerit.reset('util_list.take'):
        with clock:
            list(util_list.take(array, indices))

    for clock in timerit.reset('ub.take'):
        with clock:
            list(ub.take(array, indices))

    print('---')
    # import pandas as pd
    # df = pd.DataFrame(rankings)
    # print('df =\n{}'.format(df))

    rankings_text = ub.repr2(timerit.rankings, precision=9, nl=2)
    print('rankings = {}'.format(rankings_text))
    consistency_text = ub.repr2(timerit.consistency, precision=9, nl=2)
    print('consistency = {}'.format(consistency_text))

    # Collect the position of every label under each ranking metric, then
    # average the positions per label.
    positions = ub.ddict(list)
    for label_to_value in timerit.rankings.values():
        for pos, label in enumerate(ub.argsort(label_to_value), start=0):
            positions[label].append(pos)
    average_position = ub.map_vals(lambda x: sum(x) / len(x), positions)
    print('average_position = {}'.format(ub.repr2(average_position)))
def fastfill_multipolygon():
    """
    Time three ways of rasterizing a random ``kwimage.MultiPolygon``:
    ``draw_on``, filling each polygon individually, and ``to_mask``.
    The mask left by the per-polygon fill is displayed at the end.
    """
    kwplot.autompl()

    shape = (1208, 1208)
    multipoly = kwimage.MultiPolygon.random(10).scale(shape)

    timerit = ub.Timerit(3, bestof=1, verbose=2, unit='us')

    for clock in timerit.reset('draw_on'):
        with clock:
            mask = np.zeros(shape, dtype=np.uint8)
            mask = multipoly.draw_on(mask)

    for clock in timerit.reset('custom'):
        with clock:
            mask = np.zeros(shape, dtype=np.uint8)
            for poly in multipoly.data:
                if poly is None:
                    continue
                poly.fill(mask, value=255)

    for clock in timerit.reset('to_mask'):
        with clock:
            multipoly.to_mask(shape)

    kwplot.imshow(mask)
def bench_memoize():
    """
    Time ubelt's memoized function, method and property decorators against
    undecorated equivalents that each return a fresh ``object()``.
    """
    import ubelt as ub

    def raw_func():
        return object()

    @ub.memoize
    def memoized_func():
        return object()

    class Foo(object):
        def a_raw_method(self):
            return object()

        @property
        def a_raw_property(self):
            return object()

        @ub.memoize_method
        def a_memoized_method(self):
            return object()

        @ub.memoize_property
        def a_memoized_property(self):
            return object()

    foo = Foo()

    timerit = ub.Timerit(1000, bestof=100, verbose=1, unit='ns')

    timerit.reset('memoized method').call(lambda: foo.a_memoized_method())
    timerit.reset('raw method').call(lambda: foo.a_raw_method())

    timerit.reset('memoized func').call(lambda: memoized_func())
    timerit.reset('raw func').call(lambda: raw_func())

    timerit.reset('memoized property').call(lambda: foo.a_memoized_property)
    timerit.reset('raw property').call(lambda: foo.a_raw_property)
def benchmark_select_rle_conversions():
    """
    Check what is the fastest way to encode an RLE

    Times ``kwimage.encode_run_length`` on the raw mask data and the
    ``to_array_rle`` / ``to_bytes_rle`` methods of a random mask and of its
    ``to_fortran_mask`` copy.
    """
    import kwimage
    import ubelt as ub

    c_mask = kwimage.Mask.random(shape=(256, 256))
    f_mask = c_mask.to_fortran_mask(copy=True)
    img = c_mask.data

    timerit = ub.Timerit(1000, bestof=50, verbose=1)

    # Raw array input
    for clock in timerit.reset('img -> encode_run_length(non-binary)'):
        with clock:
            kwimage.encode_run_length(img, binary=False)

    for clock in timerit.reset('img -> encode_run_length(binary)'):
        with clock:
            kwimage.encode_run_length(img, binary=True)

    # Mask object input
    for clock in timerit.reset('c_mask -> to_array_rle'):
        with clock:
            c_mask.to_array_rle()

    for clock in timerit.reset('c_mask -> to_bytes_rle'):
        with clock:
            c_mask.to_bytes_rle()

    # Fortran-mask input
    for clock in timerit.reset('f_mask -> to_array_rle'):
        with clock:
            f_mask.to_array_rle()

    for clock in timerit.reset('f_mask -> to_bytes_rle'):
        with clock:
            f_mask.to_bytes_rle()
def time_grid_create_methods():
    """
    Time six numpy recipes (hstack/vstack, tile, repeat, mgrid, meshgrid,
    ogrid + repeat) that each produce a flattened ``ns`` / ``ks`` pair from
    ``arange(N)`` and ``arange(K)``.
    """
    import numpy as np
    import ubelt

    N = 10
    K = 1001

    def _new_timerit(label):
        # Every recipe is measured with the same Timerit settings.
        return ubelt.Timerit(100, bestof=10, label=label)

    for clock in _new_timerit('hv-stack'):
        with clock:
            ns = np.hstack([np.arange(N)[:, None]] * K).ravel()
            ks = np.vstack([np.arange(K)[None, :]] * N).ravel()

    for clock in _new_timerit('tile'):
        with clock:
            ns = np.tile(np.arange(N)[:, None], (1, K)).ravel()
            ks = np.tile(np.arange(K), (N, 1)).ravel()

    for clock in _new_timerit('repeat+arange'):
        with clock:
            ns = np.repeat(np.arange(N), K, axis=0).ravel()
            ks = np.repeat(np.arange(K)[None, :], N, axis=0).ravel()

    for clock in _new_timerit('mgrid'):
        with clock:
            ns, ks = np.mgrid[0:N, 0:K]
            ns = ns.ravel()
            ks = ks.ravel()

    for clock in _new_timerit('meshgrid'):
        with clock:
            ks, ns = np.meshgrid(np.arange(K), np.arange(N))
            ns = ns.ravel()
            ks = ks.ravel()

    for clock in _new_timerit('ogrid+repeat'):
        with clock:
            ns_basis, ks_basis = np.ogrid[0:N, 0:K]
            ns = np.repeat(ns_basis, K, axis=0).ravel()
            ks = np.repeat(ks_basis, N, axis=0).ravel()
def benchmark_hash_data():
    """
    Benchmark ``ub.hash_data`` with several hashers over growing inputs.

    For each input size the mean time per hasher, and its ratio to the
    fastest hasher, is printed. Afterwards a pandas summary of all timings
    and of selected ratios is printed, and the timings are plotted when
    ``--show`` is given.

    Command line options:
        --convert: 'True' or 'False' (case-insensitive, default 'True');
            forwarded to ``ub.hash_data`` as the ``convert`` argument.
        --show: plot the results with kwplot.

    CommandLine:
        python ~/code/ubelt/dev/bench_hash.py --convert=True --show
        python ~/code/ubelt/dev/bench_hash.py --convert=False --show
    """
    import ubelt as ub
    #ITEM = 'JUST A STRING' * 100
    ITEM = [0, 1, 'a', 'b', ['JUST A STRING'] * 4]
    HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3']
    scales = list(range(5, 13))
    results = ub.AutoDict()
    # Use json is faster or at least as fast it most cases
    # xxhash is also significantly faster than sha512
    # The flag text is lower-cased, so it must be compared against the
    # lower-case literal; comparing against 'True' could never match.
    convert = ub.argval('--convert', default='True').lower() == 'true'
    print('convert = {!r}'.format(convert))
    ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms')
    for s in ub.ProgIter(scales, desc='benchmark', verbose=3):
        N = 2 ** s
        print(' --- s={s}, N={N} --- '.format(s=s, N=N))
        data = [ITEM] * N
        for hasher in HASHERS:
            # The yielded timer is deliberately not entered here.
            for timer in ti.reset(hasher):
                ub.hash_data(data, hasher=hasher, convert=convert)
            results[hasher].update({N: ti.mean()})

        # Rank the hashers for this N from fastest to slowest.
        col = {h: results[h][N] for h in HASHERS}
        sortx = ub.argsort(col)
        ranking = ub.dict_subset(col, sortx)
        print('walltime: ' + ub.repr2(ranking, precision=9, nl=0))
        best = next(iter(ranking))
        #pairs = list(ub.iter_window( 2))
        pairs = [(k, best) for k in ranking]
        speed_ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs]
        nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs]
        relratios = ub.odict(zip(nicekeys, speed_ratios))
        print('speedup: ' + ub.repr2(relratios, precision=4, nl=0))

    # xdoc +REQUIRES(--show)
    # import pytest
    # pytest.skip()

    import pandas as pd
    df = pd.DataFrame.from_dict(results)
    df.columns.name = 'hasher'
    df.index.name = 'N'
    # Start from a frame with the same index but no columns.
    ratios = df.copy().drop(columns=df.columns)
    for k1, k2 in [('sha512', 'xxh32'), ('sha1', 'xxh32'), ('xxh64', 'xxh32')]:
        ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2]
    print()
    print('Seconds per iteration')
    print(df.to_string(float_format='%.9f'))
    print()
    print('Ratios of seconds')
    print(ratios.to_string(float_format='%.2f'))
    print()
    print('Average Ratio (over all N)')
    print('convert = {!r}'.format(convert))
    print(ratios.mean().sort_values())

    if ub.argflag('--show'):
        import kwplot
        kwplot.autompl()
        xdata = sorted(ub.peek(results.values()).keys())
        ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results)
        kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds',
                          title='convert = {}'.format(convert))
        kwplot.show_if_requested()
def benchmark_nms_version():
    """
    Time lightnet's ``NonMaxSupression._nms`` against
    ``GetBoundingBoxes._nms`` with ``nms_mode`` 0 through 4 on random boxes
    placed on ``cuda:0``, then print how many boxes each variant kept.

    xdoctest netharn.models.yolo2.light_postproc benchmark_nms_version
    """
    # Build random test boxes and scores
    from lightnet.data.transform._postprocess import NonMaxSupression
    import netharn as nh

    num = 16 * 16 * 5
    rng = nh.util.ensure_rng(0)
    cpu_boxes = nh.util.Boxes.random(num, scale=416.0, rng=rng,
                                     format='tlbr', tensor=True)
    cpu_tlbr = cpu_boxes.to_tlbr().data
    n_boxes = len(cpu_tlbr)
    # cpu_scores = torch.Tensor(rng.rand(len(cpu_tlbr)))
    # make all scores unique to ensure comparability
    cpu_scores = torch.Tensor(np.linspace(0, 1, n_boxes))
    cpu_cls = torch.LongTensor(rng.randint(0, 20, n_boxes))

    # Format boxes in lightnet format
    columns = [
        cpu_boxes.to_cxywh().data,
        cpu_scores[:, None],
        cpu_cls.float()[:, None],
    ]
    cxywh_score_cls = torch.cat(columns, dim=-1)

    gpu = torch.device('cuda', 0)
    gpu_ln_boxes = cxywh_score_cls.to(gpu)

    thresh = .5

    def _ln_output_to_keep(ln_output, ln_boxes):
        # Map every output row back to the index of the single input row
        # it is (numerically) equal to.
        keep = []
        for row in ln_output:
            # Find the index that we kept
            idxs = np.where(np.all(np.isclose(ln_boxes, row), axis=1))[0]
            assert len(idxs) == 1
            keep.append(idxs[0])
        assert np.all(np.isclose(ln_boxes[keep], ln_output))
        return keep

    N = 12
    bestof = 3

    t1 = ub.Timerit(N, bestof=bestof, label='lightnet()')
    for timer in t1:
        with timer:
            ln_output = NonMaxSupression._nms(gpu_ln_boxes, nms_thresh=thresh,
                                              class_nms=True, fast=False)
            torch.cuda.synchronize()
    ln_keep = _ln_output_to_keep(ln_output, gpu_ln_boxes)

    anchors = np.array([(1.3221, 1.73145), (3.19275, 4.00944),
                        (5.05587, 8.09892), (9.47112, 4.84053),
                        (11.2364, 10.0071)])
    postproc = GetBoundingBoxes(anchors=anchors, num_classes=20,
                                conf_thresh=.01, nms_thresh=thresh)

    # Same measurement for every netharn nms mode; keep the indices that
    # the last run of each mode selected.
    nh_keep = {}
    for mode in range(5):
        t1 = ub.Timerit(N, bestof=bestof,
                        label='netharn(mode{})'.format(mode))
        for timer in t1:
            with timer:
                nh_output = postproc._nms(gpu_ln_boxes, nms_mode=mode)
                torch.cuda.synchronize()
        nh_keep[mode] = _ln_output_to_keep(nh_output, gpu_ln_boxes)

    # NOTE(review): these comparisons are evaluated but their results are
    # discarded -- presumably they were meant to be assertions; confirm.
    nh_keep[0] == nh_keep[2]
    nh_keep[0] == nh_keep[3]

    print('ln_keep = {!r}'.format(len(ln_keep)))
    for mode in range(5):
        print('len(nh_keep_{}) = {!r}'.format(mode, len(nh_keep[mode])))
def _benchmark():
    """
    Benchmark several non-max-suppression implementations against each other.

    For every box count in ``xdata`` a set of random boxes is generated and
    each enabled implementation is timed with ``ub.Timerit``:

        * with CUDA available: ``torch(gpu)``, ``cython(gpu)`` and
          ``lightnet-slow(gpu)``
        * without CUDA: ``torch(cpu)``
        * always: ``cython(cpu)`` and ``numpy(cpu)``

    The kept indices of every implementation are checked against the IoU
    threshold and compared pairwise (Jaccard index). Finally the best time of
    each implementation is plotted against the number of boxes.

    CommandLine:
        python -m netharn.util.nms.torch_nms _benchmark --show

    SeeAlso:
        PJR Darknet NonMax supression
        https://github.com/pjreddie/darknet/blob/master/src/box.c

        Lightnet NMS
        https://gitlab.com/EAVISE/lightnet/blob/master/lightnet/data/transform/_postprocess.py#L116
    """
    import torch
    import numpy as np
    import netharn as nh
    from netharn.util.nms.torch_nms import torch_nms
    from netharn.util import non_max_supression
    import ubelt as ub
    import itertools as it

    N = 100
    bestof = 10

    # Maps a timer label to the list of best times, one entry per box count
    ydata = ub.ddict(list)

    # Box counts to benchmark. (19 * 19 * 5 is the max number of boxes yolo
    # will spit out at a time, should a full-size run be wanted.)
    xdata = [10, 100, 500]

    rng = nh.util.ensure_rng(0)

    thresh = 0.5

    # None of the following depends on the number of boxes, so it is set up
    # once instead of on every iteration.
    gpu = torch.device('cuda', 0)
    measure_gpu = torch.cuda.is_available()
    measure_cpu = False or not measure_gpu
    measure_lightnet_fast = False  # developer toggle, off by default

    if measure_gpu:
        from lightnet.data.transform._postprocess import NonMaxSupression

    def _ln_output_to_keep(ln_output, ln_boxes):
        """
        Convert NMS output rows into indices of the matching input rows.

        Each output row must match exactly one row of ``ln_boxes``.
        """
        # NOTE(review): callers pass CUDA tensors here; confirm np.isclose
        # accepts them (or that the inputs are moved to the CPU first).
        keep = []
        for row in ln_output:
            # Find the index that we kept
            idxs = np.where(np.all(np.isclose(ln_boxes, row), axis=1))[0]
            assert len(idxs) == 1
            keep.append(idxs[0])
        assert np.all(np.isclose(ln_boxes[keep], ln_output))
        return keep

    for num in xdata:
        print('\n\n---- number of boxes = {} ----\n'.format(num))

        # Maps a timer label to the indices that implementation kept
        outputs = {}

        # Build random test boxes and scores
        cpu_boxes = nh.util.Boxes.random(num, scale=10.0, rng=rng,
                                         format='tlbr', tensor=True)
        cpu_tlbr = cpu_boxes.to_tlbr().data
        # make all scores unique to ensure comparability
        cpu_scores = torch.Tensor(np.linspace(0, 1, len(cpu_tlbr)))
        cpu_cls = torch.LongTensor(rng.randint(0, 10, len(cpu_tlbr)))

        # Format boxes in lightnet format
        cpu_ln_boxes = torch.cat([
            cpu_boxes.to_cxywh().data,
            cpu_scores[:, None],
            cpu_cls.float()[:, None]
        ], dim=-1)

        # Move boxes to numpy
        np_tlbr = cpu_tlbr.numpy()
        np_scores = cpu_scores.numpy()

        if measure_gpu:
            # Move boxes to the GPU
            gpu_tlbr = cpu_tlbr.to(gpu)
            gpu_scores = cpu_scores.to(gpu)
            gpu_ln_boxes = cpu_ln_boxes.to(gpu)

            t1 = ub.Timerit(N, bestof=bestof, label='torch(gpu)')
            for timer in t1:
                with timer:
                    keep = torch_nms(gpu_tlbr, gpu_scores, thresh=thresh)
                    torch.cuda.synchronize()
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = np.where(keep.cpu().numpy())[0]

            t1 = ub.Timerit(N, bestof=bestof, label='cython(gpu)')
            for timer in t1:
                with timer:
                    keep = non_max_supression(np_tlbr, np_scores,
                                              thresh=thresh, impl='gpu')
                    torch.cuda.synchronize()
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = sorted(keep)

            t1 = ub.Timerit(N, bestof=bestof, label='lightnet-slow(gpu)')
            for timer in t1:
                with timer:
                    ln_output = NonMaxSupression._nms(
                        gpu_ln_boxes, nms_thresh=thresh, class_nms=False,
                        fast=False)
                    torch.cuda.synchronize()
            # convert lightnet NMS output to keep for consistency
            keep = _ln_output_to_keep(ln_output, gpu_ln_boxes)
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = sorted(keep)

            if measure_lightnet_fast:
                t1 = ub.Timerit(N, bestof=bestof, label='lightnet-fast(gpu)')
                for timer in t1:
                    with timer:
                        ln_output = NonMaxSupression._nms(
                            gpu_ln_boxes, nms_thresh=thresh, class_nms=False,
                            fast=True)
                        torch.cuda.synchronize()
                # convert lightnet NMS output to keep for consistency
                keep = _ln_output_to_keep(ln_output, gpu_ln_boxes)
                ydata[t1.label].append(t1.min())
                outputs[t1.label] = sorted(keep)

        if measure_cpu:
            t1 = ub.Timerit(N, bestof=bestof, label='torch(cpu)')
            for timer in t1:
                with timer:
                    keep = torch_nms(cpu_tlbr, cpu_scores, thresh=thresh)
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = np.where(keep.cpu().numpy())[0]

        # The cython and numpy CPU implementations are always measured
        t1 = ub.Timerit(N, bestof=bestof, label='cython(cpu)')
        for timer in t1:
            with timer:
                keep = non_max_supression(np_tlbr, np_scores, thresh=thresh,
                                          impl='cpu')
        ydata[t1.label].append(t1.min())
        outputs[t1.label] = sorted(keep)

        t1 = ub.Timerit(N, bestof=bestof, label='numpy(cpu)')
        for timer in t1:
            with timer:
                keep = non_max_supression(np_tlbr, np_scores, thresh=thresh,
                                          impl='py')
        ydata[t1.label].append(t1.min())
        outputs[t1.label] = sorted(keep)

        # Check that all kept boxes do not have more than `threshold` ious
        for key, idxs in outputs.items():
            ious = nh.util.box_ious(np_tlbr[idxs], np_tlbr[idxs])
            # Only the lower triangle is inspected, and the identity is
            # subtracted so a box's overlap with itself is not counted.
            max_iou = (np.tril(ious) - np.eye(len(ious))).max()
            if max_iou > thresh:
                print('{} produced a bad result with max_iou={}'.format(
                    key, max_iou))

        # Check result consistency:
        print('\nResult stats:')
        for key in sorted(outputs.keys()):
            print('    * {:<20}: num={}'.format(key, len(outputs[key])))

        print('\nResult overlaps (method1, method2: jaccard):')
        datas = []
        for k1, k2 in it.combinations(sorted(outputs.keys()), 2):
            idxs1 = set(outputs[k1])
            idxs2 = set(outputs[k2])
            jaccard = len(idxs1 & idxs2) / len(idxs1 | idxs2)
            datas.append((k1, k2, jaccard))
        # Most similar pairs first
        datas = sorted(datas, key=lambda x: -x[2])
        for k1, k2, jaccard in datas:
            print('    * {:<20}, {:<20}: {:0.4f}'.format(k1, k2, jaccard))

    nh.util.mplutil.autompl()
    nh.util.mplutil.multi_plot(xdata, ydata, xlabel='num boxes',
                               ylabel='seconds')
    nh.util.show_if_requested()
def benchmark_attribute_access():
    """
    Compare the speed of several attribute access idioms.

    Plain dotted access, ``getattr``, ``hasattr``, three-argument
    ``getattr``, ``__dict__.get`` and try/except around dotted access are
    each timed with ``ub.Timerit`` against four kinds of instances
    (``Simple``, ``Complex``, ``SimpleWithSlots``, ``ComplexWithSlots``).

    The minimum time of every idiom is stored in a nested mapping keyed by
    instance kind and then by timer label. The mapping is printed as a pandas
    table (scaled by 1e9) when pandas can be imported, otherwise as a plain
    nested dict.
    """
    import ubelt as ub

    subjects = {
        'simple': Simple(),
        'complex': Complex(),
        'slot_simple': SimpleWithSlots(),
        'slot_complex': ComplexWithSlots(),
    }

    timerit = ub.Timerit(100000, bestof=500, verbose=1, unit='us')

    data = ub.AutoDict()
    for kind, obj in subjects.items():
        print(ub.color_text('--- SELF = {} ---'.format(kind), 'blue'))
        minima = {}
        data[kind] = minima

        # The __dict__ lookup is skipped for kinds named with 'slot'
        # (presumably those classes define __slots__ and have no __dict__).
        use_instance_dict = 'slot' not in kind.lower()

        for timer in timerit.reset('self.attr1'):
            with timer:
                obj.attr1
        minima[timerit.label] = timerit.min()

        for timer in timerit.reset('getattr(self, attr1)'):
            with timer:
                getattr(obj, 'attr1')
        minima[timerit.label] = timerit.min()

        for attrname in ('attr1', 'attr2'):
            label = 'hasattr(self, {})'.format(attrname)
            for timer in timerit.reset(label):
                with timer:
                    hasattr(obj, attrname)
            minima[timerit.label] = timerit.min()

            label = 'getattr(self, {}, None)'.format(attrname)
            for timer in timerit.reset(label):
                with timer:
                    getattr(obj, attrname, None)
            minima[timerit.label] = timerit.min()

            if use_instance_dict:
                label = 'self.__dict__.get({}, None)'.format(attrname)
                for timer in timerit.reset(label):
                    with timer:
                        obj.__dict__.get(attrname, None)
                minima[timerit.label] = timerit.min()

        for timer in timerit.reset('try/except: self.attr2'):
            with timer:
                try:
                    value = obj.attr2
                except AttributeError:
                    value = None
        minima[timerit.label] = timerit.min()

        for timer in timerit.reset('try/except: self.attr1'):
            with timer:
                try:
                    value = obj.attr1
                except AttributeError:
                    value = None
        minima[timerit.label] = timerit.min()

        # The fetched value itself is of no interest, only the timing
        del value

    try:
        import pandas as pd
        table = pd.DataFrame(data) * 1e9
        try:
            from kwil.util.util_pandas import _to_string_monkey
            print(_to_string_monkey(table, key='minima'))
        except Exception:
            # Fall back to the default DataFrame rendering
            print(table)
    except ImportError:
        print('no pandas')
        print(ub.repr2(data, nl=2, precision=4))
def _benchmark_cog_conversions():
    """
    Benchmark the available ways of writing a cloud optimized geotiff (COG).

    A random ``uint16`` image is written to a cache directory as a PNG. Each
    conversion routine is then timed with ``ub.Timerit``, and the file it
    produced is checked with ``validate``:

        * ``_imwrite_cloud_optimized_geotiff`` (from in-memory data)
        * ``_api_convert_cloud_optimized_geotiff2`` (from the PNG)
        * ``_api_convert_cloud_optimized_geotiff`` (from the PNG)
        * ``_cli_convert_cloud_optimized_geotiff`` (from the PNG)
        * the ``cog`` executable, only when it is found on the system

    CommandLine:
        xdoctest -m ~/code/ndsampler/ndsampler/utils/util_gdal.py _benchmark_cog_conversions
    """
    # Benchmark
    # xdoc: +REQUIRES(--bench)
    from ndsampler.utils.validate_cog import validate
    import xdev
    import kwimage

    # Prepare test data
    shape = (8000, 8000, 1)
    print('Test data shape = {!r}'.format(shape))
    data = np.random.randint(0, 255, shape, dtype=np.uint16)
    print('Test data size = {}'.format(
        xdev.byte_str(data.size * data.dtype.itemsize)))

    dpath = ub.ensure_app_cache_dir('ndsampler', 'cog_benchmark')
    src_fpath = join(dpath, 'src.png')
    kwimage.imwrite(src_fpath, data)

    # Benchmark conversions
    dst_api_fpath = join(dpath, 'dst_api.tiff')
    dst_cli_fpath = join(dpath, 'dst_cli.tiff')
    dst_data_fpath = join(dpath, 'dst_data.tiff')

    ti = ub.Timerit(3, bestof=3, verbose=3, unit='s')

    compress = 'DEFLATE'  # alternative previously tried here: 'RAW'
    blocksize = 256

    # In every benchmark the destination is deleted outside of the timed
    # region, so each run measures writing a fresh file.
    for timer in ti.reset('cov-convert-data'):
        ub.delete(dst_data_fpath)
        with timer:
            _imwrite_cloud_optimized_geotiff(dst_data_fpath, data,
                                             compress=compress,
                                             blocksize=blocksize)
    assert not len(validate(dst_data_fpath)[1])

    for timer in ti.reset('cog-convert-api2'):
        ub.delete(dst_api_fpath)
        with timer:
            _api_convert_cloud_optimized_geotiff2(src_fpath, dst_api_fpath,
                                                  compress=compress,
                                                  blocksize=blocksize)
    assert not len(validate(dst_api_fpath)[1])

    for timer in ti.reset('cog-convert-api'):
        ub.delete(dst_api_fpath)
        with timer:
            _api_convert_cloud_optimized_geotiff(src_fpath, dst_api_fpath,
                                                 compress=compress,
                                                 blocksize=blocksize)
    assert not len(validate(dst_api_fpath)[1])

    for timer in ti.reset('cog-convert-cli'):
        ub.delete(dst_cli_fpath)
        with timer:
            _cli_convert_cloud_optimized_geotiff(src_fpath, dst_cli_fpath,
                                                 compress=compress,
                                                 blocksize=blocksize)
    # Validate the file this benchmark wrote (dst_cli_fpath); checking
    # dst_data_fpath here would only re-validate the first benchmark.
    assert not len(validate(dst_cli_fpath)[1])

    if ub.find_exe('cog'):
        # requires pip install cogeotiff
        for timer in ti.reset('cogeotiff cli --compress {}'.format(compress)):
            ub.delete(dst_cli_fpath)
            with timer:
                info = ub.cmd(
                    'cog create {} {} --compress {} --block-size {}'.format(
                        src_fpath, dst_cli_fpath, compress, blocksize),
                    verbose=0)
            assert info['ret'] == 0
        # As above: validate the file the cog executable produced
        assert not len(validate(dst_cli_fpath)[1])