def write_file_groups(testdir, sizes, group_size=None):
    """For each file size (bytes) in `sizes`, write a group of ``nfiles``
    files ``{testdir}/filesize_{size}/file_{idx}; idx=0...nfiles-1``, such
    that each dir ``filesize_{size}`` holds approximately ``group_size``
    bytes in total. If `group_size` is omitted, use
    ``group_size=max(sizes)`` such that the group with the largest file
    size has only one file.

    Returns lists of group dirs and file names.
    """
    if group_size is None:
        group_size = max(sizes)
    else:
        assert group_size >= max(sizes), \
            f"group_size {size2str(group_size)} < {size2str(max(sizes))}"
    group_dirs = []
    files = []
    for _filesize in sizes:
        filesize = int(_filesize)
        filesize_str = size2str(filesize)
        dr = pj(testdir, f'filesize_{filesize_str}')
        group_dirs.append(dr)
        if not os.path.exists(dr):
            os.makedirs(dr, exist_ok=True)
            nfiles = int(group_size) // filesize
            assert nfiles >= 1
            for idx in range(nfiles):
                fn = pj(dr, f'file_{idx}')
                write(fn, filesize)
                files.append(fn)
        else:
            print(f'  dir already present: {dr}')
    return group_dirs, files

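# A minimal usage sketch for write_file_groups (the tmp path and sizes are
# illustrative only, not part of the benchmark suite). With sizes of 1 KiB
# and 4 KiB and the default group_size=max(sizes), we expect four 1 KiB
# files in one group dir and a single 4 KiB file in the other; the exact
# dir names depend on size2str() formatting.
def example_write_file_groups():
    import tempfile
    testdir = tempfile.mkdtemp(prefix='write_file_groups_demo_')
    group_dirs, files = write_file_groups(testdir, [1*KiB, 4*KiB])
    for dr in group_dirs:
        print(dr, len(os.listdir(dr)))
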
def bench_main_blocksize_filesize(tmpdir, maxsize):
    stmt = textwrap.dedent("""
        cfg.update(blocksize={blocksize})
        main.main({files_dirs})
        """)
    params = []

    # single files, test filesize and blocksize
    max_filesize = maxsize
    max_blocksize = min(200*MiB, max_filesize)
    cases = [(np.array([max_filesize]),
              bytes_logspace(10*KiB, max_blocksize, 20),
              'main_blocksize_single'),
             (bytes_linspace(min(1*MiB, max_filesize//2), max_filesize, 5),
              np.array([256*KiB]),
              'main_filesize_single'),
             ]
    for filesize, blocksize, study in cases:
        testdir = mkdtemp(dir=tmpdir, prefix=study + '_')
        files = write_single_files(testdir, filesize)
        this = ps.pgrid(zip(ps.plist('filesize', filesize),
                            ps.plist('filesize_str',
                                     list(map(size2str, filesize))),
                            ps.plist('files_dirs', [[x] for x in files])),
                        ps.plist('study', [study]),
                        ps.plist('maxsize_str', [size2str(maxsize)]),
                        zip(ps.plist('blocksize', blocksize),
                            ps.plist('blocksize_str',
                                     list(map(size2str, blocksize)))))
        params += this

    # file collection, test blocksize only
    study = 'main_blocksize'
    testdir, group_dirs, files = write_collection(maxsize, tmpdir=tmpdir,
                                                  study=study)
    blocksize = bytes_logspace(10*KiB, min(200*MiB, maxsize), 20)
    this = ps.pgrid(ps.plist('files_dirs', [[testdir]]),
                    ps.plist('study', [study]),
                    ps.plist('maxsize_str', [size2str(maxsize)]),
                    zip(ps.plist('blocksize', blocksize),
                        ps.plist('blocksize_str',
                                 list(map(size2str, blocksize)))))
    params += this
    return stmt, params, {}

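# A small illustration (not used by any benchmark) of the psweep pattern
# above: ps.plist() builds a list of single-key dicts, zip() pairs parameters
# one-to-one, and ps.pgrid() takes the Cartesian product over its arguments,
# merging the dicts. The parameter values here are made up.
def example_pgrid_pattern():
    blocksize = ps.plist('blocksize', [1*KiB, 2*KiB])
    blocksize_str = ps.plist('blocksize_str', ['1K', '2K'])
    study = ps.plist('study', ['demo'])
    # zip() keeps blocksize and blocksize_str in lockstep instead of
    # crossing them, so this yields 2 param sets, not 2*2*1 = 4, roughly:
    #   [{'blocksize': 1024, 'blocksize_str': '1K', 'study': 'demo'},
    #    {'blocksize': 2048, 'blocksize_str': '2K', 'study': 'demo'}]
    return ps.pgrid(zip(blocksize, blocksize_str), study)
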
def bench_main_parallel(tmpdir, maxsize):
    stmt = textwrap.dedent("""
        cfg.update(blocksize={blocksize},
                   nthreads={nthreads},
                   nprocs={nprocs},
                   share_leafs={share_leafs})
        main.main({files_dirs})
        """)
    params = []
    study = 'main_parallel'
    testdir, group_dirs, files = write_collection(maxsize, tmpdir=tmpdir,
                                                  study=study)
    blocksize = np.array([256*KiB])
    for share_leafs in [True, False]:
        # vary nthreads, keep one process
        this = ps.pgrid(ps.plist('files_dirs', [[testdir]]),
                        ps.plist('study', [study]),
                        zip(ps.plist('nthreads', range(1, MAXWORKERS+1)),
                            ps.plist('nworkers', range(1, MAXWORKERS+1))),
                        ps.plist('nprocs', [1]),
                        ps.plist('pool_type', ['thread']),
                        ps.plist('maxsize_str', [size2str(maxsize)]),
                        ps.plist('share_leafs', [share_leafs]),
                        zip(ps.plist('blocksize', blocksize),
                            ps.plist('blocksize_str',
                                     list(map(size2str, blocksize)))))
        params += this
        # vary nprocs, keep one thread
        this = ps.pgrid(ps.plist('files_dirs', [[testdir]]),
                        ps.plist('study', [study]),
                        zip(ps.plist('nprocs', range(1, MAXWORKERS+1)),
                            ps.plist('nworkers', range(1, MAXWORKERS+1))),
                        ps.plist('nthreads', [1]),
                        ps.plist('pool_type', ['proc']),
                        ps.plist('maxsize_str', [size2str(maxsize)]),
                        ps.plist('share_leafs', [share_leafs]),
                        zip(ps.plist('blocksize', blocksize),
                            ps.plist('blocksize_str',
                                     list(map(size2str, blocksize)))))
        params += this
    return stmt, params, {}

def __init__(self, path, alias=None, tmpdir='/tmp/findsame_datadir_cache'):
    self.path = path
    self.alias = alias
    # Cache the collected file sizes per scanned path so that repeated runs
    # skip the (slow) filesystem walk.
    cache_fn = os.path.join(tmpdir, path.replace('/', '_')) + '.npy'
    if os.path.exists(cache_fn):
        self.sizes = np.load(cache_fn)
    else:
        self.sizes = collect_file_sizes([self.path])
        os.makedirs(tmpdir, exist_ok=True)
        np.save(cache_fn, self.sizes)
    self.cache_fn = cache_fn
    self.size_str = co.size2str(self.sizes.sum())

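# Sketch of the cache-file naming above (the scanned path is hypothetical):
# '/' is replaced by '_', mapping each scanned tree to one flat .npy file.
#
#   >>> os.path.join('/tmp/findsame_datadir_cache',
#   ...              '/home/user/data'.replace('/', '_')) + '.npy'
#   '/tmp/findsame_datadir_cache/_home_user_data.npy'
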
def bench_hash_file_parallel(tmpdir, maxsize):
    params = []
    study = 'hash_file_parallel'
    testdir, group_dirs, files = write_collection(maxsize, tmpdir=tmpdir,
                                                  study=study)
    pool_map = {'seq': pl.SequentialPoolExecutor,
                'thread': pl.ThreadPoolExecutor,
                'proc': pl.ProcessPoolExecutor,
                'proc,thread=1': lambda nw: pl.ProcessAndThreadPoolExecutor(nw, 1),
                'thread,proc=1': lambda nw: pl.ProcessAndThreadPoolExecutor(1, nw),
                }
    ctx = dict(pool_map=pool_map,
               pl=pl,
               files=files,
               worker=_worker_bench_hash_file_parallel,
               )
    setup = cache_flush_setup
    stmt = textwrap.dedent("""
        with pool_map['{pool_type}']({nworkers}) as pool:
            x = list(pool.map(worker, files))
        """)
    this = ps.pgrid(ps.plist('pool_type',
                             [k for k in pool_map.keys() if k != 'seq']),
                    ps.plist('nworkers', range(1, MAXWORKERS+1)),
                    ps.plist('study', [study]),
                    ps.plist('maxsize_str', [size2str(maxsize)]),
                    )
    params += this
    # non-pool reference
    params += [{'study': study,
                'pool_type': 'seq',
                'nworkers': 1,
                'maxsize_str': size2str(maxsize)}]
    return stmt, params, dict(setup=setup, globals=ctx)

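# A minimal sketch (independent of the benchmark) of the pool API that stmt
# exercises: each pool_map entry is a context manager taking the number of
# workers, whose map() mirrors concurrent.futures. Threads tolerate lambdas;
# the process pools need a picklable, module-level worker such as
# _worker_bench_hash_file_parallel.
def example_pool_usage():
    with pl.ThreadPoolExecutor(2) as pool:
        squares = list(pool.map(lambda x: x*x, range(4)))
    assert squares == [0, 1, 4, 9]
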
def hist(_xlst, bins=100, norm=False, shift_fac=0.8, labels=None, logx=True,
         ax=None, logbase=10, density=False):
    """As in plt.hist, plot one histogram per x in `xlst`, but use x-axis
    log scale if logx=True (plt.hist(..., log=True) applies to y). Optional
    normalization such that the sum of bin areas equals 1. Use step plots
    for each histogram and shift them along y if shift_fac > 0.

    Parameters
    ----------
    xlst : list of 1d arrays

    Returns
    -------
    fig, ax

    Notes
    -----
    When logx=True, we exclude empty files b/c of the log scale. When
    len(xlst) > 1 and shift_fac > 0, histograms are shifted along y for
    better visibility. In that case we turn off the y ticks (the bin
    counts) since they make no sense then.
    """
    xlst = [_xlst] if isinstance(_xlst, np.ndarray) else _xlst
    if labels is not None:
        assert len(xlst) == len(labels)
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()
    lastmax = 0.0
    for ii, xi in enumerate(xlst):
        hh, be = histogram(xi, bins=bins, logx=logx, norm=norm,
                           logbase=logbase, density=density)
        label = None if labels is None else labels[ii]
        ax.step(be[:-1] + 0.5*np.diff(be), hh + lastmax, label=label, lw=2,
                where='mid')
        lastmax += hh.max() * shift_fac
    if logx:
        # current Matplotlib uses base=, the old basex= kwarg was removed
        ax.set_xscale('log', base=logbase)
        ax.set_xticklabels([co.size2str(int(x)) for x in ax.get_xticks()])
    if len(xlst) > 1 and shift_fac > 0:
        ax.set_yticklabels([])
        ax.set_yticks([])
    if labels is not None:
        ax.legend()
    return fig, ax

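# A hedged usage sketch for hist() (the size data is synthetic): two
# log-uniform file size samples, plotted as shifted step histograms with a
# log x-axis labeled in human-readable sizes.
def example_hist():
    rng = np.random.default_rng(0)
    sizes_a = 10**rng.uniform(2, 9, size=1000)   # ~100 B .. ~1 GB
    sizes_b = 10**rng.uniform(3, 7, size=1000)
    fig, ax = hist([sizes_a, sizes_b], labels=['tree A', 'tree B'])
    fig.savefig('/tmp/hist_demo.png')
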
def bench_main_parallel_2d(tmpdir, maxsize):
    stmt = textwrap.dedent("""
        cfg.update(blocksize={blocksize},
                   nthreads={nthreads},
                   nprocs={nprocs})
        main.main({files_dirs})
        """)
    params = []
    study = 'main_parallel_2d'
    testdir, group_dirs, files = write_collection(maxsize, tmpdir=tmpdir,
                                                  study=study)
    blocksize = np.array([256*KiB])
    this = ps.pgrid(ps.plist('files_dirs', [[testdir]]),
                    ps.plist('study', [study]),
                    ps.plist('nthreads', range(1, MAXWORKERS+1)),
                    ps.plist('nprocs', range(1, MAXWORKERS+1)),
                    ps.plist('maxsize_str', [size2str(maxsize)]),
                    zip(ps.plist('blocksize', blocksize),
                        ps.plist('blocksize_str',
                                 list(map(size2str, blocksize)))))
    params += this
    return stmt, params, {}

def test_size_str():
    sizes = [1023, random.randint(1000, 300000000000)]
    for size in sizes:
        assert co.str2size(co.size2str(size, prec=30)) == size
    assert co.size2str(co.str2size('None')) == 'None'
    assert co.str2size(co.size2str(None)) is None

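# The round-trip property tested above, spelled out (the exact string forms
# depend on co.size2str's formatting and are assumptions here):
#
#   >>> co.size2str(1536)    # e.g. '1.5K'
#   >>> co.str2size('1.5K')  # 1536, since 1.5 * 1024 = 1536
#
# High precision (prec=30) keeps the round trip exact even for sizes that
# are not clean multiples of 1024.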