def test_force_release_writer_lock_works(managed_tmpdir):
    # Force-releasing a writer lock held by a "different" process should
    # succeed while warning the caller via ResourceWarning.
    repo = Repository(path=managed_tmpdir, exists=False)
    repo.init(user_name='tester', user_email='*****@*****.**', remove_old=True)
    co = repo.checkout(write=True)
    co.metadata['hello'] = 'world'
    # try to release the writer lock with a process which has different uid
    with pytest.warns(ResourceWarning):
        repo.force_release_writer_lock()
    # NOTE(review): this bare comparison has no effect — presumably it was
    # intended as an `assert`; confirm the expected post-release lock value
    # before converting it.
    co._writer_lock == 'LOCK_AVAILABLE'
    co.close()  # replace, but rest of object is closed
    repo._env._close_environments()
def test_push_clone_digests_exceeding_server_nbyte_limit(
        server_instance, repo, managed_tmpdir):
    """Push/fetch must still round-trip data whose total size exceeds the
    configured per-transfer nbyte limits (forcing chunked transfers).
    """
    from hangar.remote import config
    from hangar import Repository

    # shrink both limits so the sample payloads below exceed them
    config.config['server']['grpc']['fetch_max_nbytes'] = 100_000
    config.config['client']['grpc']['push_max_nbytes'] = 100_000

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.arraysets.init_arrayset(name='aset', shape=(50, 20), dtype=np.float32)
    for cIdx in range(4):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.arraysets['aset'] as d:
            # drop every sample but the first so each commit rewrites data
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
            for sIdx in range(70):
                arr = np.random.randn(50, 20).astype(np.float32)
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()
    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance,
                  remove_old=True)
    assert newRepo.list_branches() == ['master', 'origin/master']
    for cmt, sampList in masterCmtList:
        # fetch each commit's data individually and verify exact round-trip
        newRepo.remote.fetch_data('origin', commit=cmt)
        nco = newRepo.checkout(commit=cmt)
        assert len(nco.arraysets) == 1
        assert 'aset' in nco.arraysets
        assert len(nco.arraysets['aset']) == 70
        for sIdx, samp in enumerate(sampList):
            assert np.allclose(nco.arraysets['aset'][str(sIdx)], samp)
        nco.close()
    newRepo._env._close_environments()
def export_data(ctx, repo: Repository, column, outdir, startpoint, sample, format_, plugin):
    """Export COLUMN sample data as it existed a STARTPOINT to some format and path.

    Specifying which sample to be exported is possible by using the switch
    ``--sample`` (without this, all the samples in the given column will be
    exported). Since hangar supports both int and str datatype for the sample
    name, specifying that while mentioning the sample name might be necessary
    at times. It is possible to do that by separating the name and type by a
    colon.

    Example:

        1. if the sample name is string of numeric 10 - ``str:10`` or ``10``
        2. if the sample name is ``sample1`` - ``str:sample1`` or ``sample1``
        3. if the sample name is an int, let say 10 - ``int:10``
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit, get_staging_branch_head
    from hangar import external

    # extra `--key=value` CLI args are forwarded to the save plugin
    kwargs = parse_custom_arguments(ctx.args)
    # resolve STARTPOINT: branch name, else short commit digest, else the
    # head of the current staging branch
    if startpoint in repo.list_branches():
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    elif startpoint:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)
    else:
        branch_name = get_staging_branch_head(repo._env.branchenv)
        base_commit = get_branch_head_commit(repo._env.branchenv, branch_name)

    co = repo.checkout(commit=base_commit)
    try:
        aset = co.columns.get(column)
        sampleNames = [sample] if sample is not None else list(aset.keys())
        extension = format_.lstrip('.') if format_ else None
        with aset, click.progressbar(sampleNames) as sNamesBar:
            for sampleN in sNamesBar:
                data = aset[sampleN]
                # encode the key's type into the exported name so that a later
                # import can round-trip int vs str sample names
                formated_sampleN = f'{type(sampleN).__name__}:{sampleN}'
                try:
                    external.save(data, outdir, formated_sampleN, extension, plugin, **kwargs)
                except Exception as e:
                    raise click.ClickException(e)
    except KeyError as e:
        raise click.ClickException(e)
    finally:
        co.close()
def test_local_without_data_fails_no_common(self, written_two_cmt_server_repo, managed_tmpdir):
    """Building a tf dataset over keys whose data was never fetched locally
    must raise KeyError."""
    server, _ = written_two_cmt_server_repo
    clone_pth = pjoin(managed_tmpdir, 'new')
    mkdir(clone_pth)
    cloned = Repository(path=clone_pth, exists=False)
    cloned.clone('name', '[email protected]', server, remove_old=True)
    reader = cloned.checkout()
    column = reader.arraysets['writtenaset']
    with pytest.raises(KeyError):
        make_tf_dataset(column, keys=['1', -1])
    reader.close()
    cloned._env._close_environments()
def test_server_fetch_data_sample(
        self, two_multi_format_repo_class, managed_tmpdir_class, fetchOp,
        column_name, keys, tmp_path_factory
):
    """Partial fetch of individual samples (by branch or by commit) makes
    exactly the requested keys readable in a fresh clone.
    """
    from hangar import Repository
    cmt, server_instance = two_multi_format_repo_class

    # Clone test (master branch)
    _new_tmpdir = tmp_path_factory.mktemp('newclone', numbered=True)
    new_tmpdir = str(_new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance,
                  remove_old=True)

    # ------------------ format arguments depending on options -----------------
    kwargs = {
        'column': column_name,
        'samples': keys
    }
    if fetchOp == 'branch':
        kwargs['branch'] = 'master'
    elif fetchOp == 'commit':
        kwargs['commit'] = cmt
    else:
        raise ValueError(f'fetchOp unknown: {fetchOp}')

    fetch_commit = newRepo.remote.fetch_data_sample(remote='origin', **kwargs)
    assert fetch_commit == cmt

    co = newRepo.checkout()
    try:
        col = co[column_name]
        if isinstance(keys, (list, tuple)):
            if column_name.endswith('flat'):
                # flat columns: each key addresses a sample directly
                for key in keys:
                    assert col[key] is not None
            else:
                # nested columns: a key may be (sample, subsample), a
                # 1-tuple meaning "all subsamples", or a bare sample key
                for sample in keys:
                    if isinstance(sample, (list, tuple)):
                        if len(sample) == 2:
                            assert col[sample[0]][sample[1]] is not None
                        elif len(sample) == 1:
                            assert col[sample[0]][...] is not None
                    else:
                        assert col[sample][...] is not None
    finally:
        co.close()
        newRepo._env._close_environments()
def test_push_restricted_with_right_username_password( server_instance_push_restricted, repo, managed_tmpdir): from hangar import Repository # Push master branch test masterCmtList = [] co = repo.checkout(write=True) co.add_ndarray_column(name='aset', shape=(50, 20), dtype=np.float32) for cIdx in range(1): if cIdx != 0: co = repo.checkout(write=True) masterSampList = [] with co.columns['aset'] as d: for prevKey in list(d.keys())[1:]: del d[prevKey] for sIdx in range(70): arr = np.random.randn(50, 20).astype(np.float32) d[str(sIdx)] = arr masterSampList.append(arr) cmt = co.commit(f'master commit number: {cIdx}') masterCmtList.append((cmt, masterSampList)) co.close() repo.remote.add('origin', server_instance_push_restricted) push1 = repo.remote.push('origin', 'master', username='******', password='******') assert push1 == 'master' # Clone test (master branch) new_tmpdir = pjoin(managed_tmpdir, 'new') mkdir(new_tmpdir) newRepo = Repository(path=new_tmpdir, exists=False) newRepo.clone('Test User', '*****@*****.**', server_instance_push_restricted, remove_old=True) assert newRepo.list_branches() == ['master', 'origin/master'] for cmt, sampList in masterCmtList: newRepo.remote.fetch_data('origin', commit=cmt) nco = newRepo.checkout(commit=cmt) assert len(nco.columns) == 1 assert 'aset' in nco.columns assert len(nco.columns['aset']) == 70 for sIdx, samp in enumerate(sampList): assert np.allclose(nco.columns['aset'][str(sIdx)], samp) nco.close() newRepo._env._close_environments()
def create_arrayset(repo: Repository, name, dtype, shape, variable_, named):
    """Create an arrayset with NAME and DTYPE of SHAPE.

    The arrayset will be created in the staging area / branch last used by a
    writer-checkout.

    Valid NAMEs contain only ascii letters and [``'.'``, ``'_'``, ``'-'``] (no
    whitespace). The DTYPE must be one of [``'UINT8'``, ``'INT8'``, ``'UINT16'``,
    ``'INT16'``, ``'UINT32'``, ``'INT32'``, ``'UINT64'``, ``'INT64'``,
    ``'FLOAT16'``, ``'FLOAT32'``, ``'FLOAT64'``]. The SHAPE must be the last
    argument(s) specified, where each dimension size is identified by a (space
    seperated) list of numbers.

    Examples:

    To specify, an arrayset for some training images of dtype uint8 and shape
    (256, 256, 3) we should say:

       .. code-block:: console

          $ hangar arrayset create train_images UINT8 256 256 3

    To specify that the samples can be variably shaped (have any dimension
    size up to the maximum SHAPE specified) we would say:

       .. code-block:: console

          $ hangar arrayset create train_images UINT8 256 256 3 --variable-shape

    or equivalently:

       .. code-block:: console

          $ hangar arrayset create --variable-shape train_images UINT8 256 256 3
    """
    try:
        co = repo.checkout(write=True)
        # BUG FIX: np.typeDict was removed in NumPy 1.24; np.dtype(...).type
        # resolves the same scalar type from the lowercase name on all versions.
        aset = co.arraysets.init_arrayset(name=name,
                                          shape=shape,
                                          dtype=np.dtype(dtype.lower()).type,
                                          named_samples=named,
                                          variable_shape=variable_)
        click.echo(f'Initialized Arrayset: {aset.name}')
    except (ValueError, LookupError, PermissionError) as e:
        raise click.ClickException(e)
    finally:
        # `co` is unbound when the checkout itself failed
        try:
            co.close()
        except NameError:
            pass
def test_push_fetch_records(server_instance, backend):
    """Pushing two branches (with data + metadata) through the CLI exits 0."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        repo = Repository(getcwd(), exists=False)
        repo.init('foo', 'bar')
        dummyData = np.arange(50)
        co1 = repo.checkout(write=True, branch='master')
        co1.arraysets.init_arrayset(
            name='dummy', prototype=dummyData, named_samples=True, backend=backend)
        for idx in range(10):
            # NOTE: dummyData is mutated in place; the backend stores a copy
            # per sample, so each key holds a distinct constant-valued array
            dummyData[:] = idx
            co1.arraysets['dummy'][str(idx)] = dummyData
        co1.metadata['hello'] = 'world'
        co1.metadata['somemetadatakey'] = 'somemetadatavalue'
        cmt1 = co1.commit('first commit adding dummy data and hello meta')
        co1.close()

        # non-conflicting additions on a second branch
        repo.create_branch('testbranch')
        co2 = repo.checkout(write=True, branch='testbranch')
        for idx in range(10, 20):
            dummyData[:] = idx
            co2.arraysets['dummy'][str(idx)] = dummyData
        co2.metadata['foo'] = 'bar'
        cmt2 = co2.commit(
            'first commit on test branch adding non-conflict data and meta')
        co2.close()

        repo.remote.add('origin', server_instance)
        res = runner.invoke(cli.push, ['origin', 'master'], obj=repo)
        assert res.exit_code == 0
        res = runner.invoke(cli.push, ['origin', 'testbranch'], obj=repo)
        assert res.exit_code == 0
def import_data(ctx, repo: Repository, column, path, branch, plugin, overwrite): """Import file or directory of files at PATH to COLUMN in the staging area. If passing in a directory, all files in the directory will be imported, if passing in a file, just that files specified will be imported """ # TODO: ignore warning through env variable from types import GeneratorType from hangar import external from hangar.records.heads import get_staging_branch_head kwargs = parse_custom_arguments(ctx.args) if branch is None: branch = get_staging_branch_head(repo._env.branchenv) elif branch not in repo.list_branches(): raise click.ClickException( f'Branch name: {branch} does not exist, Exiting.') click.echo(f'Writing to branch: {branch}') co = repo.checkout(write=True, branch=branch) try: active_aset = co.columns.get(column) p = Path(path) files = [f.resolve() for f in p.iterdir()] if p.is_dir() else [p.resolve()] with active_aset as aset, click.progressbar(files) as filesBar: for f in filesBar: ext = ''.join(f.suffixes).strip( '.') # multi-suffix files (tar.bz2) loaded = external.load(f, plugin=plugin, extension=ext, **kwargs) if not isinstance(loaded, GeneratorType): loaded = [loaded] for arr, fname in loaded: if (not overwrite) and (fname in aset): continue try: aset[fname] = arr except ValueError as e: click.echo(e) except (ValueError, KeyError) as e: raise click.ClickException(e) finally: co.close()
def remove_column(repo: Repository, name):
    """Delete the column NAME (and all samples) from staging area.

    Operates on the staging area / branch last touched by a writer-checkout.
    """
    co = None
    try:
        co = repo.checkout(write=True)
        deleted = co.columns.delete(name)
        click.echo(f'Successfully removed column: {deleted}')
    except (ValueError, KeyError, PermissionError) as e:
        raise click.ClickException(e)
    finally:
        # checkout may never have been created if acquiring it raised
        if co is not None:
            co.close()
def test_checkout_writer_branch_lock_held_errors(dummy_repo: Repository):
    """CLI `checkout` must fail cleanly while another writer holds the lock."""
    from hangar.records.heads import get_staging_branch_head

    dummy_repo.create_branch('testbranch')
    writer = dummy_repo.checkout(write=True, branch='master')
    try:
        result = CliRunner().invoke(cli.checkout, ['testbranch'], obj=dummy_repo)
        assert result.exit_code == 1
        assert result.stdout.startswith('Error: Cannot acquire the writer lock.') is True
        # the staging head is untouched and the original writer keeps the lock
        assert get_staging_branch_head(dummy_repo._env.branchenv) == 'master'
        assert dummy_repo.writer_lock_held is True
        assert writer.branch_name == 'master'
    finally:
        writer.close()
    assert dummy_repo.writer_lock_held is False
class MakeCommit(object):
    # asv-style benchmark: measure the time of a single commit containing
    # many samples and many metadata keys.
    params = [(5_000, 20_000), (5_000, 20_000)]
    param_names = ['num_samples', 'num_metadata']
    processes = 2
    repeat = (2, 4, 20)
    number = 1          # commit can only run once per populated checkout
    warmup_time = 0

    def setup(self, num_samples, num_metadata):
        # fresh repo + write checkout, populated but NOT committed; the
        # timed method performs the commit itself
        self.tmpdir = mkdtemp()
        self.repo = Repository(path=self.tmpdir, exists=False)
        self.repo.init('tester', '*****@*****.**', remove_old=True)
        self.co = self.repo.checkout(write=True)
        arr = np.array([0, ], dtype=np.uint8)
        # the column-creation API changed across hangar versions; fall back
        # through the older signatures so the benchmark runs on any of them
        try:
            aset = self.co.arraysets.init_arrayset(
                'aset', prototype=arr, backend_opts='10')
        except TypeError:
            aset = self.co.arraysets.init_arrayset(
                'aset', prototype=arr, backend='10')
        except AttributeError:
            aset = self.co.add_ndarray_column('aset', prototype=arr, backend='10')
        with aset as cm_aset:
            for i in range(num_samples):
                arr[:] = i % 255
                cm_aset[i] = arr
        with self.co.metadata as cm_meta:
            for i in range(num_metadata):
                cm_meta[i] = f'{i % 500} data'

    def teardown(self, num_samples, num_metadata):
        self.co.close()
        self.repo._env._close_environments()
        rmtree(self.tmpdir)

    def time_commit(self, num_samples, num_metadata):
        self.co.commit('hello')
def import_data(repo: Repository, arrayset, path, branch, plugin, overwrite):
    """Import file(s) at PATH to ARRAYSET in the staging area.

    PATH may be a single file or a directory (every file in the directory is
    imported). Unless ``overwrite`` is set, file names already present in the
    arrayset are skipped.
    """
    from hangar.cli.io import imread
    from hangar.records.heads import get_staging_branch_head
    try:
        # resolve target branch: explicit name must exist, else staging head
        if branch is not None:
            if branch in repo.list_branches():
                branch_name = branch
            else:
                click.echo(f'Branch name: {branch} does not exist, Exiting.')
                return None
        else:
            branch_name = get_staging_branch_head(repo._env.branchenv)
        click.echo(f'Writing to branch: {branch_name}')

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            co = repo.checkout(write=True, branch=branch_name)

        aset = co.arraysets.get(arrayset)
        if os.path.isfile(path):
            fname = os.path.basename(path)
            if not overwrite:
                if fname in aset:
                    click.echo(f'skipping existing name: {fname} as overwrite flag not set')
                    return None
            fNamePth = [(fname, path)]
        else:
            fnames = os.listdir(path)
            if not overwrite:
                fnames = [fname for fname in fnames if fname not in aset]
            fNamePth = [(fname, os.path.join(path, fname)) for fname in fnames]

        with aset as a, click.progressbar(fNamePth) as fnamesBar:
            for fn, fpth in fnamesBar:
                arr = imread(fpth, plugin=plugin)
                try:
                    a[fn] = arr
                except ValueError as e:
                    click.echo(e)
    finally:
        # BUG FIX: `co` is unbound when the branch check returns early (or the
        # checkout raises); the old unconditional close() turned that into a
        # NameError raised from the finally block.
        try:
            co.close()
        except NameError:
            pass
def export_data(repo: Repository, startpoint, arrayset, out, sample, format_, plugin):
    """export ARRAYSET sample data as it existed a STARTPOINT to some format and path.

    STARTPOINT is resolved as a branch name first, otherwise as a (short)
    commit digest. If ``sample`` is falsy, every sample in the arrayset is
    exported into directory ``out``; ``format_`` (if given) becomes the file
    extension.
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit
    from hangar.cli.io import imsave

    if startpoint in repo.list_branches():
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    else:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)

    try:
        co = repo.checkout(write=False, commit=base_commit)
        arrayset = co.arraysets[arrayset]
        if sample:
            sampleNames = [sample]
        else:
            sampleNames = list(arrayset.keys())
        if format_:
            format_ = format_.lstrip('.')
        outP = os.path.expanduser(os.path.normpath(out))
        with arrayset as aset, click.progressbar(sampleNames) as sNamesBar:
            for sampleN in sNamesBar:
                # only append the extension when the name doesn't already end with it
                if format_:
                    if sampleN.endswith(format_):
                        outFP = os.path.join(outP, f'{sampleN}')
                    else:
                        outFP = os.path.join(outP, f'{sampleN}.{format_}')
                else:
                    outFP = os.path.join(outP, f'{sampleN}')
                try:
                    data = aset[sampleN]
                    imsave(outFP, data)
                except KeyError as e:
                    click.echo(e)
    finally:
        # BUG FIX: `co` is unbound if the checkout raised; the old
        # unconditional close() masked the original error with a NameError.
        try:
            co.close()
        except NameError:
            pass
def test_force_release_writer_lock(managed_tmpdir, monkeypatch):
    # Closing a writer checkout whose lock key has been tampered with must
    # raise RuntimeError rather than silently releasing someone else's lock.
    repo = Repository(path=managed_tmpdir, exists=False)
    repo.init(user_name='tester', user_email='*****@*****.**', remove_old=True)
    co = repo.checkout(write=True)
    orig_lock = str(co._writer_lock)

    def mock_true(*args, **kwargs):
        return True

    # try to release the writer lock with a process which has different uid
    co._writer_lock = 'lololol'
    with pytest.raises(RuntimeError):
        # stub liveness check and column teardown so close() reaches the
        # lock-release path with the corrupted key
        monkeypatch.setattr(co, '_verify_alive', mock_true)
        monkeypatch.setattr(co._columns, '_destruct', mock_true)
        co.close()
    # replace, but rest of object is closed
    monkeypatch.setattr(co, '_writer_lock', orig_lock)
    monkeypatch.delattr(co._columns, '_destruct')
    co.close()
    repo._env._close_environments()
def test_force_release_writer_lock(managed_tmpdir, monkeypatch):
    # NOTE(review): this function has the same name as another
    # `test_force_release_writer_lock` elsewhere in SOURCE — if both live in
    # one module, this definition shadows the earlier one; confirm they come
    # from separate files.
    # Closing a checkout with a corrupted lock key must raise RuntimeError.
    from hangar.records import heads
    repo = Repository(path=managed_tmpdir, exists=False)
    repo.init(user_name='tester', user_email='*****@*****.**', remove_old=True)
    co = repo.checkout(write=True)
    orig_lock = str(co._writer_lock)

    def mock_true(*args, **kwargs):
        return True

    co.metadata['hello'] = 'world'
    # try to release the writer lock with a process which has different uid
    co._writer_lock = 'lololol'
    with pytest.raises(RuntimeError):
        # stub the name-mangled lock-acquire so close() proceeds to the
        # release path with the corrupted key
        monkeypatch.setattr(co, '_WriterCheckout__acquire_writer_lock', mock_true)
        co.close()
    # replace, but rest of object is closed
    monkeypatch.setattr(co, '_writer_lock', orig_lock)
    co.close()
    repo._env._close_environments()
def commit(repo: Repository, message):
    """Commits outstanding changes.

    Commit changes to the given files into the repository. You will need to
    'push' to push up your changes to other repositories.
    """
    from hangar.records.summarize import status
    co = repo.checkout(write=True)
    try:
        if not message:
            # no message supplied: open $EDITOR pre-filled with a commented
            # status summary (same flow as `git commit` without -m)
            diff = co.diff.staged()
            status_txt = status(co._hashenv, co.branch_name, diff.diff)
            status_txt.seek(0)
            marker = '# Changes To Be committed: \n'
            hint = ['\n', '\n', marker, '# \n']
            for line in status_txt.readlines():
                hint.append(f'# {line}')
            # open default system editor
            message = click.edit(''.join(hint))
            if message is None:
                click.echo('Aborted!')
                return
            # keep only what the user typed above the status marker
            msg = message.split(marker)[0].rstrip()
            if not msg:
                click.echo('Aborted! Empty commit message')
                return
        else:
            # NOTE(review): presumably `message` is a tuple of strings (click
            # option with multiple=True); if a plain str were ever passed this
            # join would interleave newlines between characters — confirm.
            msg = '\n'.join(message)
        click.echo('Commit message:\n' + msg)
        try:
            digest = co.commit(msg)
            click.echo(f'Commit Successful. Digest: {digest}')
        except RuntimeError as e:
            raise click.ClickException(e)
    finally:
        co.close()
def view_data(repo: Repository, startpoint, arrayset, sample, plugin):
    """Use a plugin to view the data of some SAMPLE in ARRAYSET at STARTPOINT.

    STARTPOINT is resolved as a branch name first, otherwise as a (short)
    commit digest. A missing sample/arrayset key is reported via echo rather
    than raised.
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit
    from hangar.cli.io import imshow, show

    if startpoint in repo.list_branches():
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    else:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)

    try:
        co = repo.checkout(write=False, commit=base_commit)
        arrayset = co.arraysets[arrayset]
        try:
            data = arrayset[sample]
            imshow(data, plugin=plugin)
            show()
        except KeyError as e:
            click.echo(e)
    finally:
        # BUG FIX: `co` is unbound if the checkout raised; the old
        # unconditional close() masked the original error with a NameError.
        try:
            co.close()
        except NameError:
            pass
def test_server_push_two_branch_then_clone_fetch_data_options(
        server_instance, repo, managed_tmpdir, array5by7, nMasterCommits,
        nMasterSamples, nDevCommits, nDevSamples, fetchBranch, fetchCommit,
        fetchAsetns, fetchNbytes, fetchAll_history):
    """Push master + dev branches, clone, then exercise every combination of
    `fetch_data` options (branch vs commit, arrayset subset, nbyte cap, full
    history) and verify exactly the expected data arrives.
    """
    from hangar import Repository
    from hangar.records.summarize import list_history

    # Push master branch test
    masterCmts = {}
    co = repo.checkout(write=True)
    co.arraysets.init_arrayset(name='writtenaset', shape=(5, 7), dtype=np.float32)
    co.arraysets.init_arrayset(name='_two', shape=(20), dtype=np.float32)
    for cIdx in range(nMasterCommits):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList1 = []
        masterSampList2 = []
        with co.arraysets['writtenaset'] as d, co.arraysets['_two'] as dd:
            # drop all but the first sample so each commit rewrites data
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
                dd.remove(prevKey)
            for sIdx in range(nMasterSamples):
                arr1 = np.random.randn(*array5by7.shape).astype(
                    np.float32) * 100
                d[str(sIdx)] = arr1
                masterSampList1.append(arr1)
                arr2 = np.random.randn(20).astype(np.float32)
                dd[str(sIdx)] = arr2
                masterSampList2.append(arr2)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmts[cmt] = (masterSampList1, masterSampList2)
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'
    masterHist = list_history(repo._env.refenv, repo._env.branchenv,
                              branch_name='master')

    # Push dev branch test
    devCmts = masterCmts.copy()
    branch = repo.create_branch('testbranch')
    for cIdx in range(nDevCommits):
        co = repo.checkout(write=True, branch=branch.name)
        devSampList1 = []
        devSampList2 = []
        with co.arraysets['writtenaset'] as d, co.arraysets['_two'] as dd:
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
                dd.remove(prevKey)
            for sIdx in range(nDevSamples):
                arr1 = np.random.randn(*array5by7.shape).astype(
                    np.float32) * 100
                d[str(sIdx)] = arr1
                devSampList1.append(arr1)
                arr2 = np.random.randn(20).astype(np.float32)
                dd[str(sIdx)] = arr2
                devSampList2.append(arr2)
        cmt = co.commit(f'dev commit number: {cIdx}')
        devCmts[cmt] = (devSampList1, devSampList2)
        co.close()

    push2 = repo.remote.push('origin', branch.name)
    assert push2 == branch.name
    branchHist = list_history(repo._env.refenv, repo._env.branchenv,
                              branch_name=branch.name)

    # -------------------------- end setup ------------------------------------

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance,
                  remove_old=True)
    newRepo.remote.fetch('origin', branch=branch.name)
    newRepo.create_branch('testbranch', base_commit=branchHist['head'])
    assert newRepo.list_branches() == [
        'master', 'origin/master', f'origin/{branch.name}', branch.name
    ]

    # ------------------ format arguments depending on options ----------------
    kwargs = {
        'arrayset_names': fetchAsetns,
        'max_num_bytes': fetchNbytes,
        'retrieve_all_history': fetchAll_history,
    }
    if fetchBranch is not None:
        func = branchHist if fetchBranch == 'testbranch' else masterHist
        kwargs['branch'] = fetchBranch
        kwargs['commit'] = None
    else:
        func = branchHist if fetchBranch == 'br' else masterHist
        kwargs['branch'] = None
        kwargs['commit'] = func['head']

    if fetchAll_history is True:
        commits_to_check = func['order']
    else:
        commits_to_check = [func['head']]

    # ----------------------- retrieve data with desired options --------------

    # This case should fail: an nbyte cap cannot be combined with full history
    if (fetchAll_history is True) and isinstance(fetchNbytes, int):
        try:
            with pytest.raises(ValueError):
                fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
        finally:
            newRepo._env._close_environments()
        return True
    # get data
    fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
    assert commits_to_check == fetch_commits

    # ------------- check that you got everything you expected ----------------
    for fCmt in fetch_commits:
        co = newRepo.checkout(commit=fCmt)
        assert co.commit_hash == fCmt

        # when we are checking one aset only
        if isinstance(fetchAsetns, tuple):
            d = co.arraysets[fetchAsetns[0]]
            # ensure we didn't fetch the other data simultaneously
            ds1SampList, ds2SampList = devCmts[fCmt]
            if fetchAsetns[0] == 'writtenaset':
                compare = ds1SampList
            else:
                compare = ds2SampList

            totalSeen = 0
            for idx, samp in enumerate(compare):
                if fetchNbytes is None:
                    assert np.allclose(samp, d[str(idx)])
                else:
                    # capped fetch: data beyond the cap is simply absent
                    try:
                        arr = d[str(idx)]
                        assert np.allclose(samp, arr)
                        totalSeen += arr.nbytes
                    except FileNotFoundError:
                        pass
                    assert totalSeen <= fetchNbytes

        # compare both asets at the same time
        else:
            d = co.arraysets['writtenaset']
            dd = co.arraysets['_two']
            ds1List, ds2List = devCmts[fCmt]
            totalSeen = 0
            for idx, ds1ds2 in enumerate(zip(ds1List, ds2List)):
                ds1, ds2 = ds1ds2
                if fetchNbytes is None:
                    assert np.allclose(ds1, d[str(idx)])
                    assert np.allclose(ds2, dd[str(idx)])
                else:
                    try:
                        arr1 = d[str(idx)]
                        assert np.allclose(ds1, arr1)
                        totalSeen += arr1.nbytes
                    except FileNotFoundError:
                        pass
                    try:
                        arr2 = dd[str(idx)]
                        assert np.allclose(ds2, arr2)
                        totalSeen += arr2.nbytes
                    except FileNotFoundError:
                        pass
                    assert totalSeen <= fetchNbytes
        co.close()
    newRepo._env._close_environments()
class StockRoom:
    """
    This class is the only user entrypoint of stockroom that interacts with an existing
    stock repository i.e. all the repository interaction a user would do will have to go
    through an object of this class. Also, stockroom comes with three different storages

    1. Model: Weights of models built with ``keras.Model`` or ``torch.nn``
    2. Data: Dataset as numpy arrays/tensors
    3. Experiment: Information related to an experiment such as metrics, parameters etc

    An object of this class holds an object to these three storages each has a dictionary
    style access machinery
    """

    def __init__(self, path: Union[str, Path] = None, enable_write: bool = False):
        self.path = Path(path) if path else get_stock_root(Path.cwd())
        self._repo = Repository(self.path)
        self.head = get_current_head(
            self.path
        )  # TODO: should this be None if writer enabled

        if enable_write:
            self.accessor = self._repo.checkout(write=True)
        else:
            # read-only: no head yet means nothing to check out
            if not self.head:
                self.accessor = None
            else:
                self.accessor = self._repo.checkout(commit=self.head)
        # TODO: Test this extensively
        if self.accessor is not None:
            # keep the checkout's context alive for the object's lifetime;
            # pop_all() transfers ownership so `close()` can unwind it later
            with ExitStack() as stack:
                stack.enter_context(self.accessor)
                self._stack = stack.pop_all()

        self.data = Data(self.accessor)
        self.model = Model(self.accessor)
        self.experiment = Experiment(self.accessor)

    @contextmanager
    def enable_write(self, autocommit=True, commit_msg=None):
        """Temporarily switch to a write-enabled checkout.

        BUG FIX: the previous default ``commit_msg=f"Auto-committing at
        {time.time()}"`` was evaluated once at class-definition time, so every
        auto-commit reused the same stale timestamp. The message is now built
        at call time when the caller does not supply one.
        """
        if commit_msg is None:
            commit_msg = f"Auto-committing at {time.time()}"
        if isinstance(self.accessor, WriterCheckout):
            warnings.warn(
                "Write access is already enabled. Doing nothing!!", UserWarning
            )
            yield
        else:
            self.accessor = self._repo.checkout(write=True)
            self.data = Data(self.accessor)
            self.model = Model(self.accessor)
            self.experiment = Experiment(self.accessor)
            with self.accessor:
                yield
                # commit on exit only if the caller actually changed something
                if autocommit and self.accessor.diff.status() != "CLEAN":
                    self.commit(commit_msg)
            self.accessor.close()
            # TODO: these objects doesn't need to recreate no column creation inside the CM.
            #  Find a way to track that
            self.accessor = self._repo.checkout()
            self.data = Data(self.accessor)
            self.model = Model(self.accessor)
            self.experiment = Experiment(self.accessor)

    def update_head(self):
        """Re-point the read checkout at the latest stock head (no-op for writers)."""
        if self._repo.writer_lock_held:
            logger.info(
                "Write enabled checkouts will always be on the latest head "
                "(staging). Doing nothing"
            )
            return
        self.head = get_current_head(self.path)
        self.accessor.__exit__()
        self.accessor.close()
        self.accessor = self._repo.checkout(commit=self.head).__enter__()

    def close(self):
        """Unwind the held checkout context and close the checkout."""
        self._stack.close()
        self.accessor.close()

    @property
    def stockroot(self) -> Path:
        """
        Returns the root of stock repository
        """
        return self.path

    def commit(self, message: str, update_head=True) -> str:
        """
        Make a stock commit. A stock commit is a hangar commit plus writing the commit
        hash to the stock file. This function opens the stock checkout in write mode and
        close after the commit. Which means, no other write operations should be running
        while stock commit is in progress
        """
        digest = self.accessor.commit(message)
        set_current_head(self.stockroot, digest)
        if update_head:
            self.update_head()
        return digest

    def __getstate__(self):
        # a live writer checkout holds the repo lock and cannot be serialized
        if isinstance(self.accessor, WriterCheckout):
            raise RuntimeError("Write enabled instance is not pickle-able")
        return self.__dict__
if __name__ == '__main__': from argparse import ArgumentParser parser = ArgumentParser() parser.add_argument('--gpus', type=int, default=None) parser.add_argument('--lr', type=float, default=1e-3) parser.add_argument('--batch_size', type=int, default=32) parser.add_argument('--max_epochs', type=int, default=1) parser.add_argument('--max_elems', type=int, default=60000) parser.add_argument('--hangar', action='store_true') args = parser.parse_args() repo = Repository(path=Path(__file__).parent / "hangar") co = repo.checkout() if args.hangar: dataset = make_torch_dataset( [co.columns['digits'], co.columns['label']], index_range=slice(0, args.max_elems)) else: dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor()) print(len(dataset)) datapoint, label = dataset[0] print(type(datapoint), type(label)) print("making a loader!") train_loader = DataLoader(dataset, batch_size=args.batch_size,
def create_column(repo: Repository, name, dtype, shape, variable_, subsamples_):
    """Create an column with NAME and DTYPE of SHAPE.

    The column will be created in the staging area / branch last used by a
    writer-checkout.

    Valid NAMEs contain only ascii letters and [``'.'``, ``'_'``, ``'-'``] (no
    whitespace). The DTYPE must be one of [``'UINT8'``, ``'INT8'``, ``'UINT16'``,
    ``'INT16'``, ``'UINT32'``, ``'INT32'``, ``'UINT64'``, ``'INT64'``,
    ``'FLOAT16'``, ``'FLOAT32'``, ``'FLOAT64'``, ``'STR'``].

    If a ndarray dtype is specified (not 'STR'), then the SHAPE must be the
    last argument(s) specified, where each dimension size is identified by a
    (space seperated) list of numbers.

    Examples:

    To specify, a column for some training images of dtype uint8 and shape
    (256, 256, 3) we should say:

       .. code-block:: console

          $ hangar column create train_images UINT8 256 256 3

    To specify that the samples can be variably shaped (have any dimension
    size up to the maximum SHAPE specified) we would say:

       .. code-block:: console

          $ hangar column create train_images UINT8 256 256 3 --variable-shape

    or equivalently:

       .. code-block:: console

          $ hangar column create --variable-shape train_images UINT8 256 256 3

    To specify that the column contains a nested set of subsample data under a
    common sample key, the ``--contains-subsamples`` flag can be used.

       .. code-block:: console

          $ hangar column create --contains-subsamples train_images UINT8 256 256 3
    """
    try:
        co = repo.checkout(write=True)
        if dtype == 'STR':
            col = co.add_str_column(name=name, contains_subsamples=subsamples_)
        else:
            # BUG FIX: np.typeDict was removed in NumPy 1.24; np.dtype(...).type
            # resolves the same scalar type from the lowercase name on all versions.
            col = co.add_ndarray_column(name=name,
                                        shape=shape,
                                        dtype=np.dtype(dtype.lower()).type,
                                        variable_shape=variable_,
                                        contains_subsamples=subsamples_)
        click.echo(f'Initialized Column: {col.column}')
    except (ValueError, LookupError, PermissionError) as e:
        raise click.ClickException(e)
    finally:
        # `co` is unbound when the checkout itself failed
        try:
            co.close()
        except NameError:
            pass
def test_server_push_two_branch_then_clone_fetch_data_options(
        self, two_branch_multi_commit_repo_class, managed_tmpdir_class,
        array5by7_class, fetchBranch, fetchCommit, fetchAsetns, fetchNbytes,
        fetchAll_history, tmp_path_factory):
    """Clone a two-branch repo and exercise every ``fetch_data`` option combo.

    Parametrized over branch-vs-commit addressing, column subsets, a byte
    budget, and full-history retrieval.  The fixture supplies the pushed
    repo's branch object, per-branch histories, per-commit sample data, and
    the server address.
    """
    from hangar import Repository
    from operator import eq

    branch, branchHist, devCmts, masterHist, server_instance = two_branch_multi_commit_repo_class

    # Clone test (master branch)
    _new_tmpdir = tmp_path_factory.mktemp('newclone', numbered=True)
    new_tmpdir = str(_new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)
    newRepo.remote.fetch('origin', branch=branch.name)
    # Local branch pointing at the fetched dev head so it can be addressed by name.
    newRepo.create_branch('testbranch', base_commit=branchHist['head'])
    assert newRepo.list_branches() == ['master', 'origin/master', f'origin/{branch.name}', branch.name]

    # ------------------ format arguments depending on options -----------------

    kwargs = {
        'column_names': fetchAsetns,
        'max_num_bytes': fetchNbytes,
        'retrieve_all_history': fetchAll_history,
    }
    if fetchBranch is not None:
        # Address the fetch by branch name; `func` selects which history the
        # expected commits are drawn from.
        func = branchHist if fetchBranch == 'testbranch' else masterHist
        kwargs['branch'] = fetchBranch
        kwargs['commit'] = None
    else:
        # Address the fetch by explicit commit hash.
        # NOTE(review): fetchBranch is None on this path, so the 'br'
        # comparison always selects masterHist — confirm that is intended.
        func = branchHist if fetchBranch == 'br' else masterHist
        kwargs['branch'] = None
        kwargs['commit'] = func['head']

    if fetchAll_history is True:
        commits_to_check = func['order']
    else:
        commits_to_check = [func['head']]

    # ----------------------- retrieve data with desired options --------------

    # This case should fail: a byte budget combined with full-history
    # retrieval is rejected by fetch_data with a ValueError.
    if (fetchAll_history is True) and isinstance(fetchNbytes, int):
        try:
            with pytest.raises(ValueError):
                fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
        finally:
            # Environments must close even when the assertion fails.
            newRepo._env._close_environments()
        return True

    # get data
    fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
    assert commits_to_check == fetch_commits

    # ------------- check that you got everything you expected ----------------

    for fCmt in fetch_commits:
        co = newRepo.checkout(commit=fCmt)
        assert co.commit_hash == fCmt

        # when we are checking one aset only
        if isinstance(fetchAsetns, tuple):
            d = co.columns[fetchAsetns[0]]
            # ensure we didn't fetch the other data simultaneously
            ds1SampList, ds2SampList, ds3SampList, ds4SampList = devCmts[fCmt]
            # Pick the expected sample list and the comparison function
            # matching the single requested column.
            if fetchAsetns[0] == 'writtenaset':
                compare = ds1SampList
                cmp_func = np.allclose
            elif fetchAsetns[0] == '_two':
                compare = ds2SampList
                cmp_func = np.allclose
            elif fetchAsetns[0] == 'str_col':
                compare = ds3SampList
                cmp_func = eq
            else:
                compare = ds4SampList
                cmp_func = eq

            totalSeen = 0
            for idx, samp in enumerate(compare):
                if fetchNbytes is None:
                    # No byte budget: every sample must be locally present.
                    assert cmp_func(samp, d[str(idx)])
                else:
                    # With a budget some samples legitimately remain remote
                    # (FileNotFoundError); those fetched must match and the
                    # running total must stay within the budget.
                    try:
                        arr = d[str(idx)]
                        assert cmp_func(samp, arr)
                        try:
                            totalSeen += arr.nbytes
                        except AttributeError:
                            # str/bytes values have no .nbytes attribute.
                            totalSeen += len(arr)
                    except FileNotFoundError:
                        pass
                    assert totalSeen <= fetchNbytes

        # compare both asets at the same time
        else:
            d = co.columns['writtenaset']
            dd = co.columns['_two']
            str_col = co.columns['str_col']
            bytes_col = co.columns['bytes_col']
            ds1List, ds2List, ds3List, ds4List = devCmts[fCmt]

            totalSeen = 0
            for idx, ds1ds2ds3ds4 in enumerate(zip(ds1List, ds2List, ds3List, ds4List)):
                ds1, ds2, ds3, ds4 = ds1ds2ds3ds4
                if fetchNbytes is None:
                    # No byte budget: all four columns fully materialized.
                    assert np.allclose(ds1, d[str(idx)])
                    assert np.allclose(ds2, dd[str(idx)])
                    assert ds3 == str_col[str(idx)]
                    assert ds4 == bytes_col[str(idx)]
                else:
                    # Budgeted fetch: each column value is checked only when
                    # locally available, accumulating its size.
                    try:
                        arr1 = d[str(idx)]
                        assert np.allclose(ds1, arr1)
                        totalSeen += arr1.nbytes
                    except FileNotFoundError:
                        pass
                    try:
                        arr2 = dd[str(idx)]
                        assert np.allclose(ds2, arr2)
                        totalSeen += arr2.nbytes
                    except FileNotFoundError:
                        pass
                    try:
                        sval = str_col[str(idx)]
                        assert ds3 == sval
                        totalSeen += len(sval.encode())
                    except FileNotFoundError:
                        pass
                    try:
                        bval = bytes_col[str(idx)]
                        assert ds4 == bval
                        totalSeen += len(bval)
                    except FileNotFoundError:
                        pass
                    assert totalSeen <= fetchNbytes
        co.close()
    newRepo._env._close_environments()
def test_push_second_branch_with_new_commit_then_clone_partial_fetch(
        server_instance, repo, managed_tmpdir, array5by7, nMasterCommits,
        nMasterSamples, nDevCommits, nDevSamples):
    """Push master plus a dev branch, clone, and verify partial-fetch state.

    A clone retrieves only references, not data: every sample key must be
    present as a remote reference and raise FileNotFoundError on access
    until its data is explicitly fetched.
    """
    from hangar import Repository
    from hangar.records.summarize import list_history

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.add_ndarray_column(name='writtenaset', shape=(5, 7), dtype=np.float32)
    for cIdx in range(nMasterCommits):
        if cIdx != 0:
            # First iteration reuses the checkout created above.
            co = repo.checkout(write=True)
        masterSampList = []
        with co.columns['writtenaset'] as d:
            # Drop all but one key carried over from the previous commit so
            # each commit records a distinct mutation.
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(nMasterSamples):
                arr = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'
    masterHist = list_history(repo._env.refenv, repo._env.branchenv, branch_name='master')

    # Push dev branch test
    devCmtList = []
    branch = repo.create_branch('testbranch')
    for cIdx in range(nDevCommits):
        co = repo.checkout(write=True, branch=branch.name)
        devSampList = []
        with co.columns['writtenaset'] as d:
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(nDevSamples):
                arr = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr
                devSampList.append(arr)
        cmt = co.commit(f'dev commit number: {cIdx}')
        devCmtList.append((cmt, devSampList))
        co.close()

    push2 = repo.remote.push('origin', branch.name)
    assert push2 == branch.name
    branchHist = list_history(repo._env.refenv, repo._env.branchenv, branch_name=branch.name)

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)
    assert newRepo.list_branches() == ['master', 'origin/master']

    for cmt, sampList in masterCmtList:
        # Checking out a commit with unfetched data warns the user.
        with pytest.warns(UserWarning):
            nco = newRepo.checkout(commit=cmt)
        assert len(nco.columns) == 1
        assert 'writtenaset' in nco.columns
        assert len(nco.columns['writtenaset']) == nMasterSamples
        assert nco.columns['writtenaset'].contains_remote_references is True
        remoteKeys = nco.columns['writtenaset'].remote_reference_keys
        assert tuple([str(idx) for idx in range(len(sampList))]) == remoteKeys
        for idx, _ in enumerate(sampList):
            sIdx = str(idx)
            # Key exists as a reference, but its data is not local yet.
            assert sIdx in nco.columns['writtenaset']
            with pytest.raises(FileNotFoundError):
                shouldNotExist = nco.columns['writtenaset'][sIdx]
        nco.close()
    cloneMasterHist = list_history(newRepo._env.refenv, newRepo._env.branchenv, branch_name='master')
    assert cloneMasterHist == masterHist

    # Fetch test
    fetch = newRepo.remote.fetch('origin', branch=branch.name)
    assert fetch == f'origin/{branch.name}'
    assert newRepo.list_branches() == ['master', 'origin/master', f'origin/{branch.name}']
    for cmt, sampList in devCmtList:
        # fetch() transfers refs only, so dev data is still remote here.
        with pytest.warns(UserWarning):
            nco = newRepo.checkout(commit=cmt)
        assert len(nco.columns) == 1
        assert 'writtenaset' in nco.columns
        assert len(nco.columns['writtenaset']) == nDevSamples
        assert nco.columns['writtenaset'].contains_remote_references is True
        remoteKeys = nco.columns['writtenaset'].remote_reference_keys
        assert tuple([str(idx) for idx in range(len(sampList))]) == remoteKeys
        for idx, _ in enumerate(sampList):
            sIdx = str(idx)
            assert sIdx in nco.columns['writtenaset']
            with pytest.raises(FileNotFoundError):
                shouldNotExist = nco.columns['writtenaset'][sIdx]
        nco.close()
    cloneBranchHist = list_history(
        newRepo._env.refenv, newRepo._env.branchenv, branch_name=f'origin/{branch.name}')
    assert cloneBranchHist == branchHist
    newRepo._env._close_environments()
class _WriterSuite:
    """asv benchmark base: time hangar reads/writes for one backend/dtype.

    Subclasses are expected to define ``method`` ('read' or anything else
    for write), ``backend``, ``dtype``, and ``num_samples`` as class
    attributes — see the placeholder comments in :meth:`setup`.
    """

    # asv timing configuration.
    processes = 2
    repeat = 2
    number = 1
    warmup_time = 0

    def setup(self):
        """Create a throwaway repo and populate an 'aset' column of 100x100 arrays."""
        # self.method
        # self.backend
        self.backend_code = {'numpy_10': '10', 'hdf5_00': '00'}
        # self.dtype
        self.type_code = {
            'float32': np.float32,
            'uint16': np.uint16,
        }
        # self.num_samples
        self.tmpdir = mkdtemp()
        self.repo = Repository(path=self.tmpdir, exists=False)
        self.repo.init('tester', '*****@*****.**', remove_old=True)
        co = self.repo.checkout(write=True)

        # Outer product of two Hamming windows -> a smooth 100x100 pattern,
        # rounded and cast to the benchmarked dtype.
        a = np.hamming(100).reshape(100, 1)
        b = np.hamming(100).reshape(1, 100)
        c = np.round(a * b * 1000).astype(self.type_code[self.dtype])
        arr = np.zeros((100, 100), dtype=c.dtype)
        arr[:, :] = c

        # Older hangar versions use `backend_opts`; fall back to `backend`
        # when the installed version rejects the keyword.
        try:
            aset = co.arraysets.init_arrayset(
                'aset', prototype=arr, backend_opts=self.backend_code[self.backend])
        except TypeError:
            aset = co.arraysets.init_arrayset(
                'aset', prototype=arr, backend=self.backend_code[self.backend])
        if self.method == 'read':
            # Pre-populate, commit, and reopen read-only so `read` times
            # retrieval rather than staging.
            with aset as cm_aset:
                for i in range(self.num_samples):
                    arr += 1
                    cm_aset[i] = arr
            co.commit('first commit')
            co.close()
            self.co = self.repo.checkout(write=False)
        else:
            # Write benchmark keeps the writer checkout and source array.
            self.arr = arr
            self.co = co

    def teardown(self):
        """Close the checkout/environments and delete the temp repo."""
        self.co.close()
        self.repo._env._close_environments()
        rmtree(self.tmpdir)

    def read(self):
        """Timed: read every sample of 'aset' inside its context manager."""
        aset = self.co.arraysets['aset']
        # Key listing is hoisted out of the timed context-managed loop.
        ks = list(aset.keys())
        with aset as cm_aset:
            for i in ks:
                arr = cm_aset[i]

    def write(self):
        """Timed: write ``num_samples`` mutated copies of the source array."""
        arr = self.arr
        aset = self.co.arraysets['aset']
        with aset as cm_aset:
            for i in range(self.num_samples):
                # Mutate in place so every stored sample differs.
                arr += 1
                cm_aset[i] = arr

    def size(self):
        """Tracked metric: on-disk size of the repository directory."""
        return folder_size(self.repo._env.repo_path, recurse=True)
class _ReaderSuite:
    """asv benchmark base: time reading 3000 (50, 50, 10) samples per backend.

    ``setup_cache`` builds one shared repository with an identically-filled
    column per backend; ``read`` iterates every sample of the column named
    after the parametrized backend.
    """

    params = ['hdf5_00', 'hdf5_01', 'numpy_10']
    param_names = ['backend']
    processes = 2
    repeat = (2, 4, 30.0)  # repeat == tuple (min_repeat, max_repeat, max_time)
    number = 3
    warmup_time = 0
    timeout = 60

    def setup_cache(self):
        """Build the shared repo once; asv caches the resulting directory."""
        backend_code = {
            'numpy_10': '10',
            'hdf5_00': '00',
            'hdf5_01': '01',
        }
        sample_shape = (50, 50, 10)
        num_samples = 3_000

        repo = Repository(path=os.getcwd(), exists=False)
        repo.init('tester', '*****@*****.**', remove_old=True)
        co = repo.checkout(write=True)

        # One Hamming-window component per axis, each shaped (s,1,1)/(1,s,1)/
        # (1,1,s) so their broadcast product is a smooth full-rank array.
        component_arrays = []
        ndims = len(sample_shape)
        for idx, shape in enumerate(sample_shape):
            layout = [1 for i in range(ndims)]
            layout[idx] = shape
            component = np.hamming(shape).reshape(*layout) * 100
            component_arrays.append(component.astype(np.float32))
        # BUGFIX: np.prod(component_arrays) relied on ragged object-array
        # creation (deprecated, removed in NumPy 1.24 -> ValueError).
        # Multiply the components with explicit broadcasting instead.
        arr = component_arrays[0]
        for component in component_arrays[1:]:
            arr = arr * component
        arr = arr.astype(np.float32)

        # Create one column per backend, tolerating the keyword/API renames
        # across hangar versions (backend_opts -> backend -> add_ndarray_column).
        for backend, code in backend_code.items():
            try:
                co.arraysets.init_arrayset(backend, prototype=arr, backend_opts=code)
            except TypeError:
                try:
                    co.arraysets.init_arrayset(backend, prototype=arr, backend=code)
                except ValueError:
                    pass
            except ValueError:
                pass
            except AttributeError:
                co.add_ndarray_column(backend, prototype=arr, backend=code)

        # Newer hangar exposes `columns`, older exposes `arraysets`.
        try:
            col = co.columns
        except AttributeError:
            col = co.arraysets

        with col as asets_cm:
            for aset in asets_cm.values():
                changer = 0
                for i in range(num_samples):
                    # BUGFIX: wrap the perturbed index into the smallest
                    # dimension; the original unbounded `changer` indexed
                    # out of bounds (IndexError) once it reached 10.
                    arr[changer, changer, changer] += 1
                    aset[i] = arr
                    changer = (changer + 1) % min(sample_shape)
        co.commit('first commit')
        co.close()
        repo._env._close_environments()

    def setup(self, backend):
        """Open the cached repo read-only and grab the backend's column."""
        self.repo = Repository(path=os.getcwd(), exists=True)
        self.co = self.repo.checkout(write=False)
        try:
            try:
                self.aset = self.co.columns[backend]
            except AttributeError:
                self.aset = self.co.arraysets[backend]
        except KeyError:
            # Backend column absent (unsupported by this hangar version):
            # asv treats NotImplementedError as "skip this parameter".
            raise NotImplementedError

    def teardown(self, backend):
        """Release the checkout and lmdb environments."""
        self.co.close()
        self.repo._env._close_environments()

    def read(self, backend):
        """Timed: read every sample of the backend's column."""
        with self.aset as cm_aset:
            for i in cm_aset.keys():
                arr = cm_aset[i]
class _WriterSuite:
    """asv benchmark base: time writing ``num_samples`` (50, 50, 20) arrays.

    Parametrized over storage backends; subclasses define ``method`` and
    ``num_samples`` as class attributes (see placeholder comments in setup).
    """

    params = ['hdf5_00', 'hdf5_01', 'numpy_10']
    param_names = ['backend']
    processes = 2
    repeat = (2, 4, 30.0)  # repeat == tuple (min_repeat, max_repeat, max_time)
    number = 2
    warmup_time = 0

    def setup(self, backend):
        """Create a fresh temp repo and an 'aset' column on the given backend."""
        # self.method
        self.current_iter_number = 0
        self.backend_code = {
            'numpy_10': '10',
            'hdf5_00': '00',
            'hdf5_01': '01',
        }
        # self.num_samples
        self.sample_shape = (50, 50, 20)
        self.tmpdir = mkdtemp()
        self.repo = Repository(path=self.tmpdir, exists=False)
        self.repo.init('tester', '*****@*****.**', remove_old=True)
        self.co = self.repo.checkout(write=True)

        # One Hamming-window component per axis; their broadcast product is
        # a smooth (50, 50, 20) float32 source array.
        component_arrays = []
        ndims = len(self.sample_shape)
        for idx, shape in enumerate(self.sample_shape):
            layout = [1 for i in range(ndims)]
            layout[idx] = shape
            component = np.hamming(shape).reshape(*layout) * 100
            component_arrays.append(component.astype(np.float32))
        # BUGFIX: np.prod(component_arrays) relied on ragged object-array
        # creation (deprecated, removed in NumPy 1.24 -> ValueError).
        # Multiply the components with explicit broadcasting instead.
        arr = component_arrays[0]
        for component in component_arrays[1:]:
            arr = arr * component
        self.arr = arr.astype(np.float32)

        # Tolerate keyword/API renames across hangar versions; raise
        # NotImplementedError so asv skips unsupported backends.
        try:
            self.aset = self.co.arraysets.init_arrayset(
                'aset', prototype=self.arr, backend_opts=self.backend_code[backend])
        except TypeError:
            try:
                self.aset = self.co.arraysets.init_arrayset(
                    'aset', prototype=self.arr, backend=self.backend_code[backend])
            except ValueError:
                raise NotImplementedError
        except ValueError:
            raise NotImplementedError
        except AttributeError:
            self.aset = self.co.add_ndarray_column(
                'aset', prototype=self.arr, backend=self.backend_code[backend])

    def teardown(self, backend):
        """Close the checkout/environments and delete the temp repo."""
        self.co.close()
        self.repo._env._close_environments()
        rmtree(self.tmpdir)

    def write(self, backend):
        """Timed: write ``num_samples`` samples, perturbing one element per call.

        NOTE(review): ``current_iter_number`` grows once per timed call and is
        used as a raw index; it stays within the (50, 50, 20) bounds only
        because asv's repeat/number settings keep the call count small.
        """
        arr = self.arr
        iter_number = self.current_iter_number
        with self.aset as cm_aset:
            for i in range(self.num_samples):
                arr[iter_number, iter_number, iter_number] += 1
                cm_aset[i] = arr
        self.current_iter_number += 1
# # ## Initialize the Repo # In[2]: repo = Repository('~/jjmachan/hangar_examples/mnist') repo.init(user_name='jjmachan', user_email='*****@*****.**', remove_old=True) repo # In[3]: repo # In[4]: co = repo.checkout(write=True) co # In[5]: co # ## Arraysets # These are the structures that are used to store the data as numpy # arrays. Hence only numeric data can be stored. # In[6]: co.arraysets # In[7]:
def test_initial_read_checkout(managed_tmpdir):
    """A read checkout on a freshly initialized repo (no commits) must fail."""
    new_repo = Repository(path=managed_tmpdir, exists=False)
    new_repo.init(user_name='tester', user_email='*****@*****.**', remove_old=True)
    # No commit exists yet, so there is nothing for a reader to check out.
    with pytest.raises(ValueError):
        new_repo.checkout()
    new_repo._env._close_environments()
def test_push_and_clone_master_linear_history_multiple_commits(
        server_instance, repo, managed_tmpdir, array5by7, nCommits, nSamples):
    """Push a linear master history, clone it, and verify the partial state.

    The clone must mirror the branch list and full history while every
    sample remains a remote reference (raising FileNotFoundError on access)
    until data is fetched.
    """
    from hangar import Repository
    from hangar.records.summarize import list_history

    cmtList = []
    co = repo.checkout(write=True)
    co.arraysets.init_arrayset(name='writtenaset', shape=(5, 7), dtype=np.float32)
    for cIdx in range(nCommits):
        if cIdx != 0:
            # First iteration reuses the checkout created above.
            co = repo.checkout(write=True)
        sampList = []
        with co.arraysets['writtenaset'] as d:
            # Drop all but one key carried over from the previous commit so
            # each commit records a distinct mutation.
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
            for sIdx in range(nSamples):
                arr = np.random.randn(*array5by7.shape).astype(
                    np.float32) * 100
                d[str(sIdx)] = arr
                sampList.append(arr)
        cmt = co.commit(f'commit number: {cIdx}')
        cmtList.append((cmt, sampList))
        co.close()
    masterHist = list_history(repo._env.refenv, repo._env.branchenv, branch_name='master')

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'

    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)
    assert newRepo.list_branches() == ['master', 'origin/master']

    for cmt, sampList in cmtList:
        # Checking out a commit with unfetched data warns the user.
        with pytest.warns(UserWarning):
            nco = newRepo.checkout(commit=cmt)
        assert len(nco.arraysets) == 1
        assert 'writtenaset' in nco.arraysets
        assert len(nco.arraysets['writtenaset']) == len(sampList)
        assert nco.arraysets['writtenaset'].contains_remote_references is True
        remoteKeys = nco.arraysets['writtenaset'].remote_reference_keys
        assert [str(idx) for idx in range(len(sampList))] == remoteKeys
        for idx, _ in enumerate(sampList):
            sIdx = str(idx)
            # Key exists as a reference, but its data is not local yet.
            assert sIdx in nco.arraysets['writtenaset']
            with pytest.raises(FileNotFoundError):
                shouldNotExist = nco.arraysets['writtenaset'][sIdx]
        nco.close()

    cloneMasterHist = list_history(newRepo._env.refenv, newRepo._env.branchenv, branch_name='master')
    assert cloneMasterHist == masterHist
    newRepo._env._close_environments()