Exemplo n.º 1
0
def test_force_release_writer_lock_works(managed_tmpdir):
    """Force-releasing the writer lock warns and leaves the lock available.

    Opens a writer checkout (which holds the repo-wide writer lock), then
    calls the repository-level force release as if a different process were
    breaking a stale lock. The operation must emit a ``ResourceWarning`` and
    reset the checkout's lock token to the available sentinel.
    """
    repo = Repository(path=managed_tmpdir, exists=False)
    repo.init(user_name='tester', user_email='*****@*****.**', remove_old=True)
    co = repo.checkout(write=True)
    co.metadata['hello'] = 'world'

    # try to release the writer lock with a process which has different uid
    with pytest.warns(ResourceWarning):
        repo.force_release_writer_lock()

    # BUG FIX: this was a bare comparison whose result was silently
    # discarded -- it verified nothing. Assert the lock state explicitly.
    assert co._writer_lock == 'LOCK_AVAILABLE'
    co.close()
    # replace, but rest of object is closed
    repo._env._close_environments()
Exemplo n.º 2
0
def test_push_clone_digests_exceeding_server_nbyte_limit(
        server_instance, repo, managed_tmpdir):
    """Push/clone round-trips intact when payloads exceed the grpc byte caps.

    Shrinks both the server fetch and client push per-message byte budgets to
    100 KB so transfers must be split across multiple messages, then verifies
    that every commit and every sample survives push -> clone -> fetch_data.
    """
    from hangar.remote import config
    from hangar import Repository

    # force multi-part transfers by lowering the per-message byte limits
    config.config['server']['grpc']['fetch_max_nbytes'] = 100_000
    config.config['client']['grpc']['push_max_nbytes'] = 100_000

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.arraysets.init_arrayset(name='aset', shape=(50, 20), dtype=np.float32)
    for cIdx in range(4):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.arraysets['aset'] as d:
            # drop all but one sample carried over from the previous commit so
            # each commit records a distinct sample set
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
            for sIdx in range(70):
                arr = np.random.randn(50, 20).astype(np.float32)
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User',
                  '*****@*****.**',
                  server_instance,
                  remove_old=True)
    assert newRepo.list_branches() == ['master', 'origin/master']
    for cmt, sampList in masterCmtList:
        # clones are shallow on data; pull the actual tensors per commit
        newRepo.remote.fetch_data('origin', commit=cmt)
        nco = newRepo.checkout(commit=cmt)
        assert len(nco.arraysets) == 1
        assert 'aset' in nco.arraysets
        assert len(nco.arraysets['aset']) == 70
        for sIdx, samp in enumerate(sampList):
            assert np.allclose(nco.arraysets['aset'][str(sIdx)], samp)
        nco.close()
    newRepo._env._close_environments()
Exemplo n.º 3
0
def export_data(ctx, repo: Repository, column, outdir, startpoint, sample,
                format_, plugin):
    """Export COLUMN sample data as it existed a STARTPOINT to some format and path.

    Specifying which sample to be exported is possible by using the switch
    ``--sample`` (without this, all the samples in the given column will be
    exported). Since hangar supports both int and str datatype for the sample
    name, specifying that while mentioning the sample name might be necessary
    at times. It is possible to do that by separating the name and type by a
    colon.

    Example:

       1. if the sample name is string of numeric 10 - ``str:10`` or ``10``

       2. if the sample name is ``sample1`` - ``str:sample1`` or ``sample1``

       3. if the sample name is an int, let say 10 - ``int:10``
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit, get_staging_branch_head
    from hangar import external
    # extra --key=value args from the CLI are forwarded to the save plugin
    kwargs = parse_custom_arguments(ctx.args)

    # resolve STARTPOINT: branch name > short commit digest > staging head
    if startpoint in repo.list_branches():
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    elif startpoint:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)
    else:
        branch_name = get_staging_branch_head(repo._env.branchenv)
        base_commit = get_branch_head_commit(repo._env.branchenv, branch_name)

    co = repo.checkout(commit=base_commit)
    try:
        aset = co.columns.get(column)
        sampleNames = [sample] if sample is not None else list(aset.keys())
        extension = format_.lstrip('.') if format_ else None
        with aset, click.progressbar(sampleNames) as sNamesBar:
            for sampleN in sNamesBar:
                data = aset[sampleN]
                # encode the key's python type into the output name so the
                # int/str distinction survives the export (see docstring)
                formated_sampleN = f'{type(sampleN).__name__}:{sampleN}'
                try:
                    external.save(data, outdir, formated_sampleN, extension,
                                  plugin, **kwargs)
                except Exception as e:
                    raise click.ClickException(e)
    except KeyError as e:
        raise click.ClickException(e)
    finally:
        co.close()
Exemplo n.º 4
0
 def test_local_without_data_fails_no_common(self,
                                             written_two_cmt_server_repo,
                                             managed_tmpdir):
     """Building a tf dataset over keys whose data was never fetched raises.

     Clones without fetching tensor data, then requests keys (including -1,
     which is not a valid sample name here) -- make_tf_dataset must raise
     KeyError rather than yield missing data.
     """
     new_tmpdir = pjoin(managed_tmpdir, 'new')
     mkdir(new_tmpdir)
     server, _ = written_two_cmt_server_repo
     repo = Repository(path=new_tmpdir, exists=False)
     repo.clone('name', '[email protected]', server, remove_old=True)
     co = repo.checkout()
     aset = co.arraysets['writtenaset']
     with pytest.raises(KeyError):
         tf_dset = make_tf_dataset(aset, keys=['1', -1])
     co.close()
     repo._env._close_environments()
Exemplo n.º 5
0
    def test_server_fetch_data_sample(
            self, two_multi_format_repo_class, managed_tmpdir_class,
            fetchOp, column_name, keys, tmp_path_factory
    ):
        """fetch_data_sample retrieves exactly the requested samples post-clone.

        Parametrized over fetch-by-branch vs fetch-by-commit, flat vs nested
        column layouts, and several key specifications (scalar keys, (sample,)
        and (sample, subsample) tuples).
        """
        from hangar import Repository

        cmt, server_instance = two_multi_format_repo_class

        # Clone test (master branch)
        _new_tmpdir = tmp_path_factory.mktemp('newclone', numbered=True)
        new_tmpdir = str(_new_tmpdir)
        newRepo = Repository(path=new_tmpdir, exists=False)
        newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)

        # ------------------ format arguments depending on options -----------------

        kwargs = {
            'column': column_name,
            'samples': keys
        }
        if fetchOp == 'branch':
            kwargs['branch'] = 'master'
        elif fetchOp == 'commit':
            kwargs['commit'] = cmt
        else:
            raise ValueError(f'fetchOp unknown: {fetchOp}')

        fetch_commit = newRepo.remote.fetch_data_sample(remote='origin', **kwargs)
        assert fetch_commit == cmt

        co = newRepo.checkout()
        try:
            col = co[column_name]
            if isinstance(keys, (list, tuple)):
                if column_name.endswith('flat'):
                    # flat columns: each key maps directly to a sample
                    for key in keys:
                        assert col[key] is not None
                else:
                    # nested columns: keys may address a subsample or a whole
                    # sample (via Ellipsis)
                    for sample in keys:
                        if isinstance(sample, (list, tuple)):
                            if len(sample) == 2:
                                assert col[sample[0]][sample[1]] is not None
                            elif len(sample) == 1:
                                assert col[sample[0]][...] is not None
                        else:
                            assert col[sample][...] is not None
        finally:
            co.close()
            newRepo._env._close_environments()
Exemplo n.º 6
0
def test_push_restricted_with_right_username_password(
        server_instance_push_restricted, repo, managed_tmpdir):
    """Push succeeds against an auth-restricted server given valid credentials.

    Creates one commit of 70 samples, pushes with username/password, then
    clones and verifies every sample round-tripped.
    """
    from hangar import Repository

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.add_ndarray_column(name='aset', shape=(50, 20), dtype=np.float32)
    for cIdx in range(1):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.columns['aset'] as d:
            # drop samples carried over from any prior commit (no-op on cIdx 0)
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(70):
                arr = np.random.randn(50, 20).astype(np.float32)
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance_push_restricted)
    # credentials are redacted in this scraped source; the server fixture
    # accepts them as configured
    push1 = repo.remote.push('origin',
                             'master',
                             username='******',
                             password='******')
    assert push1 == 'master'

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User',
                  '*****@*****.**',
                  server_instance_push_restricted,
                  remove_old=True)
    assert newRepo.list_branches() == ['master', 'origin/master']
    for cmt, sampList in masterCmtList:
        newRepo.remote.fetch_data('origin', commit=cmt)
        nco = newRepo.checkout(commit=cmt)
        assert len(nco.columns) == 1
        assert 'aset' in nco.columns
        assert len(nco.columns['aset']) == 70
        for sIdx, samp in enumerate(sampList):
            assert np.allclose(nco.columns['aset'][str(sIdx)], samp)
        nco.close()
    newRepo._env._close_environments()
Exemplo n.º 7
0
def create_arrayset(repo: Repository, name, dtype, shape, variable_, named):
    """Create an arrayset with NAME and DTYPE of SHAPE.

    The arrayset will be created in the staging area / branch last used by a
    writer-checkout. Valid NAMEs contain only ascii letters and [``'.'``,
    ``'_'``, ``'-'``] (no whitespace). The DTYPE must be one of [``'UINT8'``,
    ``'INT8'``, ``'UINT16'``, ``'INT16'``, ``'UINT32'``, ``'INT32'``,
    ``'UINT64'``, ``'INT64'``, ``'FLOAT16'``, ``'FLOAT32'``, ``'FLOAT64'``].
    The SHAPE must be the last argument(s) specified, where each dimension size
    is identified by a (space seperated) list of numbers.

    Examples:

    To specify, an arrayset for some training images of dtype uint8 and shape
    (256, 256, 3) we should say:

       .. code-block:: console

          $ hangar arrayset create train_images UINT8 256 256 3

    To specify that the samples can be variably shaped (have any dimension size
    up to the maximum SHAPE specified) we would say:

       .. code-block:: console

          $ hangar arrayset create train_images UINT8 256 256 3 --variable-shape

    or equivalently:

       .. code-block:: console

          $ hangar arrayset create --variable-shape train_images UINT8 256 256 3

    """
    try:
        co = repo.checkout(write=True)
        # np.typeDict was deprecated in NumPy 1.20 and removed in 1.24.
        # np.dtype(name).type resolves the same scalar type class (e.g.
        # 'uint8' -> np.uint8) on all supported NumPy versions.
        aset = co.arraysets.init_arrayset(name=name,
                                          shape=shape,
                                          dtype=np.dtype(dtype.lower()).type,
                                          named_samples=named,
                                          variable_shape=variable_)
        click.echo(f'Initialized Arrayset: {aset.name}')
    # TypeError added: np.dtype raises it for unknown names where the old
    # typeDict lookup raised KeyError (already covered via LookupError).
    except (ValueError, LookupError, PermissionError, TypeError) as e:
        raise click.ClickException(e)
    finally:
        # co is unbound if checkout itself failed; nothing to close then
        try:
            co.close()
        except NameError:
            pass
Exemplo n.º 8
0
def test_push_fetch_records(server_instance, backend):
    """Pushing two branches via the CLI exits cleanly for each backend.

    Builds a repo with a master commit and a divergent testbranch commit
    (data + metadata), then invokes ``hangar push`` for both branches and
    checks the exit codes only -- record transfer, not data verification.
    """

    runner = CliRunner()
    with runner.isolated_filesystem():
        repo = Repository(getcwd(), exists=False)
        repo.init('foo', 'bar')
        dummyData = np.arange(50)
        co1 = repo.checkout(write=True, branch='master')
        co1.arraysets.init_arrayset(name='dummy',
                                    prototype=dummyData,
                                    named_samples=True,
                                    backend=backend)
        for idx in range(10):
            # reuse one buffer; hangar copies on write so this is safe
            dummyData[:] = idx
            co1.arraysets['dummy'][str(idx)] = dummyData
        co1.metadata['hello'] = 'world'
        co1.metadata['somemetadatakey'] = 'somemetadatavalue'
        cmt1 = co1.commit('first commit adding dummy data and hello meta')
        co1.close()

        repo.create_branch('testbranch')
        co2 = repo.checkout(write=True, branch='testbranch')
        for idx in range(10, 20):
            dummyData[:] = idx
            co2.arraysets['dummy'][str(idx)] = dummyData
        co2.metadata['foo'] = 'bar'
        cmt2 = co2.commit(
            'first commit on test branch adding non-conflict data and meta')
        co2.close()

        repo.remote.add('origin', server_instance)

        res = runner.invoke(cli.push, ['origin', 'master'], obj=repo)
        assert res.exit_code == 0
        res = runner.invoke(cli.push, ['origin', 'testbranch'], obj=repo)
        assert res.exit_code == 0
Exemplo n.º 9
0
def import_data(ctx, repo: Repository, column, path, branch, plugin,
                overwrite):
    """Import file or directory of files at PATH to COLUMN in the staging area.

    If passing in a directory, all files in the directory will be imported, if
    passing in a file, just that files specified will be
    imported
    """
    # TODO: ignore warning through env variable
    from types import GeneratorType
    from hangar import external
    from hangar.records.heads import get_staging_branch_head

    # extra --key=value CLI args are forwarded to the load plugin
    kwargs = parse_custom_arguments(ctx.args)
    if branch is None:
        branch = get_staging_branch_head(repo._env.branchenv)
    elif branch not in repo.list_branches():
        raise click.ClickException(
            f'Branch name: {branch} does not exist, Exiting.')
    click.echo(f'Writing to branch: {branch}')

    co = repo.checkout(write=True, branch=branch)
    try:
        active_aset = co.columns.get(column)
        p = Path(path)
        # a directory imports every file inside it; a file imports just itself
        files = [f.resolve()
                 for f in p.iterdir()] if p.is_dir() else [p.resolve()]
        with active_aset as aset, click.progressbar(files) as filesBar:
            for f in filesBar:
                ext = ''.join(f.suffixes).strip(
                    '.')  # multi-suffix files (tar.bz2)
                loaded = external.load(f,
                                       plugin=plugin,
                                       extension=ext,
                                       **kwargs)
                # plugins may yield many (array, name) pairs per file;
                # normalize the single-result case to an iterable
                if not isinstance(loaded, GeneratorType):
                    loaded = [loaded]
                for arr, fname in loaded:
                    # without --overwrite, existing sample names are skipped
                    if (not overwrite) and (fname in aset):
                        continue
                    try:
                        aset[fname] = arr
                    except ValueError as e:
                        click.echo(e)
    except (ValueError, KeyError) as e:
        raise click.ClickException(e)
    finally:
        co.close()
Exemplo n.º 10
0
def remove_column(repo: Repository, name):
    """Delete the column NAME (and all samples) from staging area.

    The column will be removed from the staging area / branch last used by a
    writer-checkout.
    """
    try:
        writer = repo.checkout(write=True)
        removed = writer.columns.delete(name)
        click.echo(f'Successfully removed column: {removed}')
    except (ValueError, KeyError, PermissionError) as err:
        raise click.ClickException(err)
    finally:
        # if checkout itself failed, `writer` was never bound -- skip close
        try:
            writer.close()
        except NameError:
            pass
Exemplo n.º 11
0
def test_checkout_writer_branch_lock_held_errors(dummy_repo: Repository):
    """CLI checkout fails cleanly while another writer holds the lock.

    With a live writer checkout on master, ``hangar checkout testbranch``
    must exit non-zero, leave the staging branch untouched, and keep the
    original checkout fully usable.
    """
    from hangar.records.heads import get_staging_branch_head
    dummy_repo.create_branch('testbranch')
    co = dummy_repo.checkout(write=True, branch='master')
    try:
        runner = CliRunner()
        res = runner.invoke(cli.checkout, ['testbranch'], obj=dummy_repo)
        assert res.exit_code == 1
        msg = res.stdout
        assert msg.startswith('Error: Cannot acquire the writer lock.') is True
        # staging head must not have moved to the requested branch
        recorded_branch = get_staging_branch_head(dummy_repo._env.branchenv)
        assert recorded_branch == 'master'
        assert dummy_repo.writer_lock_held is True
        assert co.branch_name == 'master'
    finally:
        co.close()
    # closing the checkout releases the lock
    assert dummy_repo.writer_lock_held is False
Exemplo n.º 12
0
class MakeCommit(object):
    """Benchmark the wall time of a single writer-checkout commit.

    The class attributes (``params``, ``param_names``, ``repeat``,
    ``number``, ``warmup_time``) follow the airspeed-velocity (asv)
    benchmark convention -- presumably this class is collected by an asv
    suite (TODO confirm). Each run stages ``num_samples`` array samples and
    ``num_metadata`` metadata entries, then times only the commit itself.
    """

    params = [(5_000, 20_000), (5_000, 20_000)]
    param_names = ['num_samples', 'num_metadata']
    processes = 2
    repeat = (2, 4, 20)
    number = 1
    warmup_time = 0

    def setup(self, num_samples, num_metadata):
        # fresh repository + writer checkout for every benchmark run
        self.tmpdir = mkdtemp()
        self.repo = Repository(path=self.tmpdir, exists=False)
        self.repo.init('tester', '*****@*****.**', remove_old=True)
        self.co = self.repo.checkout(write=True)
        arr = np.array([
            0,
        ], dtype=np.uint8)
        # hangar's column-creation API changed across releases; fall through
        # the historical signatures until one matches this install
        try:
            aset = self.co.arraysets.init_arrayset('aset',
                                                   prototype=arr,
                                                   backend_opts='10')
        except TypeError:
            aset = self.co.arraysets.init_arrayset('aset',
                                                   prototype=arr,
                                                   backend='10')
        except AttributeError:
            aset = self.co.add_ndarray_column('aset',
                                              prototype=arr,
                                              backend='10')

        # stage the sample data and metadata that the commit will record
        with aset as cm_aset:
            for i in range(num_samples):
                arr[:] = i % 255
                cm_aset[i] = arr
        with self.co.metadata as cm_meta:
            for i in range(num_metadata):
                cm_meta[i] = f'{i % 500} data'

    def teardown(self, num_samples, num_metadata):
        # close checkout/env handles before deleting the temp directory
        self.co.close()
        self.repo._env._close_environments()
        rmtree(self.tmpdir)

    def time_commit(self, num_samples, num_metadata):
        # the timed operation: commit the staged samples + metadata
        self.co.commit('hello')
Exemplo n.º 13
0
def import_data(repo: Repository, arrayset, path, branch, plugin, overwrite):
    """Import file(s) at PATH to ARRAYSET in the staging area.

    PATH may be a single file or a directory (every file inside is imported).
    Without ``overwrite``, file names already present in the arrayset are
    skipped. Images are read via :func:`hangar.cli.io.imread`.
    """
    from hangar.cli.io import imread
    from hangar.records.heads import get_staging_branch_head

    # BUG FIX: branch resolution and the checkout used to live inside the
    # try/finally, so the early `return None` (unknown branch) or a failing
    # checkout reached `finally: co.close()` before `co` was ever bound,
    # raising NameError. Resolve the branch and open the checkout first.
    if branch is not None:
        if branch in repo.list_branches():
            branch_name = branch
        else:
            click.echo(f'Branch name: {branch} does not exist, Exiting.')
            return None
    else:
        branch_name = get_staging_branch_head(repo._env.branchenv)
    click.echo(f'Writing to branch: {branch_name}')

    with warnings.catch_warnings():
        # suppress hangar's UserWarnings emitted while opening the checkout
        warnings.simplefilter("ignore", UserWarning)
        co = repo.checkout(write=True, branch=branch_name)
    try:
        aset = co.arraysets.get(arrayset)

        if os.path.isfile(path):
            fname = os.path.basename(path)
            if not overwrite:
                if fname in aset:
                    click.echo(f'skipping existing name: {fname} as overwrite flag not set')
                    return None
            fNamePth = [(fname, path)]
        else:
            fnames = os.listdir(path)
            if not overwrite:
                fnames = [fname for fname in fnames if fname not in aset]
            fNamePth = [(fname, os.path.join(path, fname)) for fname in fnames]

        with aset as a, click.progressbar(fNamePth) as fnamesBar:
            for fn, fpth in fnamesBar:
                arr = imread(fpth, plugin=plugin)
                try:
                    a[fn] = arr
                except ValueError as e:
                    # report the bad sample but continue with the rest
                    click.echo(e)
    finally:
        co.close()
Exemplo n.º 14
0
def export_data(repo: Repository, startpoint, arrayset, out, sample, format_,
                plugin):
    """export ARRAYSET sample data as it existed a STARTPOINT to some format and path.

    STARTPOINT is resolved as a branch name first, otherwise as a short commit
    digest. With ``sample`` set only that sample is exported; otherwise every
    sample in the arrayset is written to OUT via
    :func:`hangar.cli.io.imsave`.
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit
    from hangar.cli.io import imsave

    if startpoint in repo.list_branches():
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    else:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)

    # BUG FIX: the checkout used to be the first statement *inside* the try
    # block; if it raised, `finally: co.close()` failed with NameError and
    # masked the real error. Open the checkout before entering try/finally.
    co = repo.checkout(write=False, commit=base_commit)
    try:
        arrayset = co.arraysets[arrayset]
        if sample:
            sampleNames = [sample]
        else:
            sampleNames = list(arrayset.keys())

        if format_:
            format_ = format_.lstrip('.')
        outP = os.path.expanduser(os.path.normpath(out))

        with arrayset as aset, click.progressbar(sampleNames) as sNamesBar:
            for sampleN in sNamesBar:
                # only append the extension when the name doesn't already
                # carry it
                if format_:
                    if sampleN.endswith(format_):
                        outFP = os.path.join(outP, f'{sampleN}')
                    else:
                        outFP = os.path.join(outP, f'{sampleN}.{format_}')
                else:
                    outFP = os.path.join(outP, f'{sampleN}')
                try:
                    data = aset[sampleN]
                    imsave(outFP, data)
                except KeyError as e:
                    # report the missing sample but continue exporting
                    click.echo(e)
    finally:
        co.close()
Exemplo n.º 15
0
def test_force_release_writer_lock(managed_tmpdir, monkeypatch):
    """Closing a checkout whose lock token was clobbered raises RuntimeError.

    Corrupts the checkout's private writer-lock token so close() cannot
    release the real lock, then restores the token (and the patched
    internals) so teardown can close cleanly.
    """

    repo = Repository(path=managed_tmpdir, exists=False)
    repo.init(user_name='tester', user_email='*****@*****.**', remove_old=True)
    co = repo.checkout(write=True)
    # remember the genuine lock token so it can be restored below
    orig_lock = str(co._writer_lock)

    def mock_true(*args, **kwargs):
        return True

    # try to release the writer lock with a process which has different uid
    co._writer_lock = 'lololol'
    with pytest.raises(RuntimeError):
        # bypass liveness/destruct checks so close() reaches the lock release
        monkeypatch.setattr(co, '_verify_alive', mock_true)
        monkeypatch.setattr(co._columns, '_destruct', mock_true)
        co.close()
    # replace, but rest of object is closed
    monkeypatch.setattr(co, '_writer_lock', orig_lock)
    monkeypatch.delattr(co._columns, '_destruct')
    co.close()
    repo._env._close_environments()
Exemplo n.º 16
0
def test_force_release_writer_lock(managed_tmpdir, monkeypatch):
    """Variant: close() with a corrupted lock token raises RuntimeError.

    Same scenario as the sibling test but patches the name-mangled
    ``__acquire_writer_lock`` instead of the destruct/liveness hooks.
    """
    from hangar.records import heads

    repo = Repository(path=managed_tmpdir, exists=False)
    repo.init(user_name='tester', user_email='*****@*****.**', remove_old=True)
    co = repo.checkout(write=True)
    # remember the genuine lock token so it can be restored below
    orig_lock = str(co._writer_lock)

    def mock_true(*args, **kwargs):
        return True

    co.metadata['hello'] = 'world'
    # try to release the writer lock with a process which has different uid
    co._writer_lock = 'lololol'
    with pytest.raises(RuntimeError):
        monkeypatch.setattr(co, '_WriterCheckout__acquire_writer_lock',
                            mock_true)
        co.close()
    # replace, but rest of object is closed
    monkeypatch.setattr(co, '_writer_lock', orig_lock)
    co.close()
    repo._env._close_environments()
Exemplo n.º 17
0
def commit(repo: Repository, message):
    """Commits outstanding changes.

    Commit changes to the given files into the repository. You will need to
    'push' to push up your changes to other repositories.
    """
    from hangar.records.summarize import status

    co = repo.checkout(write=True)
    try:
        if not message:
            # no -m given: open the user's editor pre-filled with a
            # git-style commented status summary
            diff = co.diff.staged()
            status_txt = status(co._hashenv, co.branch_name, diff.diff)
            status_txt.seek(0)
            marker = '# Changes To Be committed: \n'
            hint = ['\n', '\n', marker, '# \n']
            for line in status_txt.readlines():
                hint.append(f'# {line}')
            # open default system editor
            message = click.edit(''.join(hint))
            if message is None:
                # editor closed without saving
                click.echo('Aborted!')
                return
            # keep only the user's text above the status marker
            msg = message.split(marker)[0].rstrip()
            if not msg:
                click.echo('Aborted! Empty commit message')
                return
        else:
            # -m may be passed multiple times; join the parts into one message
            msg = '\n'.join(message)

        click.echo('Commit message:\n' + msg)
        try:
            digest = co.commit(msg)
            click.echo(f'Commit Successful. Digest: {digest}')
        except RuntimeError as e:
            raise click.ClickException(e)
    finally:
        co.close()
Exemplo n.º 18
0
def view_data(repo: Repository, startpoint, arrayset, sample, plugin):
    """Use a plugin to view the data of some SAMPLE in ARRAYSET at STARTPOINT.

    STARTPOINT is resolved as a branch name first, otherwise as a short
    commit digest. The sample is displayed with
    :func:`hangar.cli.io.imshow` / :func:`~hangar.cli.io.show`.
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit
    from hangar.cli.io import imshow, show

    if startpoint in repo.list_branches():
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    else:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)

    # BUG FIX: the checkout used to be opened *inside* the try block; if it
    # raised, `finally: co.close()` failed with NameError and masked the real
    # error. Open the checkout before entering try/finally.
    co = repo.checkout(write=False, commit=base_commit)
    try:
        arrayset = co.arraysets[arrayset]
        try:
            data = arrayset[sample]
            imshow(data, plugin=plugin)
            show()
        except KeyError as e:
            # unknown sample name: report it rather than crash
            click.echo(e)
    finally:
        co.close()
Exemplo n.º 19
0
def test_server_push_two_branch_then_clone_fetch_data_options(
        server_instance, repo, managed_tmpdir, array5by7, nMasterCommits,
        nMasterSamples, nDevCommits, nDevSamples, fetchBranch, fetchCommit,
        fetchAsetns, fetchNbytes, fetchAll_history):
    """fetch_data honors its filtering options after a two-branch push/clone.

    Pushes master and a dev branch (two arraysets each), clones, then fetches
    data restricted by branch/commit, arrayset names, a max byte budget, and
    full-history vs head-only -- verifying exactly the expected data arrives.
    """
    from hangar import Repository
    from hangar.records.summarize import list_history

    # Push master branch test
    masterCmts = {}
    co = repo.checkout(write=True)
    co.arraysets.init_arrayset(name='writtenaset',
                               shape=(5, 7),
                               dtype=np.float32)
    co.arraysets.init_arrayset(name='_two', shape=(20), dtype=np.float32)
    for cIdx in range(nMasterCommits):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList1 = []
        masterSampList2 = []
        with co.arraysets['writtenaset'] as d, co.arraysets['_two'] as dd:
            # drop all but one sample carried over from the previous commit
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
                dd.remove(prevKey)

            for sIdx in range(nMasterSamples):
                arr1 = np.random.randn(*array5by7.shape).astype(
                    np.float32) * 100
                d[str(sIdx)] = arr1
                masterSampList1.append(arr1)
                arr2 = np.random.randn(20).astype(np.float32)
                dd[str(sIdx)] = arr2
                masterSampList2.append(arr2)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmts[cmt] = (masterSampList1, masterSampList2)
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'
    masterHist = list_history(repo._env.refenv,
                              repo._env.branchenv,
                              branch_name='master')

    # Push dev branch test
    devCmts = masterCmts.copy()
    branch = repo.create_branch('testbranch')
    for cIdx in range(nDevCommits):
        co = repo.checkout(write=True, branch=branch.name)
        devSampList1 = []
        devSampList2 = []
        with co.arraysets['writtenaset'] as d, co.arraysets['_two'] as dd:
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
                dd.remove(prevKey)

            for sIdx in range(nDevSamples):
                arr1 = np.random.randn(*array5by7.shape).astype(
                    np.float32) * 100
                d[str(sIdx)] = arr1
                devSampList1.append(arr1)
                arr2 = np.random.randn(20).astype(np.float32)
                dd[str(sIdx)] = arr2
                devSampList2.append(arr2)
        cmt = co.commit(f'dev commit number: {cIdx}')
        devCmts[cmt] = (devSampList1, devSampList2)
        co.close()

    push2 = repo.remote.push('origin', branch.name)
    assert push2 == branch.name
    branchHist = list_history(repo._env.refenv,
                              repo._env.branchenv,
                              branch_name=branch.name)

    # -------------------------- end setup ------------------------------------

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User',
                  '*****@*****.**',
                  server_instance,
                  remove_old=True)
    newRepo.remote.fetch('origin', branch=branch.name)
    newRepo.create_branch('testbranch', base_commit=branchHist['head'])
    assert newRepo.list_branches() == [
        'master', 'origin/master', f'origin/{branch.name}', branch.name
    ]

    # ------------------ format arguments depending on options -----------------

    kwargs = {
        'arrayset_names': fetchAsetns,
        'max_num_bytes': fetchNbytes,
        'retrieve_all_history': fetchAll_history,
    }
    if fetchBranch is not None:
        func = branchHist if fetchBranch == 'testbranch' else masterHist
        kwargs['branch'] = fetchBranch
        kwargs['commit'] = None
    else:
        # NOTE(review): fetchBranch is None here, so the 'br' comparison is
        # always False and masterHist is always chosen -- looks intentional
        # for the commit-based parametrization, but confirm.
        func = branchHist if fetchBranch == 'br' else masterHist
        kwargs['branch'] = None
        kwargs['commit'] = func['head']

    if fetchAll_history is True:
        commits_to_check = func['order']
    else:
        commits_to_check = [func['head']]

    # ----------------------- retrieve data with desired options --------------

    # This case should fail
    if (fetchAll_history is True) and isinstance(fetchNbytes, int):
        try:
            with pytest.raises(ValueError):
                fetch_commits = newRepo.remote.fetch_data(remote='origin',
                                                          **kwargs)
        finally:
            newRepo._env._close_environments()
        return True
    # get data
    fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
    assert commits_to_check == fetch_commits

    # ------------- check that you got everything you expected ----------------

    for fCmt in fetch_commits:
        co = newRepo.checkout(commit=fCmt)
        assert co.commit_hash == fCmt

        # when we are checking one aset only
        if isinstance(fetchAsetns, tuple):
            d = co.arraysets[fetchAsetns[0]]
            # ensure we didn't fetch the other data simultaneously

            ds1SampList, ds2SampList = devCmts[fCmt]
            if fetchAsetns[0] == 'writtenaset':
                compare = ds1SampList
            else:
                compare = ds2SampList

            totalSeen = 0
            for idx, samp in enumerate(compare):
                if fetchNbytes is None:
                    assert np.allclose(samp, d[str(idx)])
                else:
                    # with a byte budget, samples past the cap are expected to
                    # be locally absent (FileNotFoundError)
                    try:
                        arr = d[str(idx)]
                        assert np.allclose(samp, arr)
                        totalSeen += arr.nbytes
                    except FileNotFoundError:
                        pass
                    assert totalSeen <= fetchNbytes

        # compare both asets at the same time
        else:
            d = co.arraysets['writtenaset']
            dd = co.arraysets['_two']
            ds1List, ds2List = devCmts[fCmt]
            totalSeen = 0
            for idx, ds1ds2 in enumerate(zip(ds1List, ds2List)):
                ds1, ds2 = ds1ds2
                if fetchNbytes is None:
                    assert np.allclose(ds1, d[str(idx)])
                    assert np.allclose(ds2, dd[str(idx)])
                else:
                    try:
                        arr1 = d[str(idx)]
                        assert np.allclose(ds1, arr1)
                        totalSeen += arr1.nbytes
                    except FileNotFoundError:
                        pass
                    try:
                        arr2 = dd[str(idx)]
                        assert np.allclose(ds2, arr2)
                        totalSeen += arr2.nbytes
                    except FileNotFoundError:
                        pass
                    assert totalSeen <= fetchNbytes
        co.close()
    newRepo._env._close_environments()
Exemplo n.º 20
0
class StockRoom:
    """
    This class is the only user entrypoint of stockroom that interacts with an existing
    stock repository i.e. all the repository interaction a user would do will have to go
    through an object of this class. Also, stockroom comes with three different storages

    1. Model: Weights of models built with ``keras.Model`` or ``torch.nn``
    2. Data: Dataset as numpy arrays/tensors
    3. Experiment: Information related to an experiment such as metrics, parameters etc

    An object of this class holds an object to these three storages each has a dictionary
    style access machinery
    """

    def __init__(self, path: Union[str, Path] = None, enable_write: bool = False):
        self.path = Path(path) if path else get_stock_root(Path.cwd())
        self._repo = Repository(self.path)
        self.head = get_current_head(
            self.path
        )  # TODO: should this be None if writer enabled
        if enable_write:
            self.accessor = self._repo.checkout(write=True)
        else:
            if not self.head:
                # Repository without any commit yet: nothing to read-checkout
                self.accessor = None
            else:
                self.accessor = self._repo.checkout(commit=self.head)
        # TODO: Test this extensively
        if self.accessor is not None:
            with ExitStack() as stack:
                stack.enter_context(self.accessor)
                self._stack = stack.pop_all()

        self.data = Data(self.accessor)
        self.model = Model(self.accessor)
        self.experiment = Experiment(self.accessor)

    @contextmanager
    def enable_write(self, autocommit=True, commit_msg=None):
        """Context manager that temporarily switches to a write-enabled checkout.

        Parameters
        ----------
        autocommit : bool
            If True (default), automatically commit on exit when the staging
            area is dirty.
        commit_msg : str, optional
            Message used for the auto-commit. Defaults to an
            ``"Auto-committing at <timestamp>"`` message built at commit time.
            (The previous default was an f-string evaluated once at class
            definition, which froze the timestamp for the process lifetime.)
        """
        if isinstance(self.accessor, WriterCheckout):
            warnings.warn(
                "Write access is already enabled. Doing nothing!!", UserWarning
            )
            yield
        else:
            self.accessor = self._repo.checkout(write=True)
            self.data = Data(self.accessor)
            self.model = Model(self.accessor)
            self.experiment = Experiment(self.accessor)

            with self.accessor:
                yield
            if autocommit and self.accessor.diff.status() != "CLEAN":
                if commit_msg is None:
                    # Build the message here so the timestamp reflects when
                    # the commit actually happens, not when the module loaded.
                    commit_msg = f"Auto-committing at {time.time()}"
                self.commit(commit_msg)
            self.accessor.close()

            # TODO: these objects doesn't need to recreate no column creation inside the CM.
            #   Find a way to track that
            self.accessor = self._repo.checkout()
            self.data = Data(self.accessor)
            self.model = Model(self.accessor)
            self.experiment = Experiment(self.accessor)

    def update_head(self):
        """Re-point the reader checkout at the latest stock head.

        No-op for write-enabled instances, which always track staging.
        """
        if self._repo.writer_lock_held:
            logger.info(
                "Write enabled checkouts will always be on the latest head "
                "(staging). Doing nothing"
            )
            return
        self.head = get_current_head(self.path)
        self.accessor.__exit__()
        self.accessor.close()
        self.accessor = self._repo.checkout(commit=self.head).__enter__()

    def close(self):
        """Release the context stack and the underlying checkout, if any.

        When the repository had no commits at construction time, ``_stack``
        was never created and ``accessor`` is None -- guard both instead of
        raising AttributeError as the previous implementation did.
        """
        if hasattr(self, "_stack"):
            self._stack.close()
        if self.accessor is not None:
            self.accessor.close()

    @property
    def stockroot(self) -> Path:
        """
        Returns the root of stock repository
        """
        return self.path

    def commit(self, message: str, update_head=True) -> str:
        """
        Make a stock commit. A stock commit is a hangar commit plus writing the commit
        hash to the stock file. This function opens the stock checkout in write mode and
        close after the commit. Which means, no other write operations should be running
        while stock commit is in progress
        """
        digest = self.accessor.commit(message)
        set_current_head(self.stockroot, digest)
        if update_head:
            self.update_head()
        return digest

    def __getstate__(self):
        # Writer checkouts hold an exclusive lock and cannot be shared
        # across processes, so refuse to pickle them.
        if isinstance(self.accessor, WriterCheckout):
            raise RuntimeError("Write enabled instance is not pickle-able")
        return self.__dict__
Exemplo n.º 21
0

if __name__ == '__main__':
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument('--gpus', type=int, default=None)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--max_epochs', type=int, default=1)
    parser.add_argument('--max_elems', type=int, default=60000)
    parser.add_argument('--hangar', action='store_true')
    args = parser.parse_args()

    repo = Repository(path=Path(__file__).parent / "hangar")
    co = repo.checkout()

    if args.hangar:
        dataset = make_torch_dataset(
            [co.columns['digits'], co.columns['label']],
            index_range=slice(0, args.max_elems))
    else:
        dataset = MNIST(os.getcwd(),
                        download=True,
                        transform=transforms.ToTensor())
    print(len(dataset))
    datapoint, label = dataset[0]
    print(type(datapoint), type(label))
    print("making a loader!")
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
Exemplo n.º 22
0
def create_column(repo: Repository, name, dtype, shape, variable_, subsamples_):
    """Create an column with NAME and DTYPE of SHAPE.

    The column will be created in the staging area / branch last used by a
    writer-checkout. Valid NAMEs contain only ascii letters and [``'.'``,
    ``'_'``, ``'-'``] (no whitespace). The DTYPE must be one of [``'UINT8'``,
    ``'INT8'``, ``'UINT16'``, ``'INT16'``, ``'UINT32'``, ``'INT32'``,
    ``'UINT64'``, ``'INT64'``, ``'FLOAT16'``, ``'FLOAT32'``, ``'FLOAT64'``,
    ``'STR'``].

    If a ndarray dtype is specified (not 'STR'), then the SHAPE must be the
    last argument(s) specified, where each dimension size is identified by
    a (space seperated) list of numbers.

    Examples:

    To specify, a column for some training images of dtype uint8 and shape
    (256, 256, 3) we should say:

       .. code-block:: console

          $ hangar column create train_images UINT8 256 256 3

    To specify that the samples can be variably shaped (have any dimension size
    up to the maximum SHAPE specified) we would say:

       .. code-block:: console

          $ hangar column create train_images UINT8 256 256 3 --variable-shape

    or equivalently:

       .. code-block:: console

          $ hangar column create --variable-shape train_images UINT8 256 256 3

    To specify that the column contains a nested set of subsample data under a
    common sample key, the ``--contains-subsamples`` flag can be used.

       .. code-block:: console

          $ hangar column create --contains-subsamples train_images UINT8 256 256 3

    """
    try:
        co = repo.checkout(write=True)
        if dtype == 'STR':
            col = co.add_str_column(name=name, contains_subsamples=subsamples_)
        else:
            # ``np.typeDict`` was deprecated and removed from NumPy (>= 1.24);
            # ``np.dtype(...).type`` resolves the same scalar type on every
            # NumPy version. It raises TypeError (not KeyError) on an unknown
            # name, hence TypeError is added to the handled exceptions below.
            col = co.add_ndarray_column(name=name,
                                        shape=shape,
                                        dtype=np.dtype(dtype.lower()).type,
                                        variable_shape=variable_,
                                        contains_subsamples=subsamples_)
        click.echo(f'Initialized Column: {col.column}')
    except (ValueError, TypeError, LookupError, PermissionError) as e:
        raise click.ClickException(e)
    finally:
        # ``co`` is unbound if the checkout itself failed; ignore that case.
        try:
            co.close()
        except NameError:
            pass
Exemplo n.º 23
0
    def test_server_push_two_branch_then_clone_fetch_data_options(
            self, two_branch_multi_commit_repo_class, managed_tmpdir_class, array5by7_class,
            fetchBranch, fetchCommit, fetchAsetns, fetchNbytes, fetchAll_history, tmp_path_factory):
        """Clone a two-branch repo and verify ``remote.fetch_data`` honors the
        column-name, max-byte and history-depth options in every combination.
        """
        from hangar import Repository
        from operator import eq

        branch, branchHist, devCmts, masterHist, server_instance = two_branch_multi_commit_repo_class

        # Clone test (master branch)
        _new_tmpdir = tmp_path_factory.mktemp('newclone', numbered=True)
        new_tmpdir = str(_new_tmpdir)
        newRepo = Repository(path=new_tmpdir, exists=False)
        newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)
        newRepo.remote.fetch('origin', branch=branch.name)
        newRepo.create_branch('testbranch', base_commit=branchHist['head'])
        assert newRepo.list_branches() == ['master', 'origin/master', f'origin/{branch.name}', branch.name]

        # ------------------ format arguments depending on options -----------------

        kwargs = {
            'column_names': fetchAsetns,
            'max_num_bytes': fetchNbytes,
            'retrieve_all_history': fetchAll_history,
        }
        if fetchBranch is not None:
            func = branchHist if fetchBranch == 'testbranch' else masterHist
            kwargs['branch'] = fetchBranch
            kwargs['commit'] = None
        else:
            # NOTE(review): ``fetchBranch`` is None on this path, so the
            # comparison to 'br' can never be true and ``masterHist`` is
            # always chosen -- confirm that is the intent.
            func = branchHist if fetchBranch == 'br' else masterHist
            kwargs['branch'] = None
            kwargs['commit'] = func['head']

        if fetchAll_history is True:
            commits_to_check = func['order']
        else:
            commits_to_check = [func['head']]

        # ----------------------- retrieve data with desired options --------------

        # This case should fail
        if (fetchAll_history is True) and isinstance(fetchNbytes, int):
            try:
                with pytest.raises(ValueError):
                    fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
            finally:
                newRepo._env._close_environments()
            return True
        # get data
        fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
        assert commits_to_check == fetch_commits

        # ------------- check that you got everything you expected ----------------

        for fCmt in fetch_commits:
            co = newRepo.checkout(commit=fCmt)
            assert co.commit_hash == fCmt

            # when we are checking one aset only
            if isinstance(fetchAsetns, tuple):
                d = co.columns[fetchAsetns[0]]
                # ensure we didn't fetch the other data simultaneously

                ds1SampList, ds2SampList, ds3SampList, ds4SampList = devCmts[fCmt]
                if fetchAsetns[0] == 'writtenaset':
                    compare = ds1SampList
                    cmp_func = np.allclose
                elif fetchAsetns[0] == '_two':
                    compare = ds2SampList
                    cmp_func = np.allclose
                elif fetchAsetns[0] == 'str_col':
                    compare = ds3SampList
                    cmp_func = eq
                else:
                    compare = ds4SampList
                    cmp_func = eq

                totalSeen = 0
                for idx, samp in enumerate(compare):
                    if fetchNbytes is None:
                        assert cmp_func(samp, d[str(idx)])
                    else:
                        # Samples beyond the byte budget were not downloaded;
                        # reading them locally raises FileNotFoundError.
                        try:
                            arr = d[str(idx)]
                            assert cmp_func(samp, arr)
                            try:
                                totalSeen += arr.nbytes
                            except AttributeError:
                                # str/bytes samples have no ``.nbytes``
                                totalSeen += len(arr)
                        except FileNotFoundError:
                            pass
                        assert totalSeen <= fetchNbytes

            # compare both asets at the same time
            else:
                d = co.columns['writtenaset']
                dd = co.columns['_two']
                str_col = co.columns['str_col']
                bytes_col = co.columns['bytes_col']
                ds1List, ds2List, ds3List, ds4List = devCmts[fCmt]
                totalSeen = 0
                for idx, ds1ds2ds3ds4 in enumerate(zip(ds1List, ds2List, ds3List, ds4List)):
                    ds1, ds2, ds3, ds4 = ds1ds2ds3ds4
                    if fetchNbytes is None:
                        assert np.allclose(ds1, d[str(idx)])
                        assert np.allclose(ds2, dd[str(idx)])
                        assert ds3 == str_col[str(idx)]
                        assert ds4 == bytes_col[str(idx)]
                    else:
                        try:
                            arr1 = d[str(idx)]
                            assert np.allclose(ds1, arr1)
                            totalSeen += arr1.nbytes
                        except FileNotFoundError:
                            pass
                        try:
                            arr2 = dd[str(idx)]
                            assert np.allclose(ds2, arr2)
                            totalSeen += arr2.nbytes
                        except FileNotFoundError:
                            pass
                        try:
                            sval = str_col[str(idx)]
                            assert ds3 == sval
                            totalSeen += len(sval.encode())
                        except FileNotFoundError:
                            pass
                        try:
                            bval = bytes_col[str(idx)]
                            assert ds4 == bval
                            totalSeen += len(bval)
                        except FileNotFoundError:
                            pass
                        assert totalSeen <= fetchNbytes
            co.close()
        newRepo._env._close_environments()
Exemplo n.º 24
0
def test_server_push_second_branch_with_new_commit_then_clone_partial_fetch(
        server_instance, repo, managed_tmpdir, array5by7, nMasterCommits,
        nMasterSamples, nDevCommits, nDevSamples):
    """Push master plus a dev branch, clone, and verify sample data stays
    remote after a partial fetch: keys exist but local reads must raise
    FileNotFoundError until the data itself is fetched.
    """
    from hangar import Repository
    from hangar.records.summarize import list_history

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.add_ndarray_column(name='writtenaset', shape=(5, 7), dtype=np.float32)
    for cIdx in range(nMasterCommits):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.columns['writtenaset'] as d:
            # drop all but one key carried over from the previous commit
            # before refilling with this commit's samples
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(nMasterSamples):
                arr = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'
    masterHist = list_history(repo._env.refenv, repo._env.branchenv, branch_name='master')

    # Push dev branch test
    devCmtList = []
    branch = repo.create_branch('testbranch')
    for cIdx in range(nDevCommits):
        co = repo.checkout(write=True, branch=branch.name)
        devSampList = []
        with co.columns['writtenaset'] as d:
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(nDevSamples):
                arr = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr
                devSampList.append(arr)
        cmt = co.commit(f'dev commit number: {cIdx}')
        devCmtList.append((cmt, devSampList))
        co.close()

    push2 = repo.remote.push('origin', branch.name)
    assert push2 == branch.name
    branchHist = list_history(repo._env.refenv, repo._env.branchenv, branch_name=branch.name)

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)
    assert newRepo.list_branches() == ['master', 'origin/master']
    for cmt, sampList in masterCmtList:
        with pytest.warns(UserWarning):
            nco = newRepo.checkout(commit=cmt)
        assert len(nco.columns) == 1
        assert 'writtenaset' in nco.columns
        assert len(nco.columns['writtenaset']) == nMasterSamples

        assert nco.columns['writtenaset'].contains_remote_references is True
        remoteKeys = nco.columns['writtenaset'].remote_reference_keys
        assert tuple([str(idx) for idx in range(len(sampList))]) == remoteKeys
        for idx, _ in enumerate(sampList):
            sIdx = str(idx)
            assert sIdx in nco.columns['writtenaset']
            # data was never fetched, so sample reads must fail locally
            with pytest.raises(FileNotFoundError):
                shouldNotExist = nco.columns['writtenaset'][sIdx]
        nco.close()
    cloneMasterHist = list_history(newRepo._env.refenv, newRepo._env.branchenv, branch_name='master')
    assert cloneMasterHist == masterHist

    # Fetch test
    fetch = newRepo.remote.fetch('origin', branch=branch.name)
    assert fetch == f'origin/{branch.name}'
    assert newRepo.list_branches() == ['master', 'origin/master', f'origin/{branch.name}']
    for cmt, sampList in devCmtList:

        with pytest.warns(UserWarning):
            nco = newRepo.checkout(commit=cmt)
        assert len(nco.columns) == 1
        assert 'writtenaset' in nco.columns
        assert len(nco.columns['writtenaset']) == nDevSamples

        assert nco.columns['writtenaset'].contains_remote_references is True
        remoteKeys = nco.columns['writtenaset'].remote_reference_keys
        assert tuple([str(idx) for idx in range(len(sampList))]) == remoteKeys
        for idx, _ in enumerate(sampList):
            sIdx = str(idx)
            assert sIdx in nco.columns['writtenaset']
            with pytest.raises(FileNotFoundError):
                shouldNotExist = nco.columns['writtenaset'][sIdx]
        nco.close()

    cloneBranchHist = list_history(newRepo._env.refenv, newRepo._env.branchenv, branch_name=f'origin/{branch.name}')
    assert cloneBranchHist == branchHist
    newRepo._env._close_environments()
Exemplo n.º 25
0
class _WriterSuite:

    processes = 2
    repeat = 2
    number = 1
    warmup_time = 0

    def setup(self):
        """Prepare a throwaway repository with a single ``'aset'`` arrayset.

        The benchmark harness injects ``self.method``, ``self.backend``,
        ``self.dtype`` and ``self.num_samples`` before this runs. For the
        'read' method the column is pre-populated and committed so the timed
        phase measures reads only.
        """
        self.backend_code = {'numpy_10': '10', 'hdf5_00': '00'}
        self.type_code = {
            'float32': np.float32,
            'uint16': np.uint16,
        }

        self.tmpdir = mkdtemp()
        self.repo = Repository(path=self.tmpdir, exists=False)
        self.repo.init('tester', '*****@*****.**', remove_old=True)
        co = self.repo.checkout(write=True)

        # Outer product of two hamming windows -> deterministic 100x100 sample.
        window = np.hamming(100)
        seed = np.round(window.reshape(100, 1) * window.reshape(1, 100) * 1000)
        seed = seed.astype(self.type_code[self.dtype])
        arr = np.zeros((100, 100), dtype=seed.dtype)
        arr[:, :] = seed

        # Older hangar accepted ``backend_opts``; newer versions use ``backend``.
        try:
            aset = co.arraysets.init_arrayset(
                'aset',
                prototype=arr,
                backend_opts=self.backend_code[self.backend])
        except TypeError:
            aset = co.arraysets.init_arrayset(
                'aset', prototype=arr, backend=self.backend_code[self.backend])

        if self.method == 'read':
            with aset as cm_aset:
                for sample_idx in range(self.num_samples):
                    arr += 1
                    cm_aset[sample_idx] = arr
            co.commit('first commit')
            co.close()
            self.co = self.repo.checkout(write=False)
        else:
            self.arr = arr
            self.co = co

    def teardown(self):
        """Close the checkout and wipe the temporary repository."""
        self.co.close()
        self.repo._env._close_environments()
        rmtree(self.tmpdir)

    def read(self):
        """Fetch every stored sample exactly once."""
        aset = self.co.arraysets['aset']
        keys = list(aset.keys())
        with aset as cm_aset:
            for key in keys:
                _ = cm_aset[key]

    def write(self):
        """Write ``num_samples`` incrementally-mutated samples."""
        data = self.arr
        aset = self.co.arraysets['aset']
        with aset as cm_aset:
            for sample_idx in range(self.num_samples):
                data += 1
                cm_aset[sample_idx] = data

    def size(self):
        """Total on-disk size of the repository directory tree."""
        return folder_size(self.repo._env.repo_path, recurse=True)
Exemplo n.º 26
0
class _ReaderSuite:

    params = ['hdf5_00', 'hdf5_01', 'numpy_10']
    param_names = ['backend']
    processes = 2
    repeat = (2, 4, 30.0)
    # repeat == tuple (min_repeat, max_repeat, max_time)
    number = 3
    warmup_time = 0
    timeout = 60

    def setup_cache(self):
        """Build a repo in cwd with one column per backend, each holding
        ``num_samples`` float32 samples, committed once for all benchmark runs.
        """
        backend_code = {
            'numpy_10': '10',
            'hdf5_00': '00',
            'hdf5_01': '01',
        }

        sample_shape = (50, 50, 10)
        num_samples = 3_000

        repo = Repository(path=os.getcwd(), exists=False)
        repo.init('tester', '*****@*****.**', remove_old=True)
        co = repo.checkout(write=True)

        # One broadcastable hamming-window component per axis; multiplying
        # them together yields a full (50, 50, 10) sample array.
        component_arrays = []
        ndims = len(sample_shape)
        for idx, shape in enumerate(sample_shape):
            layout = [1 for i in range(ndims)]
            layout[idx] = shape
            component = np.hamming(shape).reshape(*layout) * 100
            component_arrays.append(component.astype(np.float32))
        # ``np.prod`` over a list of differently-shaped arrays relied on
        # implicit object-array creation, which modern NumPy rejects (ragged
        # input raises ValueError). Multiply explicitly instead; broadcasting
        # produces the identical (50, 50, 10) result.
        arr = component_arrays[0]
        for component in component_arrays[1:]:
            arr = arr * component
        arr = arr.astype(np.float32)

        # Column-creation API changed across hangar versions; fall through
        # the historical spellings until one succeeds.
        for backend, code in backend_code.items():
            try:
                co.arraysets.init_arrayset(backend,
                                           prototype=arr,
                                           backend_opts=code)
            except TypeError:
                try:
                    co.arraysets.init_arrayset(backend,
                                               prototype=arr,
                                               backend=code)
                except ValueError:
                    pass
            except ValueError:
                pass
            except AttributeError:
                co.add_ndarray_column(backend, prototype=arr, backend=code)

        try:
            col = co.columns
        except AttributeError:
            col = co.arraysets

        with col as asets_cm:
            for aset in asets_cm.values():
                changer = 0
                for i in range(num_samples):
                    arr[changer, changer, changer] += 1
                    aset[i] = arr
                # NOTE(review): ``changer`` resets to 0 for every column and
                # is only incremented after the inner loop, so it never
                # advances within a column's writes -- confirm intended.
                changer += 1
        co.commit('first commit')
        co.close()
        repo._env._close_environments()

    def setup(self, backend):
        """Open a read checkout and grab the column for *backend*."""
        self.repo = Repository(path=os.getcwd(), exists=True)
        self.co = self.repo.checkout(write=False)
        try:
            try:
                self.aset = self.co.columns[backend]
            except AttributeError:
                self.aset = self.co.arraysets[backend]
        except KeyError:
            # Backend unsupported by this hangar version; skip the benchmark.
            raise NotImplementedError

    def teardown(self, backend):
        """Close the checkout and repository environments."""
        self.co.close()
        self.repo._env._close_environments()

    def read(self, backend):
        """Sequentially read every sample from the chosen backend's column."""
        with self.aset as cm_aset:
            for i in cm_aset.keys():
                arr = cm_aset[i]
Exemplo n.º 27
0
class _WriterSuite:

    params = ['hdf5_00', 'hdf5_01', 'numpy_10']
    param_names = ['backend']
    processes = 2
    repeat = (2, 4, 30.0)
    # repeat == tuple (min_repeat, max_repeat, max_time)
    number = 2
    warmup_time = 0

    def setup(self, backend):
        """Create a temp repo and an ``'aset'`` column backed by *backend*.

        ``self.num_samples`` (and ``self.method``) are injected by the
        benchmark harness before this runs.
        """
        # self.method
        self.current_iter_number = 0
        self.backend_code = {
            'numpy_10': '10',
            'hdf5_00': '00',
            'hdf5_01': '01',
        }
        # self.num_samples

        self.sample_shape = (50, 50, 20)

        self.tmpdir = mkdtemp()
        self.repo = Repository(path=self.tmpdir, exists=False)
        self.repo.init('tester', '*****@*****.**', remove_old=True)
        self.co = self.repo.checkout(write=True)

        # One broadcastable hamming-window component per axis; multiplying
        # them together yields a full (50, 50, 20) sample array.
        component_arrays = []
        ndims = len(self.sample_shape)
        for idx, shape in enumerate(self.sample_shape):
            layout = [1 for i in range(ndims)]
            layout[idx] = shape
            component = np.hamming(shape).reshape(*layout) * 100
            component_arrays.append(component.astype(np.float32))
        # ``np.prod`` over a list of differently-shaped arrays relied on
        # implicit object-array creation, which modern NumPy rejects (ragged
        # input raises ValueError). Multiply explicitly; broadcasting gives
        # the identical result.
        self.arr = component_arrays[0]
        for component in component_arrays[1:]:
            self.arr = self.arr * component
        self.arr = self.arr.astype(np.float32)

        # Column-creation API changed across hangar versions; try each
        # historical spelling, skipping the benchmark when unsupported.
        try:
            self.aset = self.co.arraysets.init_arrayset(
                'aset',
                prototype=self.arr,
                backend_opts=self.backend_code[backend])
        except TypeError:
            try:
                self.aset = self.co.arraysets.init_arrayset(
                    'aset',
                    prototype=self.arr,
                    backend=self.backend_code[backend])
            except ValueError:
                raise NotImplementedError
        except ValueError:
            raise NotImplementedError
        except AttributeError:
            self.aset = self.co.add_ndarray_column(
                'aset', prototype=self.arr, backend=self.backend_code[backend])

    def teardown(self, backend):
        """Close the checkout and wipe the temporary repository."""
        self.co.close()
        self.repo._env._close_environments()
        rmtree(self.tmpdir)

    def write(self, backend):
        """Write ``num_samples`` samples, mutating one element per repeat so
        successive repeats do not write byte-identical data."""
        arr = self.arr
        iter_number = self.current_iter_number
        with self.aset as cm_aset:
            for i in range(self.num_samples):
                arr[iter_number, iter_number, iter_number] += 1
                cm_aset[i] = arr
        self.current_iter_number += 1
Exemplo n.º 28
0
#
# ## Initialize the Repo
# Notebook-exported script: the bare ``repo`` / ``co`` expressions below were
# interactive cell outputs in the original notebook and are no-ops as a script.

# In[2]:

repo = Repository('~/jjmachan/hangar_examples/mnist')
# NOTE(review): the path contains ``~`` -- confirm Repository expands the user
# home directory; otherwise a literal './~/...' directory tree is created.
repo.init(user_name='jjmachan', user_email='*****@*****.**', remove_old=True)
repo

# In[3]:

repo

# In[4]:

# open a write-enabled checkout of the staging area
co = repo.checkout(write=True)
co

# In[5]:

co

# ## Arraysets
# These are the structures that are used to store the data as numpy
# arrays. Hence only numeric data can be stored.

# In[6]:

co.arraysets

# In[7]:
Exemplo n.º 29
0
def test_initial_read_checkout(managed_tmpdir):
    """A read checkout on a freshly initialized repo (no commits) must fail."""
    repository = Repository(path=managed_tmpdir, exists=False)
    repository.init(user_name='tester', user_email='*****@*****.**', remove_old=True)
    with pytest.raises(ValueError):
        repository.checkout()
    repository._env._close_environments()
Exemplo n.º 30
0
def test_push_and_clone_master_linear_history_multiple_commits(
        server_instance, repo, managed_tmpdir, array5by7, nCommits, nSamples):
    """Push a linear master history, clone it, and verify every commit's
    samples exist in the clone only as remote references (reads raise
    FileNotFoundError until fetched).
    """
    from hangar import Repository
    from hangar.records.summarize import list_history

    cmtList = []
    co = repo.checkout(write=True)
    co.arraysets.init_arrayset(name='writtenaset',
                               shape=(5, 7),
                               dtype=np.float32)
    for cIdx in range(nCommits):
        if cIdx != 0:
            co = repo.checkout(write=True)
        sampList = []
        with co.arraysets['writtenaset'] as d:
            # drop all but one key carried over from the previous commit
            # before refilling with this commit's samples
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
            for sIdx in range(nSamples):
                arr = np.random.randn(*array5by7.shape).astype(
                    np.float32) * 100
                d[str(sIdx)] = arr
                sampList.append(arr)
        cmt = co.commit(f'commit number: {cIdx}')
        cmtList.append((cmt, sampList))
        co.close()
    masterHist = list_history(repo._env.refenv,
                              repo._env.branchenv,
                              branch_name='master')

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'

    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User',
                  '*****@*****.**',
                  server_instance,
                  remove_old=True)
    assert newRepo.list_branches() == ['master', 'origin/master']
    for cmt, sampList in cmtList:
        with pytest.warns(UserWarning):
            nco = newRepo.checkout(commit=cmt)
        assert len(nco.arraysets) == 1
        assert 'writtenaset' in nco.arraysets
        assert len(nco.arraysets['writtenaset']) == len(sampList)

        assert nco.arraysets['writtenaset'].contains_remote_references is True
        remoteKeys = nco.arraysets['writtenaset'].remote_reference_keys
        # NOTE(review): sibling tests in this collection compare against a
        # tuple; confirm remote_reference_keys returns a list in this API
        # version, otherwise this assertion can never pass.
        assert [str(idx) for idx in range(len(sampList))] == remoteKeys
        for idx, _ in enumerate(sampList):
            sIdx = str(idx)
            assert sIdx in nco.arraysets['writtenaset']
            # data was never fetched, so sample reads must fail locally
            with pytest.raises(FileNotFoundError):
                shouldNotExist = nco.arraysets['writtenaset'][sIdx]
        nco.close()
    cloneMasterHist = list_history(newRepo._env.refenv,
                                   newRepo._env.branchenv,
                                   branch_name='master')
    assert cloneMasterHist == masterHist
    newRepo._env._close_environments()