Exemplo n.º 1
0
def test_gitattributes(path):
    gr = GitRepo(path, create=True)
    # starts without any attributes file
    ok_(not op.exists(op.join(gr.path, '.gitattributes')))
    eq_(gr.get_gitattributes('.')['.'], {})
    # bool is a tag or unsets, anything else is key/value
    gr.set_gitattributes([('*', {'tag': True}), ('*', {'sec.key': 'val'})])
    ok_(op.exists(op.join(gr.path, '.gitattributes')))
    eq_(gr.get_gitattributes('.')['.'], {'tag': True, 'sec.key': 'val'})
    # unset by amending the record, but does not remove notion of the
    # tag entirely
    gr.set_gitattributes([('*', {'tag': False})])
    eq_(gr.get_gitattributes('.')['.'], {'tag': False, 'sec.key': 'val'})
    # attributes file is not added or commited, we can ignore such
    # attributes
    eq_(gr.get_gitattributes('.', index_only=True)['.'], {})

    # we can send absolute path patterns and write to any file, and
    # the patterns will be translated relative to the target file
    gr.set_gitattributes([(op.join(gr.path, 'relative', 'ikethemike/**'), {
        'bang': True
    })],
                         attrfile=op.join('relative', '.gitattributes'))
    # directory and file get created
    ok_(op.exists(op.join(gr.path, 'relative', '.gitattributes')))
    eq_(
        gr.get_gitattributes(
            op.join(gr.path, 'relative', 'ikethemike', 'probe')),
        # always comes out relative to the repo root, even if abs goes in
        {
            op.join('relative', 'ikethemike', 'probe'): {
                'tag': False,
                'sec.key': 'val',
                'bang': True
            }
        })
    if get_encoding_info()['default'] != 'ascii':
        # do not perform this on obscure systems without anything like UTF
        # it is not relevant whether a path actually exists, and paths
        # with spaces and other funky stuff are just fine
        funky = u'{} {}'.format(get_most_obscure_supported_name(),
                                get_most_obscure_supported_name())
        gr.set_gitattributes([(funky, {'this': 'that'})])
        eq_(
            gr.get_gitattributes(funky)[funky], {
                'this': 'that',
                'tag': False,
                'sec.key': 'val',
            })

    # mode='w' should replace the entire file:
    gr.set_gitattributes([('**', {'some': 'nonsense'})], mode='w')
    eq_(gr.get_gitattributes('.')['.'], {'some': 'nonsense'})
Exemplo n.º 2
0
def test_gitattributes(path):
    gr = GitRepo(path, create=True)
    # starts without any attributes file
    ok_(not op.exists(op.join(gr.path, '.gitattributes')))
    eq_(gr.get_gitattributes('.')['.'], {})
    # bool is a tag or unsets, anything else is key/value
    gr.set_gitattributes([('*', {'tag': True}), ('*', {'sec.key': 'val'})])
    ok_(op.exists(op.join(gr.path, '.gitattributes')))
    eq_(gr.get_gitattributes('.')['.'], {'tag': True, 'sec.key': 'val'})
    # unset by amending the record, but does not remove notion of the
    # tag entirely
    gr.set_gitattributes([('*', {'tag': False})])
    eq_(gr.get_gitattributes('.')['.'], {'tag': False, 'sec.key': 'val'})
    # attributes file is not added or commited, we can ignore such
    # attributes
    eq_(gr.get_gitattributes('.', index_only=True)['.'], {})

    # we can send absolute path patterns and write to any file, and
    # the patterns will be translated relative to the target file
    gr.set_gitattributes([
        (op.join(gr.path, 'relative', 'ikethemike/**'), {'bang': True})],
        attrfile=op.join('relative', '.gitattributes'))
    # directory and file get created
    ok_(op.exists(op.join(gr.path, 'relative', '.gitattributes')))
    eq_(gr.get_gitattributes(
        op.join(gr.path, 'relative', 'ikethemike', 'probe')),
        # always comes out relative to the repo root, even if abs goes in
        {op.join('relative', 'ikethemike', 'probe'):
            {'tag': False, 'sec.key': 'val', 'bang': True}})
    if get_encoding_info()['default'] != 'ascii':
        # do not perform this on obscure systems without anything like UTF
        # it is not relevant whether a path actually exists, and paths
        # with spaces and other funky stuff are just fine
        funky = u'{} {}'.format(
            get_most_obscure_supported_name(),
            get_most_obscure_supported_name())
        gr.set_gitattributes([(funky, {'this': 'that'})])
        eq_(gr.get_gitattributes(funky)[funky], {
            'this': 'that',
            'tag': False,
            'sec.key': 'val',
        })

    # mode='w' should replace the entire file:
    gr.set_gitattributes([('**', {'some': 'nonsense'})], mode='w')
    eq_(gr.get_gitattributes('.')['.'], {'some': 'nonsense'})
Exemplo n.º 3
0
    def __call__(
            path=None,
            initopts=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=_NoAnnexDefault,
            annex=True,
            fake_dates=False,
            cfg_proc=None
    ):
        # TODO: introduced with 0.13, remove with 0.14
        if no_annex is not _NoAnnexDefault:
            # the two mirror options do not agree and the deprecated one is
            # not at default value
            warnings.warn("datalad-create's `no_annex` option is deprecated "
                          "and will be removed in a future release, "
                          "use the reversed-sign `annex` option instead.",
                          DeprecationWarning)
            # honor the old option for now
            annex = not no_annex

        # we only perform negative tests below
        no_annex = not annex

        if dataset:
            if isinstance(dataset, Dataset):
                ds = dataset
            else:
                ds = Dataset(dataset)
            refds_path = ds.path
        else:
            ds = refds_path = None

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if (isinstance(initopts, (list, tuple)) and '--bare' in initopts) or (
                isinstance(initopts, dict) and 'bare' in initopts):
            raise ValueError(
                "Creation of bare repositories is not supported. Consider "
                "one of the create-sibling commands, or use "
                "Git to init a bare repository and push an existing dataset "
                "into it.")

        if path:
            path = resolve_path(path, dataset)

        path = path if path \
            else getpwd() if ds is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # assure cfg_proc is a list (relevant if used via Python API)
        cfg_proc = assure_list(cfg_proc)

        # prep for yield
        res = dict(action='create', path=str(path),
                   logger=lgr, type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != str(path):
            refds = require_dataset(
                refds_path, check_installed=True,
                purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        ds, str(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = get_dataset_root(
            op.normpath(op.join(str(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if (not pstatus.get(check_path, {}).get("type") == "dataset" and
                any(check_path == p or check_path in p.parents
                    for p in pstatus)):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        str(parentds_path),
                        [str(c) for c in conflict])})
                yield res
                return
            if not force:
                # another set of check to see whether the target path is pointing
                # into a known subdataset that is not around ATM
                subds_status = {
                    parentds_path / k.relative_to(prepo.path)
                    for k, v in pstatus.items()
                    if v.get('type', None) == 'dataset'}
                check_paths = [check_path]
                check_paths.extend(check_path.parents)
                if any(p in subds_status for p in check_paths):
                    conflict = [p for p in check_paths if p in subds_status]
                    res.update({
                        'status': 'error',
                        'message': (
                            'collision with %s (dataset) in dataset %s',
                            str(conflict[0]),
                            str(parentds_path))})
                    yield res
                    return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = ds if isinstance(ds, Dataset) and \
            ds.path == path else Dataset(str(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'})
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # Note for the code below:
        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Re-use tbrepo instance, do not use tbds.repo

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                git_opts=initopts,
                fake_dates=fake_dates)
            # place a .noannex file to indicate annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {
                'type': 'file',
                'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates
            )
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(
                cfg.obtain('datalad.repo.backend'),
                persistent=True, commit=False)
            add_to_git[tbrepo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'}
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (
                ('config', 'annex.largefiles', 'nothing'),
                ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
                ('metadata/objects/**', 'annex.largefiles',
                 '({})'.format(cfg.obtain(
                     'datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbrepo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(
                        op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbrepo.set_gitattributes(
                    set_attrs,
                    attrfile=op.join('.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbrepo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get(
                    'annex.largefiles', None) == 'nothing':
                tbrepo.set_gitattributes([
                    ('**/.git*', {'annex.largefiles': 'nothing'})])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbrepo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'}

        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Note, must not happen earlier (before if) since "smart" it would not be
        tbds_config = tbds.config

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note, that Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds_config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds_config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds_config.add(
            id_var,
            tbds_id if tbds_id is not None else uuid_id,
            where='dataset',
            reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequence ds.config.add() call)
        for k, v in tbds_config.overrides.items():
            tbds_config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> fll reload
        tbds_config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'}

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbrepo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and the is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        for cfg_proc_ in cfg_proc:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in refds.save(
                    path=tbds.path,
            ):
                yield r

        res.update({'status': 'ok'})
        yield res