Example #1
def test_migrations_no_commit(isolated_runner, old_project):
    """Check --no-commit flag doesn't commit changes."""
    client = LocalClient(path=old_project['path'])
    sha_before = client.repo.head.object.hexsha

    result = isolated_runner.invoke(cli, ['migrate', '--no-commit'])
    assert 0 == result.exit_code
    assert 'OK' in result.output
    assert sha_before == client.repo.head.object.hexsha
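
These snippets are shown without their module-level imports. A minimal sketch of what they assume (a guess based on the renku-python layout of this era; exact module paths vary between versions):

from pathlib import Path

from renku import LocalClient  # the client class exercised throughout
from renku.cli import cli      # Click entry point passed to the CLI runner

Other names such as Dataset, RENKU_HOME, url_to_string and create_dataset presumably come from renku's model, config and test-helper modules of the same era.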
Example #2
def test_migrate_project(isolated_runner, old_project):
    """Test migrate on old repository."""
    result = isolated_runner.invoke(cli, ["migrate"])
    assert 0 == result.exit_code
    assert not old_project.is_dirty()

    client = LocalClient(path=old_project.working_dir)
    assert client.project
    assert client.project.name
Example #3
def test_migrations_no_commit(isolated_runner, old_project):
    """Check --no-commit flag doesn't commit changes."""
    client = LocalClient(path=old_project.working_dir)
    sha_before = client.repo.head.object.hexsha

    result = isolated_runner.invoke(cli, ["migrate", "--no-commit"])
    assert 0 == result.exit_code
    assert "OK" in result.output
    assert sha_before == client.repo.head.object.hexsha
Example #4
def test_migration_broken_urls(dataset_metadata):
    """Check that migration of broken dataset file URLs is string."""
    dataset = Dataset.from_jsonld(
        dataset_metadata,
        client=LocalClient('.'),
    )

    for file_ in dataset.files:
        assert isinstance(url_to_string(file_.url), str)
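
This test presumably guards against old metadata where a file URL was serialized as a JSON-LD structure rather than a plain string. A purely hypothetical sketch of such a normalizer, to illustrate the idea (not renku's actual url_to_string):

def url_to_string(url):
    """Coerce a URL that may be stored as JSON-LD parts into a plain string."""
    if isinstance(url, dict):
        url = url.get('@id', '')  # hypothetical: a JSON-LD node reference
    elif isinstance(url, list):
        url = url[0] if url else ''  # hypothetical: a single-element wrapper
    return str(url)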
Example #5
def test_author_to_creator_migration(isolated_runner, old_project):
    """Check renaming of author to creator migration."""
    result = isolated_runner.invoke(cli, ["migrate"])
    assert 0 == result.exit_code

    client = LocalClient(path=old_project.working_dir)
    for dataset in client.datasets.values():
        after_metadata = (Path(dataset.path) / client.METADATA).read_text()
        assert "creator:" in after_metadata
        assert "authors:" not in after_metadata
Example #6
def test_correct_relative_path(isolated_runner, old_project):
    """Check if path on dataset has been correctly migrated."""
    result = isolated_runner.invoke(cli, ["migrate"])
    assert 0 == result.exit_code

    client = LocalClient(path=old_project.working_dir)
    assert client.datasets

    for ds in client.datasets.values():
        assert not Path(ds.path).is_absolute()
        assert ds.path.startswith(RENKU_HOME)
Example #7
def test_latest_version(project):
    """Test returning the latest version of `SoftwareAgent`."""
    from renku import __version__

    create_dataset(
        "ds1",
        title="",
        description="",
        creators=[],
    )

    agent_version = LocalClient(project).latest_agent
    assert __version__ == agent_version
Example #8
def test_author_to_creator_migration(isolated_runner, old_project):
    """Check renaming of author to creator migration."""
    client = LocalClient(path=old_project['path'])
    if client.datasets:
        dataset = client.datasets.popitem()[1]
        dataset_path_pre40 = Path(dataset.path.replace('-', ''))
        if dataset_path_pre40.exists():
            metadata = (dataset_path_pre40 / client.METADATA).read_text()

            assert 'authors:' in metadata
            result = isolated_runner.invoke(cli, ['migrate', 'datasets'])
            assert 0 == result.exit_code

            after_metadata = (Path(dataset.path) / client.METADATA).read_text()
            assert 'creator:' in after_metadata
            assert 'authors:' not in after_metadata
Example #9
def test_correct_path_migrated(isolated_runner, old_project):
    """Check if path on dataset files has been correctly migrated."""
    result = isolated_runner.invoke(cli, ["migrate"])
    assert 0 == result.exit_code

    client = LocalClient(path=old_project.working_dir)
    assert client.datasets

    for ds in client.datasets.values():
        for file_ in ds.files:
            path_ = Path(file_.path)
            assert path_.exists()
            assert not path_.is_absolute()
            assert file_._label
            assert file_._id
            assert file_.path in file_._label
            assert file_.path in file_._id
Example #10
def test_latest_version_user_commits(project):
    """Test retrieval of `SoftwareAgent` with latest non-renku command."""
    from git import Repo

    from renku import __version__

    create_dataset(
        "ds1",
        title="",
        description="",
        creators=[],
    )

    myfile = Path("myfile")
    myfile.write_text("123")

    repo = Repo(project)
    repo.index.add([str(myfile)])
    repo.index.commit("added myfile")

    agent_version = LocalClient(project).latest_agent
    assert __version__ == agent_version
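
Read together with Example #7, this pins down the behaviour of `latest_agent`: it reports the renku version recorded by the most recent renku-made commit, so a later commit made directly by the user (as here) does not change the result.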
Example #11
    def update_dataset_files(self, files, ref, delete=False):
        """Update files and dataset metadata according to their remotes.

        :param files: List of files to be updated
        :param ref: Git reference (branch, tag, or commit) to fetch from the remote repositories
        :param delete: Indicates whether to delete local files whose remote counterpart was removed

        :return: List of files that should be deleted
        """
        from renku import LocalClient

        visited_repos = {}
        updated_files = []
        deleted_files = []

        for file_ in files:
            if not file_.based_on:
                continue

            file_.based_on = DatasetFile.from_jsonld(file_.based_on)
            based_on = file_.based_on
            url = based_on.url
            if url in visited_repos:
                repo, repo_path, remote_client = visited_repos[url]
            else:
                repo, repo_path = self.prepare_git_repo(url, ref)
                remote_client = LocalClient(repo_path)
                visited_repos[url] = repo, repo_path, remote_client

            remote_file = self._fetch_file_metadata(remote_client,
                                                    based_on.path)

            if not remote_file:
                try:
                    remote_file = DatasetFile.from_revision(
                        remote_client,
                        path=based_on.path,
                        url=url,
                        added=based_on.added)
                except KeyError:
                    raise errors.ParameterError(
                        'Cannot find file {} in the repo {}'.format(
                            based_on.path, url))

            commit_sha = self._get_commit_sha_from_label(based_on)
            remote_commit_sha = self._get_commit_sha_from_label(remote_file)
            if commit_sha != remote_commit_sha:
                src = Path(repo.working_dir) / based_on.path
                dst = self.renku_path.parent / file_.path

                if src.exists():
                    # Fetch the file if it is tracked by Git LFS
                    self._fetch_lfs_files(repo_path, {based_on.path})
                    if remote_client._is_external_file(src):
                        self.remove_file(dst)
                        self._create_external_file(src.resolve(), dst)
                    else:
                        shutil.copy(src, dst)
                    file_.based_on.commit = remote_file.commit
                    file_.based_on._label = remote_file._label
                    updated_files.append(file_)
                else:
                    # File was removed or renamed
                    if delete:
                        self.remove_file(dst)
                    deleted_files.append(file_)

        if not updated_files and (not delete or not deleted_files):
            # Nothing to commit or update
            return deleted_files

        # Commit changes in files

        file_paths = {str(f.path) for f in updated_files + deleted_files}
        # Force-add to include possibly ignored files that are in datasets
        self.repo.git.add(*file_paths, force=True)
        self.repo.index.commit(
            'renku dataset: updated {} files and deleted {} files'.format(
                len(updated_files), len(deleted_files)))

        # Update datasets' metadata

        modified_datasets = {}

        for file_ in updated_files:
            # Re-create list of creators
            creators = []
            # grab all the creators from the commit history
            for commit in repo.iter_commits(paths=file_.path):
                creator = Person.from_commit(commit)
                if creator not in creators:
                    creators.append(creator)

            new_file = DatasetFile.from_revision(self,
                                                 path=file_.path,
                                                 based_on=file_.based_on,
                                                 creator=creators)
            file_.dataset.update_files([new_file])
            modified_datasets[file_.dataset.name] = file_.dataset

        if delete:
            for file_ in deleted_files:
                file_.dataset.unlink_file(file_.path)
                modified_datasets[file_.dataset.name] = file_.dataset

        for dataset in modified_datasets.values():
            dataset.to_yaml()

        return deleted_files
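
A minimal usage sketch for this method, assuming `client` is a LocalClient on a dataset project, `dataset` is one of its datasets, and that `ref=None` falls through to the remotes' default branches (an assumption about prepare_git_repo):

deleted = client.update_dataset_files(
    files=list(dataset.files), ref=None, delete=True)
for file_ in deleted:
    print('Removed from dataset:', file_.path)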
Example #12
    def _add_from_git(self, dataset, url, sources, destination, ref):
        """Process adding resources from another git repository."""
        from renku import LocalClient

        u = parse.urlparse(url)

        sources = self._resolve_paths(u.path, sources)

        # Get all files from repo that match sources
        repo, repo_path = self.prepare_git_repo(url, ref)
        files = set()
        used_sources = set()
        for file in repo.head.commit.tree.traverse():
            path = file.path
            result = self._get_src_and_dst(path, repo_path, sources,
                                           destination, used_sources)

            if result:
                files.add(result)

        unused_sources = set(sources.keys()) - used_sources
        if unused_sources:
            unused_sources = {str(s) for s in unused_sources}
            raise errors.ParameterError('No such file or directory',
                                        param_hint=unused_sources)

        if destination.exists() and not destination.is_dir():
            if len(files) > 1:
                raise errors.ParameterError(
                    'Cannot copy multiple files or directories to a file')

        # Create metadata and move files to dataset
        results = []
        remote_client = LocalClient(repo_path)

        # Pull files from LFS
        paths = set()
        for path, src, _ in files:
            if src.is_dir():
                continue
            if src.is_symlink():
                try:
                    path = str(src.resolve().relative_to(repo_path))
                except ValueError:  # External file
                    pass
            paths.add(path)
        self._fetch_lfs_files(repo_path, paths)

        # Fetch metadata from Renku if any
        paths = {f[0] for f in files}
        metadata = self._fetch_files_metadata(remote_client, paths)

        for path, src, dst in files:
            if not src.is_dir():
                # Use original metadata if it exists
                based_on = metadata.get(path)
                if based_on:
                    based_on.url = url
                    based_on.based_on = None
                    creators = based_on.creator
                else:
                    creators = []
                    # grab all the creators from the commit history
                    for commit in repo.iter_commits(paths=path):
                        creator = Person.from_commit(commit)
                        if creator not in creators:
                            creators.append(creator)

                    based_on = DatasetFile.from_revision(remote_client,
                                                         path=path,
                                                         url=url,
                                                         creator=creators)

                path_in_dst_repo = dst.relative_to(self.path)

                if remote_client._is_external_file(src):
                    operation = (src.resolve(), dst, 'symlink')
                else:
                    operation = (src, dst, 'copy')

                results.append({
                    'path': path_in_dst_repo,
                    'url': remove_credentials(url),
                    'creator': creators,
                    'parent': self,
                    'based_on': based_on,
                    'operation': operation
                })

        return results
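
Note that this variant defers file materialization: each result carries an 'operation' tuple (src, dst, action) for the caller to apply. A sketch of what that application might look like (an assumption; the real caller lives elsewhere in the library):

import os
import shutil

for entry in results:
    src, dst, action = entry['operation']
    dst.parent.mkdir(parents=True, exist_ok=True)
    if action == 'symlink':
        os.symlink(src, dst)  # assumption: external files are linked, not copied
    else:
        shutil.copy(src, dst)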
Example #13
def test_comprehensive_dataset_migration(isolated_runner, old_dataset_project):
    """Test migration of old project with all dataset variations."""
    result = isolated_runner.invoke(cli, ["migrate"])
    assert 0 == result.exit_code
    assert "OK" in result.output

    client = LocalClient(path=old_dataset_project.working_dir)

    dataset = client.load_dataset("dataverse")
    assert dataset._id.endswith(
        "/datasets/1d2ed1e4-3aeb-4f25-90b2-38084ee3d86c")
    assert "1d2ed1e4-3aeb-4f25-90b2-38084ee3d86c" == dataset.identifier
    assert "1d2ed1e4-3aeb-4f25-90b2-38084ee3d86c" == dataset._label
    assert "Cornell University" == dataset.creators[0].affiliation
    assert "Rooth, Mats" == dataset.creators[0].name
    assert "Rooth, Mats" == dataset.creators[0].label
    assert dataset.date_published is None
    assert "2020-08-10T21:35:05.115412+00:00" == dataset.date_created.isoformat(
        "T")
    assert "Replication material for a paper to be presented" in dataset.description
    assert "https://doi.org/10.7910/DVN/EV6KLF" == dataset.same_as.url
    assert "1" == dataset.tags[0].name
    assert "Tag 1 created by renku import" == dataset.tags[0].description
    assert isinstance(dataset.license, dict)
    assert "https://creativecommons.org/publicdomain/zero/1.0/" in str(
        dataset.license)

    file_ = dataset.find_file("data/dataverse/copy.sh")
    assert "https://dataverse.harvard.edu/api/access/datafile/3050656" == file_.source
    assert file_.url.endswith(
        "/projects/mohammad.alisafaee/old-datasets-v0.9.1/files/blob/data/dataverse/copy.sh"
    )
    assert "2020-08-10T21:35:10.877832+00:00" == file_.added.isoformat("T")
    assert file_.based_on is None
    assert not hasattr(file_, "creators")

    dataset = client.load_dataset("mixed")
    assert "v1" == dataset.tags[0].name

    file_ = dataset.find_file("data/mixed/Makefile")
    assert file_._id.endswith(
        "/blob/a5f6c3700616e005ac599d24feb7a770430bd93a/data/mixed/Makefile")
    assert "https://github.com/SwissDataScienceCenter/renku-jupyter.git" == file_.source
    assert file_.source == file_.based_on.source
    assert file_.source == file_.based_on.url
    assert "Makefile@49f331d7388785208ccfb3cfb9156b226d9b59ea" == file_.based_on._label
    assert file_.based_on.based_on is None
    assert file_.url.endswith(
        "/projects/mohammad.alisafaee/old-datasets-v0.9.1/files/blob/data/mixed/Makefile"
    )

    file_ = dataset.find_file("data/mixed/data.txt")
    assert file_._id.endswith(
        "/blob/b32138c1bcb2b53da974bbeb842f4d621e155355/data/mixed/data.txt")
    assert "../../../../tmp/data.txt" == file_.source
    assert file_.based_on is None
    assert file_.url.endswith(
        "/projects/mohammad.alisafaee/old-datasets-v0.9.1/files/blob/data/mixed/data.txt"
    )

    file_ = dataset.find_file("README.md")
    assert file_._id.endswith(
        "/blob/0bfb07be3b538e6683e1d2055b5ae4d3a4c593dd/README.md")
    assert "README.md" == file_.source
    assert file_.based_on is None
    assert file_.url.endswith(
        "/projects/mohammad.alisafaee/old-datasets-v0.9.1/files/blob/README.md"
    )
Example #14
    def _add_from_git(self, dataset, url, sources, destination, ref):
        """Process adding resources from another git repository."""
        from renku import LocalClient

        u = parse.urlparse(url)

        sources = self._resolve_paths(u.path, sources)

        # Get all files from repo that match sources
        repo, repo_path = self._prepare_git_repo(url, ref)
        copied_sources = set()
        files = set()
        for file in repo.head.commit.tree.traverse():
            path = file.path
            result = self._get_src_and_dst(path, repo_path, sources,
                                           destination)

            if result:
                files.add(result)
                source = result[3]
                copied_sources.add(source)

        uncopied_sources = sources - copied_sources
        if uncopied_sources:
            uncopied_sources = {str(s) for s in uncopied_sources}
            raise errors.ParameterError('No such file or directory',
                                        param_hint=uncopied_sources)

        # Create metadata and move files to dataset
        results = []
        remote_client = LocalClient(repo_path)

        # Pull files from LFS
        paths = set()
        for path, src, _, __ in files:
            if src.is_dir():
                continue
            if src.is_symlink():
                path = str(src.resolve().relative_to(repo_path))
            paths.add(path)
        self._fetch_lfs_files(repo_path, paths)

        # Fetch metadata from Renku if any
        paths = {f[0] for f in files}
        metadata = self._fetch_files_metadata(remote_client, paths)

        for path, src, dst, _ in files:
            if not src.is_dir():
                # Use original metadata if it exists
                based_on = metadata.get(path)
                if based_on:
                    based_on.url = url
                    based_on.based_on = None
                    creators = based_on.creator
                else:
                    creators = []
                    # grab all the creators from the commit history
                    for commit in repo.iter_commits(paths=path):
                        creator = Person.from_commit(commit)
                        if creator not in creators:
                            creators.append(creator)

                    based_on = DatasetFile.from_revision(remote_client,
                                                         path=path,
                                                         url=url)

                path_in_dst_repo = dst.relative_to(self.path)

                results.append({
                    'path': path_in_dst_repo,
                    'url': remove_credentials(url),
                    'creator': creators,
                    'parent': self,
                    'based_on': based_on
                })

                dst.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy(str(src), str(dst))

        return results
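
Unlike Example #12, this older variant materializes files inline with shutil.copy while building the metadata, instead of returning an 'operation' tuple for the caller to apply, and it does not yet handle external files (there is no ValueError branch around the symlink resolution).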
Example #15
    def _add_from_git(self, dataset, url, sources, destination, ref):
        """Process adding resources from another git repository."""
        from renku import LocalClient

        u = parse.urlparse(url)

        sources = self._resolve_paths(u.path, sources)

        # Get all files from repo that match sources
        repo, repo_path = self.prepare_git_repo(url, ref)
        files = set()
        used_sources = set()
        for file in repo.head.commit.tree.traverse():
            path = file.path
            result = self._get_src_and_dst(path, repo_path, sources, destination, used_sources)

            if result:
                files.add(result)

        unused_sources = set(sources.keys()) - used_sources
        if unused_sources:
            unused_sources = {str(s) for s in unused_sources}
            raise errors.ParameterError("No such file or directory", param_hint=unused_sources)

        # Create metadata and move files to dataset
        results = []
        remote_client = LocalClient(repo_path)

        # Pull files from LFS
        paths = set()
        for path, src, _ in files:
            if src.is_dir():
                continue
            if src.is_symlink():
                try:
                    path = str(src.resolve().relative_to(repo_path))
                except ValueError:  # External file
                    pass
            paths.add(path)
        self._fetch_lfs_files(repo_path, paths)

        # Fetch metadata from Renku if any
        paths = {f[0] for f in files}
        metadata = self._fetch_files_metadata(remote_client, paths)

        new_files = []

        for path, src, dst in files:
            if not src.is_dir():
                # Use original metadata if it exists
                based_on = metadata.get(path)
                if based_on:
                    based_on.url = url
                    based_on.based_on = None
                    based_on.source = url
                else:
                    based_on = DatasetFile.from_revision(remote_client, path=src, url=url, source=url)

                path_in_dst_repo = dst.relative_to(self.path)

                if path_in_dst_repo in new_files:  # A path with the same destination is already copied
                    continue

                new_files.append(path_in_dst_repo)

                if remote_client._is_external_file(src):
                    operation = (src.resolve(), dst, "symlink")
                else:
                    operation = (src, dst, "copy")

                results.append(
                    {
                        "path": path_in_dst_repo,
                        "source": remove_credentials(url),
                        "parent": self,
                        "based_on": based_on,
                        "operation": operation,
                    }
                )

        return results
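
Compared with the earlier variants, this version de-duplicates destinations (a second source mapping to the same path_in_dst_repo is skipped), records the origin under 'source' instead of 'url', and no longer collects per-file creators from the commit history.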