Example #1
    def _put_dataset(self, path: str, username: str, owner: str) -> Dataset:
        # Validate that the given path contains a dataset
        _ = self.load_dataset_from_directory(path)

        p = os.path.join(self.inventory_root, username, owner, 'datasets')
        dir_name = os.path.basename(path)
        os.makedirs(p, exist_ok=True)

        if os.path.exists(os.path.join(p, dir_name)):
            raise InventoryException(
                f"Dataset directory {dir_name} already exists")

        final_path = shutil.move(path, p)
        assert os.path.basename(
            final_path) != 'datasets', "shutil.move used incorrectly"

        ds = self.load_dataset_from_directory(final_path)

        # link dataset objects
        ds.namespace = owner
        m = Manifest(ds, logged_in_username=username)
        m.link_revision()

        return ds
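This private helper is normally reached through InventoryManager's public API. A minimal usage sketch, assuming a configured working directory, a hypothetical /tmp/imported-dataset directory that already contains a valid Dataset, and a public put_dataset wrapper that delegates to this helper:

    # Hypothetical usage sketch (paths and the put_dataset wrapper are assumptions)
    from gtmcore.inventory.inventory import InventoryManager

    im = InventoryManager()
    ds = im.put_dataset('/tmp/imported-dataset', username='default', owner='default')
    # The directory is now under <working_dir>/default/default/datasets/
    print(ds.name, ds.namespace)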
Example #2
    def put_labbook(self, path: str, username: str, owner: str) -> LabBook:
        """ Take given path to a candidate labbook and insert it
        into its proper place in the file system.

        Args:
            path: Path to a given labbook
            username: Active username
            owner: Intended owner of labbook

        Returns:
            LabBook
        """
        try:
            lb = self._put_labbook(path, username, owner)

            # Init dataset submodules if present
            if len(lb.git.repo.submodules) > 0:

                # Link datasets
                for submodule in lb.git.list_submodules():
                    try:

                        namespace, dataset_name = submodule['name'].split("&")
                        rel_submodule_dir = os.path.join(
                            '.gigantum', 'datasets', namespace, dataset_name)
                        submodule_dir = os.path.join(lb.root_dir,
                                                     rel_submodule_dir)
                        call_subprocess(
                            ['git', 'submodule', 'init', rel_submodule_dir],
                            cwd=lb.root_dir,
                            check=True)
                        call_subprocess(
                            ['git', 'submodule', 'update', rel_submodule_dir],
                            cwd=lb.root_dir,
                            check=True)

                        ds = InventoryManager().load_dataset_from_directory(
                            submodule_dir)
                        ds.namespace = namespace
                        manifest = Manifest(ds, username)
                        manifest.link_revision()

                    except Exception:
                        logger.exception(
                            f"Failed to import submodule: {submodule['name']}")
                        continue

            return lb
        except Exception as e:
            logger.error(e)
            raise InventoryException(e)
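A minimal sketch of driving put_labbook directly, assuming a candidate LabBook has already been unpacked to a hypothetical /tmp/candidate-labbook directory:

    # Hypothetical usage sketch (the candidate path is an assumption)
    from gtmcore.inventory.inventory import InventoryManager

    im = InventoryManager()
    lb = im.put_labbook('/tmp/candidate-labbook', username='default', owner='default')
    # Any datasets recorded as git submodules have now been initialized and relinked
    print(lb.name, lb.root_dir)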
Example #3
    def create_dataset(self,
                       username: str,
                       owner: str,
                       dataset_name: str,
                       storage_type: str,
                       description: Optional[str] = None,
                       author: Optional[GitAuthor] = None) -> Dataset:
        """Create a new Dataset in this Gigantum working directory.

        Args:
            username: Active username
            owner: Namespace in which to place this Dataset
            dataset_name: Name of the Dataset
            storage_type: String identifying the type of Dataset to instantiate
            description: Optional brief description of Dataset
            author: Optional Git Author

        Returns:
            Newly created Dataset instance

        """
        dataset = Dataset(config_file=self.config_file,
                          author=author,
                          namespace=owner)

        if storage_type not in SUPPORTED_STORAGE_BACKENDS:
            raise ValueError(
                f"Unsupported Dataset storage type: {storage_type}")

        try:
            build_info = Configuration(self.config_file).config['build_info']
        except KeyError:
            logger.warning("Could not obtain build_info from config")
            build_info = None

        # Build data file contents
        dataset._data = {
            "schema": DATASET_CURRENT_SCHEMA,
            "id": uuid.uuid4().hex,
            "name": dataset_name,
            "storage_type": storage_type,
            "description": description or '',
            "created_on": datetime.datetime.utcnow().isoformat(),
            "build_info": build_info
        }
        dataset._validate_gigantum_data()

        logger.info("Creating new Dataset on disk for {}/{}/{}".format(
            username, owner, dataset_name))
        # lock while creating initial directory
        with dataset.lock(
                lock_key=f"new_dataset_lock|{username}|{owner}|{dataset_name}"
        ):
            # Verify or Create user subdirectory
            # Make sure you expand a user dir string
            starting_dir = os.path.expanduser(
                dataset.client_config.config["git"]["working_directory"])
            user_dir = os.path.join(starting_dir, username)
            if not os.path.isdir(user_dir):
                os.makedirs(user_dir)

            # Create owner dir - store Datasets in working dir > logged in user > owner
            owner_dir = os.path.join(user_dir, owner)
            if not os.path.isdir(owner_dir):
                os.makedirs(owner_dir)

            # Datasets live in a `datasets` subdir of the owner dir
            owner_dir = os.path.join(owner_dir, "datasets")

            # Verify name not already in use
            if os.path.isdir(os.path.join(owner_dir, dataset_name)):
                raise ValueError(
                    f"Dataset `{dataset_name}` already exists locally. Choose a new Dataset name"
                )

            # Create Dataset subdirectory
            new_root_dir = os.path.join(owner_dir, dataset_name)
            os.makedirs(new_root_dir)
            dataset._set_root_dir(new_root_dir)

            # Init repository
            dataset.git.initialize()

            # Create Directory Structure
            dirs = [
                'manifest', 'metadata', '.gigantum',
                os.path.join('.gigantum', 'favorites'),
                os.path.join('.gigantum', 'activity'),
                os.path.join('.gigantum', 'activity', 'log')
            ]

            for d in dirs:
                p = os.path.join(dataset.root_dir, d, '.gitkeep')
                os.makedirs(os.path.dirname(p), exist_ok=True)
                with open(p, 'w') as gk:
                    gk.write(
                        "This file is necessary to keep this directory tracked by Git"
                        " and archivable by compression tools. Do not delete or modify!"
                    )

            dataset._save_gigantum_data()

            # Create an empty storage.json file
            dataset.backend_config = {}

            # Create .gitignore default file
            shutil.copyfile(
                os.path.join(resource_filename('gtmcore', 'dataset'),
                             'gitignore.default'),
                os.path.join(dataset.root_dir, ".gitignore"))

            # Commit
            dataset.git.add_all()

            # NOTE: this string is used to indicate there are no more activity records to get. Changing the string will
            # break activity paging.
            # TODO: Improve method for detecting the first activity record
            dataset.git.commit(f"Creating new empty Dataset: {dataset_name}")

            # Create Activity Record
            adr = ActivityDetailRecord(ActivityDetailType.DATASET,
                                       show=False,
                                       importance=0)
            adr.add_value('text/plain',
                          f"Created new Dataset: {username}/{dataset_name}")
            ar = ActivityRecord(
                ActivityType.DATASET,
                message=f"Created new Dataset: {username}/{dataset_name}",
                show=True,
                importance=255,
                linked_commit=dataset.git.commit_hash)
            ar.add_detail_object(adr)
            store = ActivityStore(dataset)
            store.create_activity_record(ar)

            # Initialize file cache and link revision
            m = Manifest(dataset, username)
            m.link_revision()

            return dataset
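A minimal sketch of creating a Dataset with this method and loading it back, assuming a client config file path and the gigantum_object_v1 storage backend used by the tests below:

    # Hypothetical usage sketch (the config file path is an assumption)
    from gtmcore.inventory.inventory import InventoryManager

    im = InventoryManager('/path/to/labmanager.yaml')
    ds = im.create_dataset('default', 'default', 'my-dataset',
                           storage_type='gigantum_object_v1',
                           description='example dataset')
    ds_again = im.load_dataset('default', 'default', 'my-dataset')
    assert ds.root_dir == ds_again.root_dir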
Example #4
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               labbook_owner,
                               labbook_name,
                               dataset_owner,
                               dataset_name,
                               action,
                               dataset_url=None,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()
        im = InventoryManager()
        lb = im.load_labbook(logged_in_username,
                             labbook_owner,
                             labbook_name,
                             author=get_logged_in_author())

        with lb.lock():
            if action == 'link':
                if dataset_url:
                    remote_domain = cls._get_remote_domain(
                        dataset_url, dataset_owner, dataset_name)

                    if remote_domain:
                        # Make sure git creds are configured for the remote
                        admin_service = None
                        for remote in lb.client_config.config['git'][
                                'remotes']:
                            if remote_domain == remote:
                                admin_service = lb.client_config.config['git'][
                                    'remotes'][remote]['admin_service']
                                break
                        if "HTTP_AUTHORIZATION" in info.context.headers.environ:
                            token = parse_token(info.context.headers.
                                                environ["HTTP_AUTHORIZATION"])
                        else:
                            raise ValueError(
                                "Authorization header not provided."
                                " Must have a valid session to query for collaborators"
                            )
                        mgr = GitLabManager(remote_domain, admin_service,
                                            token)
                        mgr.configure_git_credentials(remote_domain,
                                                      logged_in_username)
                else:
                    # Link to local dataset
                    ds = im.load_dataset(logged_in_username, dataset_owner,
                                         dataset_name)
                    dataset_url = f"{ds.root_dir}/.git"

                # Link the dataset to the labbook
                ds = im.link_dataset_to_labbook(dataset_url, dataset_owner,
                                                dataset_name, lb)
                ds.namespace = dataset_owner

                # Preload the dataloader
                info.context.dataset_loader.prime(
                    f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}",
                    ds)

                # Relink the revision
                m = Manifest(ds, logged_in_username)
                m.link_revision()
            elif action == 'unlink':
                im.unlink_dataset_from_labbook(dataset_owner, dataset_name, lb)
            elif action == 'update':
                ds = im.update_linked_dataset_reference(
                    dataset_owner, dataset_name, lb)
                m = Manifest(ds, logged_in_username)
                m.force_reload()

                info.context.dataset_loader.prime(
                    f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}",
                    ds)
            else:
                raise ValueError(
                    "Unsupported action. Use `link`, `unlink`, or `update`")

            info.context.labbook_loader.prime(
                f"{get_logged_in_username()}&{labbook_owner}&{labbook_name}",
                lb)
            edge = LabbookConnection.Edge(node=Labbook(owner=labbook_owner,
                                                       name=labbook_name),
                                          cursor=base64.b64encode(
                                              f"{0}".encode('utf-8')))

        return ModifyDatasetLink(new_labbook_edge=edge)
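A minimal sketch of a GraphQL document that could exercise this resolver, assuming the mutation is exposed as modifyDatasetLink and that the input field names follow the camelCased argument names above:

    # Hypothetical usage sketch; the mutation and field names are inferred, not confirmed
    query = """
        mutation myMutation {
          modifyDatasetLink(input: {labbookOwner: "default", labbookName: "my-project",
                                    datasetOwner: "default", datasetName: "my-dataset",
                                    action: "link"}) {
              newLabbookEdge {
                 node {
                   name
                 }
              }
          }
        }
        """
    # result = client.execute(query)  # e.g. via a graphene test client fixture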
Example #5
    def test_make_directory(self, fixture_working_dir, snapshot):
        im = InventoryManager(fixture_working_dir[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset-dir",
                               storage_type="gigantum_object_v1",
                               description="testing move")
        m = Manifest(ds, 'default')
        m.link_revision()

        query = """
                   mutation myMutation {
                     makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir", 
                                             key: "test_dir1/"}) {
                         newDatasetFileEdge {
                            node {
                              id
                              key
                              isDir
                              isLocal
                              size
                            }
                         }
                     }
                   }
                   """
        result = fixture_working_dir[2].execute(query)
        assert 'errors' not in result
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['key'] == 'test_dir1/'
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['isDir'] is True
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['isLocal'] is True
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['size'] == '0'

        assert os.path.isdir(
            os.path.join(m.cache_mgr.current_revision_dir,
                         "test_dir1")) is True

        query = """
                   mutation myMutation {
                     makeDatasetDirectory(input: {datasetOwner: "default", datasetName: "dataset-dir", 
                                             key: "test_dir1/test_dir2/"}) {
                         newDatasetFileEdge {
                            node {
                              id
                              key
                              isDir
                              isLocal
                              size
                            }
                         }
                     }
                   }
                   """
        result = fixture_working_dir[2].execute(query)
        assert 'errors' not in result
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['key'] == 'test_dir1/test_dir2/'
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['isDir'] is True
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['isLocal'] is True
        assert result['data']['makeDatasetDirectory']['newDatasetFileEdge'][
            'node']['size'] == '0'

        assert os.path.isdir(
            os.path.join(m.cache_mgr.current_revision_dir,
                         "test_dir1")) is True
        assert os.path.isdir(
            os.path.join(m.cache_mgr.current_revision_dir, "test_dir1",
                         "test_dir2")) is True
Example #6
    def test_pull_objects(self, mock_config_file, mock_dataset_head):
        im = InventoryManager(mock_config_file[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")
        m = Manifest(ds, 'default')
        iom = IOManager(ds, m)

        os.makedirs(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         "other_dir"))
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfadfsdf")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test2.txt", "fdsfgfd")
        m.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path
        obj2_target = obj_to_push[1].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
        obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True
        helper_compress_file(obj1_target, obj1_source)
        helper_compress_file(obj2_target, obj2_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is False
        assert os.path.isfile(obj1_source) is True
        assert os.path.isfile(obj2_source) is True

        # Clear out from linked dir
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt'))
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test2.txt'))

        with patch.object(Configuration, 'find_default_config',
                          lambda self: mock_config_file[0]):
            with aioresponses() as mocked_responses:
                mocked_responses.get(
                    f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                    payload={
                        "presigned_url":
                        f"https://dummyurl.com/{obj_id_1}?params=1",
                        "namespace": ds.namespace,
                        "obj_id": obj_id_1,
                        "dataset": ds.name
                    },
                    status=200)

                with open(obj1_source, 'rb') as data1:
                    mocked_responses.get(
                        f"https://dummyurl.com/{obj_id_1}?params=1",
                        body=data1.read(),
                        status=200,
                        content_type='application/octet-stream')

                mocked_responses.get(
                    f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                    payload={
                        "presigned_url":
                        f"https://dummyurl.com/{obj_id_2}?params=1",
                        "namespace": ds.namespace,
                        "obj_id": obj_id_2,
                        "dataset": ds.name
                    },
                    status=200)

                with open(obj2_source, 'rb') as data2:
                    mocked_responses.get(
                        f"https://dummyurl.com/{obj_id_2}?params=1",
                        body=data2.read(),
                        status=200,
                        content_type='application/octet-stream')

                dl_kwargs = {
                    'logged_in_username': "******",
                    'access_token': "asdf",
                    'id_token': "1234",
                    'dataset_owner': "default",
                    'dataset_name': "dataset100",
                    'labbook_owner': None,
                    'labbook_name': None,
                    'keys': ["test1.txt"]
                }

                gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

                # Manually link since this is disabled by default in the job (because in real
                # use, multiple jobs run in parallel and you only want to link once).
                m.link_revision()

                assert os.path.isfile(obj1_target) is True
                assert os.path.isfile(obj2_target) is False

                decompressor = snappy.StreamDecompressor()
                with open(obj1_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj1_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1

                # Download other file
                dl_kwargs = {
                    'logged_in_username': "******",
                    'access_token': "asdf",
                    'id_token': "1234",
                    'dataset_owner': "default",
                    'dataset_name': "dataset100",
                    'labbook_owner': None,
                    'labbook_name': None,
                    'keys': ["test2.txt"]
                }

                gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

                # Manually link since this is disabled by default in the job (because in real
                # use, multiple jobs run in parallel and you only want to link once).
                m.link_revision()

                assert os.path.isfile(obj1_target) is True
                assert os.path.isfile(obj2_target) is True

                with open(obj1_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj1_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1

                with open(obj2_source, 'rb') as dd:
                    source2 = decompressor.decompress(dd.read())
                    source2 += decompressor.flush()
                with open(obj2_target, 'rt') as dd:
                    dest2 = dd.read()
                assert source2.decode("utf-8") == dest2
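The test above relies on helpers that are not shown in this section. A minimal sketch of what they might look like, inferred from how the assertions use them (the exact signatures and the snappy stream framing are assumptions):

    import os
    import snappy

    def helper_append_file(cache_root, revision, filename, content):
        # Append text to a file inside the dataset's current revision directory
        with open(os.path.join(cache_root, revision, filename), 'at') as fh:
            fh.write(content)

    def helper_compress_file(source_path, destination_path):
        # Snappy stream-compress a file to a new location, then remove the original
        # (the test expects the object target to be gone after compression)
        with open(source_path, 'rb') as src, open(destination_path, 'wb') as dst:
            snappy.stream_compress(src, dst)
        os.remove(source_path)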