Example #1
def mock_dataset_with_manifest(mock_dataset_with_cache_dir):
    """A pytest fixture that creates a dataset in a temp working dir and provides a cache manager"""
    m = Manifest(mock_dataset_with_cache_dir[0], USERNAME)
    m.link_revision()

    # yield dataset, manifest, working_dir
    yield mock_dataset_with_cache_dir[0], m, mock_dataset_with_cache_dir[1]
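
For context, a minimal sketch of a test that might consume this fixture. It assumes `Manifest.status()` reports created files by their relative keys (as used in `complete_dataset_upload_transaction` later in these examples) and that `link_revision()` has already created the current revision directory; the file name and contents are placeholders.

import os

def test_manifest_detects_new_file(mock_dataset_with_manifest):
    # Unpack the fixture: dataset, manifest, working directory
    ds, manifest, working_dir = mock_dataset_with_manifest

    # Write a placeholder file into the linked revision directory of the file cache
    with open(os.path.join(manifest.cache_mgr.current_revision_dir, "example.txt"), 'wt') as f:
        f.write("hello dataset")

    # Detect the change, then sweep it into the manifest
    status = manifest.status()
    assert "example.txt" in status.created  # assumes relative keys
    manifest.update()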
Example #2
    def _put_dataset(self, path: str, username: str, owner: str) -> Dataset:
        # Validate that given path contains a dataset
        _ = self.load_dataset_from_directory(path)

        p = os.path.join(self.inventory_root, username, owner, 'datasets')
        dir_name = os.path.basename(path)
        if os.path.exists(p) and dir_name in os.listdir(p):
            raise InventoryException(
                f"Dataset directory {dir_name} already exists")

        if not os.path.exists(p):
            os.makedirs(p, exist_ok=True)

        if os.path.exists(os.path.join(p, dir_name)):
            raise InventoryException(
                f"Dataset directory {dir_name} already exists")

        final_path = shutil.move(path, p)
        assert os.path.dirname(
            final_path) != 'datasets', "shutil.move used incorrectly"

        ds = self.load_dataset_from_directory(final_path)

        # link dataset objects
        ds.namespace = owner
        m = Manifest(ds, logged_in_username=username)
        m.link_revision()

        return ds
Example #3
    def test_num_files(self, fixture_single_dataset):
        """Test getting the a Dataset's file count"""
        ds = fixture_single_dataset[3]
        query = """
                    {
                      dataset(owner: "default", name: "test-dataset") {
                        overview {
                          numFiles
                        }
                      }
                    }
                    """
        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert result['data']['dataset']['overview']['numFiles'] == 5

        m = Manifest(ds, 'default')
        current_revision_dir = m.cache_mgr.current_revision_dir
        shutil.rmtree(current_revision_dir)
        os.makedirs(current_revision_dir)
        m.update()

        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert result['data']['dataset']['overview']['numFiles'] == 0
Example #4
    def test_local_bytes(self, fixture_single_dataset):
        """Test getting the a Dataset's local_bytes"""
        ds = fixture_single_dataset[3]
        query = """
                    {
                      dataset(owner: "default", name: "test-dataset") {
                        overview {
                          localBytes
                        }
                      }
                    }
                    """
        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert result['data']['dataset']['overview']['localBytes'] == '35'

        # Delete all files
        m = Manifest(ds, 'default')
        current_revision_dir = m.cache_mgr.current_revision_dir
        shutil.rmtree(current_revision_dir)
        os.makedirs(current_revision_dir)

        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert result['data']['dataset']['overview']['localBytes'] == '0'

        # Update manifest after all files have been deleted, should still be 0
        m.update()
        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert result['data']['dataset']['overview']['localBytes'] == '0'
Example #5
    def update_linked_dataset(labbook: LabBook,
                              username: str,
                              init: bool = False) -> None:
        """

        Args:
            labbook:
            username:
            init:

        Returns:

        """
        # List all existing linked datasets IN this repository
        existing_dataset_abs_paths = glob.glob(
            os.path.join(labbook.root_dir, '.gigantum', 'datasets', "*/*"))

        if len(labbook.git.repo.submodules) > 0:
            for submodule in labbook.git.list_submodules():
                try:
                    namespace, dataset_name = submodule['name'].split("&")
                    rel_submodule_dir = os.path.join('.gigantum', 'datasets',
                                                     namespace, dataset_name)
                    submodule_dir = os.path.join(labbook.root_dir,
                                                 rel_submodule_dir)

                    # If submodule is currently present, init/update it, don't remove it!
                    if submodule_dir in existing_dataset_abs_paths:
                        existing_dataset_abs_paths.remove(submodule_dir)

                    if init:
                        # Optionally Init submodule
                        call_subprocess(
                            ['git', 'submodule', 'init', rel_submodule_dir],
                            cwd=labbook.root_dir,
                            check=True)
                    # Update submodule
                    call_subprocess(
                        ['git', 'submodule', 'update', rel_submodule_dir],
                        cwd=labbook.root_dir,
                        check=True)

                    ds = InventoryManager().load_dataset_from_directory(
                        submodule_dir)
                    ds.namespace = namespace
                    manifest = Manifest(ds, username)
                    manifest.link_revision()

                except Exception as err:
                    logger.error(
                        f"Failed to initialize linked Dataset (submodule reference): {submodule['name']}. "
                        f"This may be an actual error or simply due to repository permissions"
                    )
                    logger.exception(err)
                    continue

        # Clean out lingering dataset files if you previously had a dataset linked, but now don't
        for submodule_dir in existing_dataset_abs_paths:
            shutil.rmtree(submodule_dir)
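
A brief usage sketch, under the assumption that `update_linked_dataset` is callable as shown (in the real codebase it may live on a helper class); the project and user names are placeholders.

im = InventoryManager(config_file)                        # config_file as in the fixtures below
lb = im.load_labbook('default', 'default', 'my-project')  # placeholder names

# First run after cloning a Project: init and update all linked dataset submodules
update_linked_dataset(lb, 'default', init=True)

# Later runs only need to update existing submodule references
update_linked_dataset(lb, 'default')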
Example #6
def fixture_single_dataset():
    """A pytest fixture that creates a temporary working directory, a config file to match, creates the schema,
    and populates the environment component repository.
    Class scope modifier attached
    """
    # Create temp dir
    config_file, temp_dir = _create_temp_work_dir()

    # Create user identity
    insert_cached_identity(temp_dir)

    # Create test client
    schema = graphene.Schema(query=LabbookQuery, mutation=LabbookMutations)

    # Create a bunch of lab books
    im = InventoryManager(config_file)

    ds = im.create_dataset('default',
                           'default',
                           "test-dataset",
                           storage_type="gigantum_object_v1",
                           description="Cats 2")
    m = Manifest(ds, 'default')
    cm_class = get_cache_manager_class(ds.client_config)
    cache_mgr = cm_class(ds, 'default')
    revision = ds.git.repo.head.commit.hexsha

    os.makedirs(os.path.join(cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(cache_mgr.cache_root, revision, "test1.txt", "asdfasdf")
    helper_append_file(cache_mgr.cache_root, revision, "test2.txt", "rtg")
    helper_append_file(cache_mgr.cache_root, revision, "test3.txt", "wer")
    helper_append_file(cache_mgr.cache_root, revision, "other_dir/test4.txt",
                       "dfasdfhfgjhg")
    helper_append_file(cache_mgr.cache_root, revision, "other_dir/test5.txt",
                       "fdghdfgsa")
    m.update()

    with patch.object(Configuration, 'find_default_config',
                      lambda self: config_file):
        # Load User identity into app context
        app = Flask("lmsrvlabbook")
        app.config["LABMGR_CONFIG"] = Configuration()
        app.config["LABMGR_ID_MGR"] = get_identity_manager(Configuration())

        with app.app_context():
            # within this block, current_app points to app. Set current user explicitly (this is done in the middleware)
            flask.g.user_obj = app.config["LABMGR_ID_MGR"].get_user_profile()

            # Create a test client
            client = Client(schema,
                            middleware=[DataloaderMiddleware()],
                            context_value=ContextMock())

            yield config_file, temp_dir, client, ds, cache_mgr

    # Remove the temp_dir
    shutil.rmtree(temp_dir)
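
For reference, a hypothetical test sketch showing how the tuple yielded by this fixture is unpacked; the assertions mirror the five files (35 bytes total) created above.

def test_overview_smoke(fixture_single_dataset):
    # Fixture yields: config_file, temp_dir, client, dataset, cache_mgr
    config_file, temp_dir, client, ds, cache_mgr = fixture_single_dataset

    query = """
            {
              dataset(owner: "default", name: "test-dataset") {
                overview {
                  numFiles
                  localBytes
                }
              }
            }
            """
    result = client.execute(query)
    assert 'errors' not in result
    assert result['data']['dataset']['overview']['numFiles'] == 5
    assert result['data']['dataset']['overview']['localBytes'] == '35'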
Example #7
    def test_file_distribution_hidden(self, fixture_single_dataset):
        """"""
        ds = fixture_single_dataset[3]
        query = """
                    {
                      dataset(owner: "default", name: "test-dataset") {
                        overview {
                          fileTypeDistribution
                        }
                      }
                    }
                    """
        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert result['data']['dataset']['overview'][
            'fileTypeDistribution'] == ['1.00|.txt']

        # Add hidden files/directories plus additional file types
        m = Manifest(ds, 'default')
        os.makedirs(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         ".hiddendir"))
        os.makedirs(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         ".hiddendir", "subdir"))
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test55.csv", "22222")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "df.csv", "11")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           ".hidden", "343")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "noextension", "6t4")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           ".hiddendir/tester.png", "8544")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           ".hiddendir/subdir/blah.jpeg", "8544")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           ".hiddendir/subdir/.hiddenfile", "jhg")
        m.update()

        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert len(
            result['data']['dataset']['overview']['fileTypeDistribution']) == 4
        assert result['data']['dataset']['overview']['fileTypeDistribution'][
            0] == '0.56|.txt'
        assert result['data']['dataset']['overview']['fileTypeDistribution'][
            1] == '0.22|.csv'
        assert result['data']['dataset']['overview']['fileTypeDistribution'][
            2] == '0.11|.jpeg'
        assert result['data']['dataset']['overview']['fileTypeDistribution'][
            3] == '0.11|.png'
Example #8
def mock_dataset_with_manifest_bg_tests(mock_config_file_background_tests):
    """A pytest fixture that creates a dataset in a temp working dir and provides a cache manager, configured with
    additional overrides for dataset tests running in the background"""
    conf_file, working_dir = mock_config_file_background_tests
    with patch.object(Configuration, 'find_default_config',
                      lambda self: conf_file):
        im = InventoryManager(conf_file)
        ds = im.create_dataset(USERNAME,
                               USERNAME,
                               'dataset-1',
                               description="my dataset 1",
                               storage_type="gigantum_object_v1")

        m = Manifest(ds, USERNAME)
        m.link_revision()

        # yield dataset, manifest, working_dir
        yield ds, m, working_dir
Example #9
def hash_dataset_files(logged_in_username: str,
                       dataset_owner: str,
                       dataset_name: str,
                       file_list: List,
                       config_file: str = None) -> None:
    """

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the dataset whose files are being hashed
        dataset_name: Name of the dataset whose files are being hashed
        file_list: List of files to be hashed
        config_file: Optional config file to use

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting hash_dataset_files(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        ds = InventoryManager(config_file=config_file).load_dataset(
            logged_in_username, dataset_owner, dataset_name)
        manifest = Manifest(ds, logged_in_username)

        hash_result, fast_hash_result = manifest.hash_files(file_list)

        job = get_current_job()
        if job:
            job.meta['hash_result'] = ",".join(
                ['None' if v is None else v for v in hash_result])
            job.meta['fast_hash_result'] = ",".join(
                ['None' if v is None else v for v in fast_hash_result])
            job.save_meta()

    except Exception as err:
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise
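
A hedged sketch of decoding the results from the finished job's metadata, reversing the comma-joined encoding above; `job` and `file_list` stand for the RQ job handle and the file list passed to the task, both assumed to be available to the caller.

# 'None' strings mark files whose hash could not be computed
hash_result = [None if v == 'None' else v
               for v in job.meta['hash_result'].split(',')]
fast_hash_result = [None if v == 'None' else v
                    for v in job.meta['fast_hash_result'].split(',')]

# Results line up positionally with the submitted file_list
for path, h, fh in zip(file_list, hash_result, fast_hash_result):
    if h is None or fh is None:
        print(f"Hashing failed for {path}")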
Example #10
    def test_file_info_combined(self, fixture_single_dataset):
        """Test getting the a Dataset's file info"""
        ds = fixture_single_dataset[3]
        query = """
                    {
                      dataset(owner: "default", name: "test-dataset") {
                        overview {
                          fileTypeDistribution
                          localBytes
                          totalBytes
                        }
                      }
                    }
                    """
        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert result['data']['dataset']['overview'][
            'fileTypeDistribution'] == ['1.00|.txt']
        assert result['data']['dataset']['overview']['localBytes'] == '35'
        assert result['data']['dataset']['overview']['totalBytes'] == '35'

        # Delete all files
        m = Manifest(ds, 'default')
        current_revision_dir = m.cache_mgr.current_revision_dir
        shutil.rmtree(current_revision_dir)
        os.makedirs(current_revision_dir)

        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert result['data']['dataset']['overview'][
            'fileTypeDistribution'] == ['1.00|.txt']
        assert result['data']['dataset']['overview']['localBytes'] == '0'
        assert result['data']['dataset']['overview']['totalBytes'] == '35'

        m.update()
        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert result['data']['dataset']['overview'][
            'fileTypeDistribution'] == []
        assert result['data']['dataset']['overview']['localBytes'] == '0'
        assert result['data']['dataset']['overview']['totalBytes'] == '0'
Example #11
    def _load_file_info(self, dataloader):
        """Private method to retrieve file info for a given key"""
        if not self._file_info:
            # Load file info from the Dataset manifest
            if not self.key:
                raise ValueError(
                    "Must set `key` on object creation to resolve file info")

            # Load dataset instance
            username = get_logged_in_username()
            ds = dataloader.load(f"{username}&{self.owner}&{self.name}").get()

            manifest = Manifest(ds, username)

            # Retrieve file info
            self._file_info = manifest.get(self.key)

        # Set class properties
        self.is_dir = self._file_info['is_dir']
        self.modified_at = self._file_info['modified_at']
        self.size = f"{self._file_info['size']}"
        self.is_local = self._file_info['is_local']
Example #12
    def test_file_distribution(self, fixture_single_dataset):
        """Test getting the a Dataset's local_bytes"""
        ds = fixture_single_dataset[3]
        query = """
                    {
                      dataset(owner: "default", name: "test-dataset") {
                        overview {
                          fileTypeDistribution
                        }
                      }
                    }
                    """
        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert len(
            result['data']['dataset']['overview']['fileTypeDistribution']) == 1
        assert result['data']['dataset']['overview'][
            'fileTypeDistribution'] == ['1.00|.txt']

        # Add files with additional extensions, a hidden file, and a file without an extension
        m = Manifest(ds, 'default')
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test55.csv", "22222")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "df.csv", "33333")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           ".hidden", "33333")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "noextension", "33333")
        m.update()

        result = fixture_single_dataset[2].execute(query)
        assert 'errors' not in result
        assert len(
            result['data']['dataset']['overview']['fileTypeDistribution']) == 2
        assert result['data']['dataset']['overview']['fileTypeDistribution'][
            0] == '0.71|.txt'
        assert result['data']['dataset']['overview']['fileTypeDistribution'][
            1] == '0.29|.csv'
Example #13
    def delete_labbook(self, username: str, owner: str,
                       labbook_name: str) -> List[DatasetCleanupJob]:
        """Delete a Labbook from this Gigantum working directory.

        Args:
            username: Active username
            owner: Namespace of the Labbook
            labbook_name: Name of the Labbook

        Returns:
            List of DatasetCleanupJob objects describing the linked dataset caches to clean up

        """
        lb = self.load_labbook(username, owner, labbook_name)

        # Get list of datasets and cache roots to schedule for cleanup
        submodules = lb.git.list_submodules()
        datasets_to_schedule = list()
        for submodule in submodules:
            try:
                submodule_dataset_owner, submodule_dataset_name = submodule[
                    'name'].split("&")
                rel_submodule_dir = os.path.join('.gigantum', 'datasets',
                                                 submodule_dataset_owner,
                                                 submodule_dataset_name)
                submodule_dir = os.path.join(lb.root_dir, rel_submodule_dir)
                ds = self.load_dataset_from_directory(submodule_dir)
                ds.namespace = self.query_owner_of_linked_dataset(ds)
                m = Manifest(ds, username)
                datasets_to_schedule.append(
                    DatasetCleanupJob(namespace=submodule_dataset_owner,
                                      name=submodule_dataset_name,
                                      cache_root=m.cache_mgr.cache_root))
            except Exception as err:
                # Skip errors
                logger.warning(
                    f"Error occurred and ignored while processing submodules during Project delete: {err}"
                )
                continue

        # Remove labbook contents
        shutil.rmtree(lb.root_dir, ignore_errors=True)

        return datasets_to_schedule
Example #14
def mock_legacy_dataset(mock_dataset_with_cache_dir):
    """A pytest fixture that imports the legacy dataset"""
    archive_path = os.path.join(
        resource_filename('gtmcore.dataset.tests', 'data'),
        'test-legacy-dataset.zip')
    temp_path = os.path.join(tempfile.gettempdir(), 'test-legacy-dataset.zip')
    shutil.copyfile(archive_path, temp_path)
    conf_file = mock_dataset_with_cache_dir[0].client_config.config_file
    import_dataset_from_zip(archive_path=temp_path,
                            username=USERNAME,
                            owner=USERNAME,
                            config_file=conf_file)

    im = InventoryManager()
    ds = im.load_dataset(USERNAME, USERNAME, 'test-legacy-dataset')
    m = Manifest(ds, USERNAME)

    # yield dataset, manifest, working_dir
    yield ds, m, mock_dataset_with_cache_dir[1]
Example #15
    def delete_labbook(self, username: str, owner: str,
                       labbook_name: str) -> List[DatasetCleanupJob]:
        """Delete a Labbook from this Gigantum working directory.

        Args:
            username: Active username
            owner: Namespace of the Labbook
            labbook_name: Name of the Labbook

        Returns:
            List of DatasetCleanupJob objects describing the linked dataset caches to clean up

        """
        lb = self.load_labbook(username, owner, labbook_name)

        # Get list of datasets and cache roots to schedule for cleanup
        datasets = self.get_linked_datasets(lb)
        datasets_to_schedule = list()
        for ds in datasets:
            try:
                m = Manifest(ds, username)
                if not ds.namespace:
                    raise ValueError(
                        "Dataset namespace required to schedule for cleanup")
                datasets_to_schedule.append(
                    DatasetCleanupJob(namespace=ds.namespace,
                                      name=ds.name,
                                      cache_root=m.cache_mgr.cache_root))
            except Exception as err:
                # Skip errors
                logger.warning(
                    f"Error occurred and ignored while processing submodules during Project delete: {err}"
                )
                continue

        # Delete all secrets pertaining to this project.
        SecretStore(lb, username).clear_files()

        # Remove labbook contents
        shutil.rmtree(lb.root_dir, ignore_errors=True)

        return datasets_to_schedule
Example #16
    def delete_dataset(self, username: str, owner: str,
                       dataset_name: str) -> DatasetCleanupJob:
        """Delete a Dataset from this Gigantum working directory.

        Args:
            username: Active username
            owner: Namespace in which to place this Dataset
            dataset_name: Name of the Datasets

        Returns:
            DatasetCleanupJob describing the file cache to clean up for this Dataset

        """
        ds = self.load_dataset(username, owner, dataset_name)

        # Load the manifest to locate the file cache root for cleanup
        m = Manifest(ds, username)

        # Delete dataset repository from working dir
        shutil.rmtree(ds.root_dir, ignore_errors=True)

        return DatasetCleanupJob(namespace=owner,
                                 name=dataset_name,
                                 cache_root=m.cache_mgr.cache_root)
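
A hedged usage sketch, assuming `DatasetCleanupJob` exposes its fields as attributes; in practice the returned job is handed to a background cleanup task rather than processed inline.

im = InventoryManager(config_file)                # config_file as in the fixtures above
cleanup_job = im.delete_dataset('default', 'default', 'test-dataset')

# Illustrative inline cleanup of the file cache left behind by the deleted dataset
if os.path.isdir(cleanup_job.cache_root):
    shutil.rmtree(cleanup_job.cache_root, ignore_errors=True)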
Example #17
    def create_dataset(self,
                       username: str,
                       owner: str,
                       dataset_name: str,
                       storage_type: str,
                       description: Optional[str] = None,
                       author: Optional[GitAuthor] = None) -> Dataset:
        """Create a new Dataset in this Gigantum working directory.

        Args:
            username: Active username
            owner: Namespace in which to place this Dataset
            dataset_name: Name of the Dataset
            storage_type: String identifying the type of Dataset to instantiate
            description: Optional brief description of Dataset
            author: Optional Git Author

        Returns:
            Newly created Dataset instance

        """
        dataset = Dataset(config_file=self.config_file,
                          author=author,
                          namespace=owner)

        if storage_type not in storage.SUPPORTED_STORAGE_BACKENDS:
            raise ValueError(
                f"Unsupported Dataset storage type: {storage_type}")

        try:
            build_info = Configuration(self.config_file).config['build_info']
        except KeyError:
            logger.warning("Could not obtain build_info from config")
            build_info = None

        # Build data file contents
        dataset._data = {
            "schema": DATASET_CURRENT_SCHEMA,
            "id": uuid.uuid4().hex,
            "name": dataset_name,
            "storage_type": storage_type,
            "description": description or '',
            "created_on": datetime.datetime.utcnow().isoformat(),
            "build_info": build_info
        }
        dataset._validate_gigantum_data()

        logger.info("Creating new Dataset on disk for {}/{}/{}".format(
            username, owner, dataset_name))
        # lock while creating initial directory
        with dataset.lock(
                lock_key=f"new_dataset_lock|{username}|{owner}|{dataset_name}"
        ):
            # Verify or Create user subdirectory
            # Make sure you expand a user dir string
            starting_dir = os.path.expanduser(
                dataset.client_config.config["git"]["working_directory"])
            user_dir = os.path.join(starting_dir, username)
            if not os.path.isdir(user_dir):
                os.makedirs(user_dir)

            # Create owner dir - store Datasets in working dir > logged in user > owner > datasets
            owner_dir = os.path.join(user_dir, owner)
            if not os.path.isdir(owner_dir):
                os.makedirs(owner_dir)

            # Work within the `datasets` subdir of the owner dir
            owner_dir = os.path.join(owner_dir, "datasets")

            # Verify name not already in use
            if os.path.isdir(os.path.join(owner_dir, dataset_name)):
                raise ValueError(
                    f"Dataset `{dataset_name}` already exists locally. Choose a new Dataset name"
                )

            # Create Dataset subdirectory
            new_root_dir = os.path.join(owner_dir, dataset_name)
            os.makedirs(new_root_dir)
            dataset._set_root_dir(new_root_dir)

            # Init repository
            dataset.git.initialize()

            # Create Directory Structure
            dirs = [
                'manifest', 'metadata', '.gigantum',
                os.path.join('.gigantum', 'activity'),
                os.path.join('.gigantum', 'activity', 'log')
            ]

            # Create .gitignore default file
            shutil.copyfile(
                os.path.join(resource_filename('gtmcore', 'dataset'),
                             'gitignore.default'),
                os.path.join(dataset.root_dir, ".gitignore"))

            for d in dirs:
                p = os.path.join(dataset.root_dir, d, '.gitkeep')
                os.makedirs(os.path.dirname(p), exist_ok=True)
                with open(p, 'w') as gk:
                    gk.write(
                        "This file is necessary to keep this directory tracked by Git"
                        " and archivable by compression tools. Do not delete or modify!"
                    )

            dataset._save_gigantum_data()

            # Create an empty storage.json file
            dataset.backend_config = {}

            # Commit
            dataset.git.add_all()

            # NOTE: this string is used to indicate there are no more activity records to get. Changing the string will
            # break activity paging.
            # TODO: Improve method for detecting the first activity record
            dataset.git.commit(f"Creating new empty Dataset: {dataset_name}")

            # Create Activity Record
            adr = ActivityDetailRecord(ActivityDetailType.DATASET,
                                       show=False,
                                       importance=0)
            adr.add_value('text/plain',
                          f"Created new Dataset: {username}/{dataset_name}")
            ar = ActivityRecord(
                ActivityType.DATASET,
                message=f"Created new Dataset: {username}/{dataset_name}",
                show=True,
                importance=255,
                linked_commit=dataset.git.commit_hash)
            ar.add_detail_object(adr)
            store = ActivityStore(dataset)
            store.create_activity_record(ar)

            # Initialize file cache and link revision
            m = Manifest(dataset, username)
            m.link_revision()

            return dataset
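
A short usage sketch mirroring the `fixture_single_dataset` fixture above; the dataset name and description are placeholders.

im = InventoryManager(config_file)
ds = im.create_dataset('default',
                       'default',
                       'my-dataset',
                       storage_type="gigantum_object_v1",
                       description="Example dataset")

# create_dataset() already linked the initial revision, so the Manifest is ready to use
m = Manifest(ds, 'default')
print(m.cache_mgr.cache_root)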
Example #18
def complete_dataset_upload_transaction(logged_in_username: str,
                                        logged_in_email: str,
                                        dataset_owner: str,
                                        dataset_name: str,
                                        dispatcher,
                                        config_file: str = None) -> None:
    """Method to import a dataset from a zip file

    Args:
        logged_in_username: username for the currently logged in user
        logged_in_email: email for the currently logged in user
        dataset_owner: Owner of the dataset being uploaded to
        dataset_name: Name of the dataset being uploaded to
        dispatcher: Reference to the dispatcher CLASS
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()
    dispatcher_obj = dispatcher()

    def update_feedback(msg: str,
                        has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None):
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    def schedule_bg_hash_job():
        """Method to check if a bg job should get scheduled and do so"""
        num_cores = manifest.get_num_hashing_cpus()
        if sum([x.is_running for x in job_list]) < num_cores:
            for j in job_list:
                if j.is_failed is True and j.failure_count < 3:
                    # Re-schedule failed job
                    job_kwargs['file_list'] = j.file_list
                    job_key = dispatcher_obj.dispatch_task(
                        hash_dataset_files,
                        kwargs=job_kwargs,
                        metadata=job_metadata)
                    j.job_key = job_key
                    update_feedback(
                        f"Restarted failed file hashing job. Re-processing"
                        f" {format_size(j.total_bytes)}...")
                    logger.info(
                        f"(Job {p}) Restarted file hash job for"
                        f" {logged_in_username}/{dataset_owner}/{dataset_name}"
                    )
                    break

                if j.is_complete is False and j.is_running is False:
                    # Schedule new job
                    job_kwargs['file_list'] = j.file_list
                    job_key = dispatcher_obj.dispatch_task(
                        hash_dataset_files,
                        kwargs=job_kwargs,
                        metadata=job_metadata)
                    j.job_key = job_key
                    logger.info(
                        f"(Job {p}) Scheduled file hash job for"
                        f" {logged_in_username}/{dataset_owner}/{dataset_name}"
                    )
                    break

    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting complete_dataset_upload_transaction(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        author = GitAuthor(name=logged_in_username, email=logged_in_email)
        dispatcher_obj = Dispatcher()
        ds = InventoryManager(config_file=config_file).load_dataset(
            logged_in_username, dataset_owner, dataset_name, author=author)
        manifest = Manifest(ds, logged_in_username)

        with ds.lock():
            # Detect changes
            status = manifest.status()

            # Collect filenames that need to be hashed
            filenames = copy.deepcopy(status.modified)
            filenames.extend(status.created)

            # If there are new/updated files, spread work across cores while providing reasonable feedback
            if filenames:
                job_list = generate_bg_hash_job_list(filenames, manifest,
                                                     dispatcher_obj)
                total_bytes = sum([x.total_bytes for x in job_list])

                job_kwargs = {
                    'logged_in_username': logged_in_username,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'file_list': list(),
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset':
                    f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'hash_dataset_files'
                }

                update_feedback(
                    f"Please wait while file contents are analyzed. "
                    f"Processing {format_size(total_bytes)}...",
                    has_failures=False)
                logger.info(
                    f"(Job {p}) Starting file hash processing for"
                    f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(job_list)} jobs"
                )

                while True:
                    # Check if you need to schedule jobs and schedule up to 1 job per iteration
                    schedule_bg_hash_job()

                    # Refresh all job statuses and update status feedback
                    completed_job_status = [
                        x.refresh_status() for x in job_list
                    ]
                    completed_bytes = sum([
                        s.total_bytes
                        for s, c in zip(job_list, completed_job_status)
                        if c is True
                    ])
                    update_feedback(
                        f"Please wait while file contents are analyzed. "
                        f"{format_size(completed_bytes)} of {format_size(total_bytes)} complete...",
                        percent_complete=(float(completed_bytes) /
                                          float(total_bytes)) * 100)

                    # Check if you are done
                    completed_or_failed = sum([(x.is_complete
                                                or (x.failure_count >= 3))
                                               for x in job_list])
                    if completed_or_failed == len(job_list):
                        break

                    # Update once per second
                    time.sleep(1)

                # Manually complete update process for updated/created files
                failed_files = list()
                for job in job_list:
                    if job.is_complete:
                        for f, h, fh in zip(job.file_list,
                                            job.get_hash_result(),
                                            job.get_fast_hash_result()):
                            if not fh or not h:
                                failed_files.append(f)
                                continue

                            _, file_bytes, mtime = fh.split("||")
                            manifest._manifest_io.add_or_update(
                                f, h, mtime, file_bytes)
                    else:
                        failed_files.extend(job.file_list)

                # Message for hard failures
                if failed_files:
                    detail_msg = f"The following files failed to hash. Try re-uploading the files again:\n"
                    detail_file_list = " \n".join(failed_files)
                    detail_msg = f"{detail_msg}{detail_file_list}"
                    update_feedback(
                        f"An error occurred while processing some files. Check details and re-upload.",
                        has_failures=True,
                        failure_detail=detail_msg)

            if status.deleted:
                manifest.hasher.delete_fast_hashes(status.deleted)
                for relative_path in status.deleted:
                    manifest._manifest_io.remove(relative_path)

            manifest._manifest_io.persist()

            # Complete sweep operation
            manifest.sweep_all_changes(status=status, upload=True)

    except Exception as err:
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise
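
A hedged sketch of scheduling this job once file uploads finish, following the `dispatch_task` pattern used elsewhere in these examples; note the `dispatcher` kwarg receives the Dispatcher class itself, per the docstring. Identifiers are placeholders.

dispatcher_obj = Dispatcher()
job_key = dispatcher_obj.dispatch_task(
    complete_dataset_upload_transaction,
    kwargs={
        'logged_in_username': 'default',
        'logged_in_email': 'default@example.com',
        'dataset_owner': 'default',
        'dataset_name': 'test-dataset',
        'dispatcher': Dispatcher,        # the class, not an instance
        'config_file': None,
    },
    metadata={
        'dataset': 'default|default|test-dataset',
        'method': 'complete_dataset_upload_transaction'
    })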
Example #19
def download_dataset_files(logged_in_username: str,
                           access_token: str,
                           id_token: str,
                           dataset_owner: str,
                           dataset_name: str,
                           labbook_owner: Optional[str] = None,
                           labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False,
                           keys: Optional[List[str]] = None,
                           config_file: str = None) -> None:
    """Method to download files from a dataset in the background and provide status to the UI.

    This job schedules `pull_objects` jobs after splitting up the download work into batches. At the end, the job
    removes any partially downloaded files (due to failures) and links all the files for the dataset.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    dispatcher_obj = Dispatcher()

    def update_feedback(msg: str,
                        has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
            f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(
            keys, pull_all=all_keys)

        failure_keys = list()
        if key_batches:
            # Schedule jobs for batches
            bg_jobs = list()
            for keys in key_batches:
                job_kwargs = {
                    'keys': keys,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'labbook_owner': labbook_owner,
                    'labbook_name': labbook_name,
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset':
                    f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'pull_objects'
                }

                job_key = dispatcher_obj.dispatch_task(
                    method_reference=pull_objects,
                    kwargs=job_kwargs,
                    metadata=job_metadata,
                    persist=True)
                bg_jobs.append(
                    BackgroundDownloadJob(dispatcher_obj, keys, job_key))

            update_feedback(
                f"Please wait - Downloading {num_files} files ({format_size(total_bytes)}) - 0% complete",
                percent_complete=0,
                has_failures=False)
            logger.info(
                f"(Job {p}) Starting file downloads for"
                f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(key_batches)} jobs"
            )

            while sum([(x.is_complete or x.is_failed)
                       for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum(
                    [j.completed_bytes for j in bg_jobs])
                pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                update_feedback(
                    f"Please wait - Downloading {num_files} files ({format_size(total_completed_bytes)} of "
                    f"{format_size(total_bytes)}) - {round(pc)}% complete",
                    percent_complete=pc)
                time.sleep(1)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # Whole job failed...assume entire batch should get re-downloaded for now
                    failure_keys.extend(j.keys)
                else:
                    failure_keys.extend(j.get_failed_keys())

        # Set final status for UI
        if len(failure_keys) == 0:
            update_feedback(f"Download complete!",
                            percent_complete=100,
                            has_failures=False)
        else:
            failure_str = ""
            for f in failure_keys:
                # If any failed files partially downloaded, remove them.
                abs_dataset_path = os.path.join(m.current_revision_dir, f)
                abs_object_path = m.dataset_to_object_path(f)
                if os.path.exists(abs_dataset_path):
                    os.remove(abs_dataset_path)
                if os.path.exists(abs_object_path):
                    os.remove(abs_object_path)
                failure_str = f"{failure_str}{f}\n"

            failure_detail_str = f"Files that failed to download:\n{failure_str}"
            update_feedback("",
                            has_failures=True,
                            failure_detail=failure_detail_str)

        # Link dataset files, so anything that was successfully pulled will materialize
        m.link_revision()

        if len(failure_keys) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            raise IOError(
                f"{len(failure_keys)} file(s) failed to download. Check message detail and try again."
            )

    except Exception as err:
        logger.exception(err)
        raise
Example #20
def pull_objects(keys: List[str],
                 logged_in_username: str,
                 access_token: str,
                 id_token: str,
                 dataset_owner: str,
                 dataset_name: str,
                 labbook_owner: Optional[str] = None,
                 labbook_name: Optional[str] = None,
                 config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend.

    This runs the IOManager.pull_objects() method with `link_revision=False`. This is because this job can be run in
    parallel multiple times with different sets of keys. You don't want to link until the very end, which is handled
    in the `download_dataset_files` job, which is what scheduled this job.

    Args:
        keys: List of file keys to download
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(
            current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting pull_objects(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.pull_objects(keys=keys,
                                  progress_update_fn=progress_update_callback,
                                  link_revision=False)

        job = get_current_job()
        if job:
            job.meta['failure_keys'] = ",".join(
                [x.dataset_path for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
Example #21
def push_dataset_objects(objs: List[PushObject],
                         logged_in_username: str,
                         access_token: str,
                         id_token: str,
                         dataset_owner: str,
                         dataset_name: str,
                         config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend

    Args:
        objs: List of PushObject instances to push
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to push
        dataset_name: Name of the dataset containing the files to push
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(
            current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting push_dataset_objects(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.push_objects(objs,
                                  progress_update_fn=progress_update_callback)

        job = get_current_job()
        if job:
            job.meta['failures'] = ",".join([
                f"{x.object_path}|{x.dataset_path}|{x.revision}"
                for x in result.failure
            ])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise