def mock_dataset_with_manifest(mock_dataset_with_cache_dir):
    """A pytest fixture that creates a dataset in a temp working dir and provides a cache manager"""
    m = Manifest(mock_dataset_with_cache_dir[0], USERNAME)
    m.link_revision()

    # yield dataset, manifest, working_dir
    yield mock_dataset_with_cache_dir[0], m, mock_dataset_with_cache_dir[1]
def _put_dataset(self, path: str, username: str, owner: str) -> Dataset:
    # Validate that the given path contains a dataset
    _ = self.load_dataset_from_directory(path)

    p = os.path.join(self.inventory_root, username, owner, 'datasets')
    dir_name = os.path.basename(path)
    if os.path.exists(os.path.join(p, dir_name)):
        raise InventoryException(f"Dataset directory {dir_name} already exists")

    os.makedirs(p, exist_ok=True)

    final_path = shutil.move(path, p)
    assert os.path.dirname(final_path) != 'datasets', "shutil.move used incorrectly"

    ds = self.load_dataset_from_directory(final_path)
    ds.namespace = owner

    # Link dataset objects into the file cache
    m = Manifest(ds, logged_in_username=username)
    m.link_revision()

    return ds
def test_num_files(self, fixture_single_dataset):
    """Test getting a Dataset's file count"""
    ds = fixture_single_dataset[3]
    query = """
                {
                  dataset(owner: "default", name: "test-dataset") {
                    overview {
                      numFiles
                    }
                  }
                }
            """
    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    assert result['data']['dataset']['overview']['numFiles'] == 5

    # Delete all files and update the manifest; the count should drop to 0
    m = Manifest(ds, 'default')
    current_revision_dir = m.cache_mgr.current_revision_dir
    shutil.rmtree(current_revision_dir)
    os.makedirs(current_revision_dir)
    m.update()

    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    assert result['data']['dataset']['overview']['numFiles'] == 0
def test_local_bytes(self, fixture_single_dataset):
    """Test getting a Dataset's localBytes"""
    ds = fixture_single_dataset[3]
    query = """
                {
                  dataset(owner: "default", name: "test-dataset") {
                    overview {
                      localBytes
                    }
                  }
                }
            """
    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    assert result['data']['dataset']['overview']['localBytes'] == '35'

    # Delete all files
    m = Manifest(ds, 'default')
    current_revision_dir = m.cache_mgr.current_revision_dir
    shutil.rmtree(current_revision_dir)
    os.makedirs(current_revision_dir)

    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    assert result['data']['dataset']['overview']['localBytes'] == '0'

    # Update manifest after all files have been deleted; should still be 0
    m.update()
    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    assert result['data']['dataset']['overview']['localBytes'] == '0'
def update_linked_dataset(labbook: LabBook, username: str, init: bool = False) -> None:
    """Update all linked datasets in a Project by updating their submodule references

    Args:
        labbook: the LabBook (Project) whose linked datasets should be updated
        username: username for the currently logged in user
        init: if True, run `git submodule init` before updating each submodule

    Returns:
        None
    """
    # List all existing linked datasets in this repository
    existing_dataset_abs_paths = glob.glob(os.path.join(labbook.root_dir, '.gigantum', 'datasets', "*/*"))

    if len(labbook.git.repo.submodules) > 0:
        for submodule in labbook.git.list_submodules():
            try:
                namespace, dataset_name = submodule['name'].split("&")
                rel_submodule_dir = os.path.join('.gigantum', 'datasets', namespace, dataset_name)
                submodule_dir = os.path.join(labbook.root_dir, rel_submodule_dir)

                # If the submodule is currently present, init/update it, don't remove it!
                if submodule_dir in existing_dataset_abs_paths:
                    existing_dataset_abs_paths.remove(submodule_dir)

                if init:
                    # Optionally init the submodule
                    call_subprocess(['git', 'submodule', 'init', rel_submodule_dir],
                                    cwd=labbook.root_dir, check=True)
                # Update the submodule
                call_subprocess(['git', 'submodule', 'update', rel_submodule_dir],
                                cwd=labbook.root_dir, check=True)

                ds = InventoryManager().load_dataset_from_directory(submodule_dir)
                ds.namespace = namespace
                manifest = Manifest(ds, username)
                manifest.link_revision()
            except Exception as err:
                logger.error(f"Failed to initialize linked Dataset (submodule reference): {submodule['name']}. "
                             f"This may be an actual error or simply due to repository permissions")
                logger.exception(err)
                continue

    # Clean out lingering dataset files if a dataset was previously linked, but now isn't
    for submodule_dir in existing_dataset_abs_paths:
        shutil.rmtree(submodule_dir)
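# Usage sketch (illustrative, not from the codebase): after importing or checking out a
# Project, re-initialize its linked datasets so their file caches are linked. The username
# and Project name below are hypothetical.
lb = InventoryManager().load_labbook('alice', 'alice', 'my-project')
update_linked_dataset(lb, 'alice', init=True)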
def fixture_single_dataset():
    """A pytest fixture that creates a temporary working directory, a config file to match,
    creates the schema, and populates the environment component repository.
    Class scope modifier attached
    """
    # Create temp dir
    config_file, temp_dir = _create_temp_work_dir()

    # Create user identity
    insert_cached_identity(temp_dir)

    # Create test client
    schema = graphene.Schema(query=LabbookQuery, mutation=LabbookMutations)

    # Create a dataset populated with a few test files
    im = InventoryManager(config_file)
    ds = im.create_dataset('default', 'default', "test-dataset",
                           storage_type="gigantum_object_v1", description="Cats 2")
    m = Manifest(ds, 'default')
    cm_class = get_cache_manager_class(ds.client_config)
    cache_mgr = cm_class(ds, 'default')
    revision = ds.git.repo.head.commit.hexsha
    os.makedirs(os.path.join(cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(cache_mgr.cache_root, revision, "test1.txt", "asdfasdf")
    helper_append_file(cache_mgr.cache_root, revision, "test2.txt", "rtg")
    helper_append_file(cache_mgr.cache_root, revision, "test3.txt", "wer")
    helper_append_file(cache_mgr.cache_root, revision, "other_dir/test4.txt", "dfasdfhfgjhg")
    helper_append_file(cache_mgr.cache_root, revision, "other_dir/test5.txt", "fdghdfgsa")
    m.update()

    with patch.object(Configuration, 'find_default_config', lambda self: config_file):
        # Load user identity into app context
        app = Flask("lmsrvlabbook")
        app.config["LABMGR_CONFIG"] = Configuration()
        app.config["LABMGR_ID_MGR"] = get_identity_manager(Configuration())

        with app.app_context():
            # Within this block, current_app points to app. Set current user explicitly
            # (this is done in the middleware)
            flask.g.user_obj = app.config["LABMGR_ID_MGR"].get_user_profile()

            # Create a test client
            client = Client(schema, middleware=[DataloaderMiddleware()],
                            context_value=ContextMock())

            yield config_file, temp_dir, client, ds, cache_mgr

    # Remove the temp_dir
    shutil.rmtree(temp_dir)
def test_file_distribution_hidden(self, fixture_single_dataset):
    """Test that hidden files and extension-less files are excluded from the file type distribution"""
    ds = fixture_single_dataset[3]
    query = """
                {
                  dataset(owner: "default", name: "test-dataset") {
                    overview {
                      fileTypeDistribution
                    }
                  }
                }
            """
    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    assert result['data']['dataset']['overview']['fileTypeDistribution'] == ['1.00|.txt']

    # Add more files, including hidden files and directories
    m = Manifest(ds, 'default')
    os.makedirs(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, ".hiddendir"))
    os.makedirs(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, ".hiddendir", "subdir"))
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test55.csv", "22222")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "df.csv", "11")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, ".hidden", "343")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "noextension", "6t4")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, ".hiddendir/tester.png", "8544")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, ".hiddendir/subdir/blah.jpeg", "8544")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, ".hiddendir/subdir/.hiddenfile", "jhg")
    m.update()

    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    distribution = result['data']['dataset']['overview']['fileTypeDistribution']
    assert len(distribution) == 4
    assert distribution[0] == '0.56|.txt'
    assert distribution[1] == '0.22|.csv'
    assert distribution[2] == '0.11|.jpeg'
    assert distribution[3] == '0.11|.png'
def mock_dataset_with_manifest_bg_tests(mock_config_file_background_tests):
    """A pytest fixture that creates a dataset in a temp working dir and provides a cache manager,
    configured with additional overrides for dataset tests running in the background"""
    conf_file, working_dir = mock_config_file_background_tests
    with patch.object(Configuration, 'find_default_config', lambda self: conf_file):
        im = InventoryManager(conf_file)
        ds = im.create_dataset(USERNAME, USERNAME, 'dataset-1',
                               description="my dataset 1",
                               storage_type="gigantum_object_v1")
        m = Manifest(ds, USERNAME)
        m.link_revision()

        # yield dataset, manifest, working_dir
        yield ds, m, working_dir
def hash_dataset_files(logged_in_username: str, dataset_owner: str, dataset_name: str,
                       file_list: List, config_file: str = None) -> None:
    """Method to hash a list of dataset files in the background

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the dataset
        dataset_name: Name of the dataset
        file_list: List of files to be hashed
        config_file: Optional config file to use

    Returns:
        None
    """
    logger = LMLogger.get_logger()
    p = os.getpid()
    try:
        logger.info(f"(Job {p}) Starting hash_dataset_files(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        ds = InventoryManager(config_file=config_file).load_dataset(logged_in_username, dataset_owner,
                                                                    dataset_name)
        manifest = Manifest(ds, logged_in_username)
        hash_result, fast_hash_result = manifest.hash_files(file_list)

        job = get_current_job()
        if job:
            job.meta['hash_result'] = ",".join(['None' if v is None else v for v in hash_result])
            job.meta['fast_hash_result'] = ",".join(['None' if v is None else v for v in fast_hash_result])
            job.save_meta()

    except Exception as err:
        logger.error(f"(Job {p}) Error in hash_dataset_files job")
        logger.exception(err)
        raise
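# Dispatch sketch (illustrative): this function is designed to run as a background job via
# the Dispatcher, matching the scheduling pattern used in complete_dataset_upload_transaction
# below. The username, dataset name, and file list here are hypothetical.
job_key = Dispatcher().dispatch_task(
    hash_dataset_files,
    kwargs={'logged_in_username': 'alice',
            'dataset_owner': 'alice',
            'dataset_name': 'my-dataset',
            'file_list': ['data/raw1.csv', 'data/raw2.csv']},
    metadata={'dataset': 'alice|alice|my-dataset', 'method': 'hash_dataset_files'})
# On completion, hashes are read back from job metadata as comma-joined strings
# ('hash_result' and 'fast_hash_result'), with 'None' marking files that failed to hash.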
def test_file_info_combined(self, fixture_single_dataset):
    """Test getting a Dataset's combined file info"""
    ds = fixture_single_dataset[3]
    query = """
                {
                  dataset(owner: "default", name: "test-dataset") {
                    overview {
                      fileTypeDistribution
                      localBytes
                      totalBytes
                    }
                  }
                }
            """
    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    assert result['data']['dataset']['overview']['fileTypeDistribution'] == ['1.00|.txt']
    assert result['data']['dataset']['overview']['localBytes'] == '35'
    assert result['data']['dataset']['overview']['totalBytes'] == '35'

    # Delete all files
    m = Manifest(ds, 'default')
    current_revision_dir = m.cache_mgr.current_revision_dir
    shutil.rmtree(current_revision_dir)
    os.makedirs(current_revision_dir)

    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    assert result['data']['dataset']['overview']['fileTypeDistribution'] == ['1.00|.txt']
    assert result['data']['dataset']['overview']['localBytes'] == '0'
    assert result['data']['dataset']['overview']['totalBytes'] == '35'

    # Update the manifest; with all files removed, everything should drop to 0/empty
    m.update()
    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    assert result['data']['dataset']['overview']['fileTypeDistribution'] == []
    assert result['data']['dataset']['overview']['localBytes'] == '0'
    assert result['data']['dataset']['overview']['totalBytes'] == '0'
def _load_file_info(self, dataloader):
    """Private method to retrieve file info for a given key"""
    if not self._file_info:
        # File info must be loaded from the dataset's manifest
        if not self.key:
            raise ValueError("Must set `key` on object creation to resolve file info")

        # Load dataset instance
        username = get_logged_in_username()
        ds = dataloader.load(f"{username}&{self.owner}&{self.name}").get()
        manifest = Manifest(ds, username)

        # Retrieve file info
        self._file_info = manifest.get(self.key)

    # Set class properties
    self.is_dir = self._file_info['is_dir']
    self.modified_at = self._file_info['modified_at']
    self.size = f"{self._file_info['size']}"
    self.is_local = self._file_info['is_local']
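# For reference (a sketch inferred from the attribute accesses above, not an authoritative
# schema): Manifest.get(key) is expected to return a record shaped roughly like the dict
# below. The values and the 'key' field are hypothetical.
file_info = {
    'key': 'other_dir/test4.txt',  # relative path within the dataset (assumed field)
    'is_dir': False,
    'modified_at': 1546300800.0,
    'size': 12,                    # bytes; serialized as a string on the GraphQL type
    'is_local': True,              # True if file contents are present in the local cache
}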
def test_file_distribution(self, fixture_single_dataset):
    """Test getting a Dataset's file type distribution"""
    ds = fixture_single_dataset[3]
    query = """
                {
                  dataset(owner: "default", name: "test-dataset") {
                    overview {
                      fileTypeDistribution
                    }
                  }
                }
            """
    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    assert len(result['data']['dataset']['overview']['fileTypeDistribution']) == 1
    assert result['data']['dataset']['overview']['fileTypeDistribution'] == ['1.00|.txt']

    # Add more files, including a hidden file and a file without an extension
    m = Manifest(ds, 'default')
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test55.csv", "22222")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "df.csv", "33333")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, ".hidden", "33333")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "noextension", "33333")
    m.update()

    result = fixture_single_dataset[2].execute(query)
    assert 'errors' not in result
    distribution = result['data']['dataset']['overview']['fileTypeDistribution']
    assert len(distribution) == 2
    assert distribution[0] == '0.71|.txt'
    assert distribution[1] == '0.29|.csv'
def delete_labbook(self, username: str, owner: str, labbook_name: str) -> List[DatasetCleanupJob]:
    """Delete a LabBook from this Gigantum working directory.

    Args:
        username: Active username
        owner: Namespace of the LabBook
        labbook_name: Name of the LabBook

    Returns:
        List of DatasetCleanupJob records for any linked datasets whose file caches should be cleaned up
    """
    lb = self.load_labbook(username, owner, labbook_name)

    # Get list of datasets and cache roots to schedule for cleanup
    submodules = lb.git.list_submodules()
    datasets_to_schedule = list()
    for submodule in submodules:
        try:
            submodule_dataset_owner, submodule_dataset_name = submodule['name'].split("&")
            rel_submodule_dir = os.path.join('.gigantum', 'datasets',
                                             submodule_dataset_owner, submodule_dataset_name)
            submodule_dir = os.path.join(lb.root_dir, rel_submodule_dir)
            ds = self.load_dataset_from_directory(submodule_dir)
            ds.namespace = self.query_owner_of_linked_dataset(ds)
            m = Manifest(ds, username)
            datasets_to_schedule.append(DatasetCleanupJob(namespace=submodule_dataset_owner,
                                                          name=submodule_dataset_name,
                                                          cache_root=m.cache_mgr.cache_root))
        except Exception as err:
            # Skip errors
            logger.warning(f"Error occurred and ignored while processing submodules during Project delete: {err}")
            continue

    # Remove labbook contents
    shutil.rmtree(lb.root_dir, ignore_errors=True)

    return datasets_to_schedule
def mock_legacy_dataset(mock_dataset_with_cache_dir):
    """A pytest fixture that imports the legacy dataset"""
    archive_path = os.path.join(resource_filename('gtmcore.dataset.tests', 'data'),
                                'test-legacy-dataset.zip')
    temp_path = os.path.join(tempfile.gettempdir(), 'test-legacy-dataset.zip')
    shutil.copyfile(archive_path, temp_path)
    conf_file = mock_dataset_with_cache_dir[0].client_config.config_file
    import_dataset_from_zip(archive_path=temp_path, username=USERNAME,
                            owner=USERNAME, config_file=conf_file)

    im = InventoryManager()
    ds = im.load_dataset(USERNAME, USERNAME, 'test-legacy-dataset')
    m = Manifest(ds, USERNAME)

    # yield dataset, manifest, working_dir
    yield ds, m, mock_dataset_with_cache_dir[1]
def delete_labbook(self, username: str, owner: str, labbook_name: str) -> List[DatasetCleanupJob]:
    """Delete a LabBook from this Gigantum working directory.

    Args:
        username: Active username
        owner: Namespace of the LabBook
        labbook_name: Name of the LabBook

    Returns:
        List of DatasetCleanupJob records for any linked datasets whose file caches should be cleaned up
    """
    lb = self.load_labbook(username, owner, labbook_name)

    # Get list of datasets and cache roots to schedule for cleanup
    datasets = self.get_linked_datasets(lb)
    datasets_to_schedule = list()
    for ds in datasets:
        try:
            m = Manifest(ds, username)
            if not ds.namespace:
                raise ValueError("Dataset namespace required to schedule for cleanup")

            datasets_to_schedule.append(DatasetCleanupJob(namespace=ds.namespace,
                                                          name=ds.name,
                                                          cache_root=m.cache_mgr.cache_root))
        except Exception as err:
            # Skip errors
            logger.warning(f"Error occurred and ignored while processing submodules during Project delete: {err}")
            continue

    # Delete all secrets pertaining to this project
    SecretStore(lb, username).clear_files()

    # Remove labbook contents
    shutil.rmtree(lb.root_dir, ignore_errors=True)

    return datasets_to_schedule
def delete_dataset(self, username: str, owner: str, dataset_name: str) -> DatasetCleanupJob:
    """Delete a Dataset from this Gigantum working directory.

    Args:
        username: Active username
        owner: Namespace in which this Dataset is placed
        dataset_name: Name of the Dataset

    Returns:
        DatasetCleanupJob record so the dataset's file cache can be cleaned up
    """
    ds = self.load_dataset(username, owner, dataset_name)

    # Load the manifest to resolve the file cache root before the repository is removed
    m = Manifest(ds, username)

    # Delete dataset repository from working dir
    shutil.rmtree(ds.root_dir, ignore_errors=True)

    return DatasetCleanupJob(namespace=owner,
                             name=dataset_name,
                             cache_root=m.cache_mgr.cache_root)
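# Usage sketch (illustrative): the returned DatasetCleanupJob carries what a background
# cleanup job needs. A `clean_dataset_file_cache` job is referenced by name in log messages
# in this code, but its signature and the kwargs below are assumptions for illustration.
cleanup = InventoryManager().delete_dataset('alice', 'alice', 'my-dataset')
Dispatcher().dispatch_task(
    clean_dataset_file_cache,  # assumed background job for removing the cache root
    kwargs={'logged_in_username': 'alice',
            'dataset_owner': cleanup.namespace,
            'dataset_name': cleanup.name,
            'cache_location': cleanup.cache_root})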
def create_dataset(self, username: str, owner: str, dataset_name: str, storage_type: str,
                   description: Optional[str] = None, author: Optional[GitAuthor] = None) -> Dataset:
    """Create a new Dataset in this Gigantum working directory.

    Args:
        username: Active username
        owner: Namespace in which to place this Dataset
        dataset_name: Name of the Dataset
        storage_type: String identifying the type of Dataset to instantiate
        description: Optional brief description of Dataset
        author: Optional Git Author

    Returns:
        Newly created Dataset instance
    """
    dataset = Dataset(config_file=self.config_file, author=author, namespace=owner)
    if storage_type not in storage.SUPPORTED_STORAGE_BACKENDS:
        raise ValueError(f"Unsupported Dataset storage type: {storage_type}")

    try:
        build_info = Configuration(self.config_file).config['build_info']
    except KeyError:
        logger.warning("Could not obtain build_info from config")
        build_info = None

    # Build data file contents
    dataset._data = {
        "schema": DATASET_CURRENT_SCHEMA,
        "id": uuid.uuid4().hex,
        "name": dataset_name,
        "storage_type": storage_type,
        "description": description or '',
        "created_on": datetime.datetime.utcnow().isoformat(),
        "build_info": build_info
    }
    dataset._validate_gigantum_data()

    logger.info("Creating new Dataset on disk for {}/{}/{}".format(username, owner, dataset_name))

    # Lock while creating initial directory
    with dataset.lock(lock_key=f"new_dataset_lock|{username}|{owner}|{dataset_name}"):
        # Verify or create user subdirectory; make sure to expand a user dir string
        starting_dir = os.path.expanduser(dataset.client_config.config["git"]["working_directory"])
        user_dir = os.path.join(starting_dir, username)
        if not os.path.isdir(user_dir):
            os.makedirs(user_dir)

        # Create owner dir - store LabBooks in working dir > logged in user > owner
        owner_dir = os.path.join(user_dir, owner)
        if not os.path.isdir(owner_dir):
            os.makedirs(owner_dir)

        # Create `datasets` subdir in the owner dir
        owner_dir = os.path.join(owner_dir, "datasets")

        # Verify name not already in use
        if os.path.isdir(os.path.join(owner_dir, dataset_name)):
            raise ValueError(f"Dataset `{dataset_name}` already exists locally. Choose a new Dataset name")

        # Create Dataset subdirectory
        new_root_dir = os.path.join(owner_dir, dataset_name)
        os.makedirs(new_root_dir)
        dataset._set_root_dir(new_root_dir)

        # Init repository
        dataset.git.initialize()

        # Create directory structure
        dirs = [
            'manifest', 'metadata', '.gigantum',
            os.path.join('.gigantum', 'activity'),
            os.path.join('.gigantum', 'activity', 'log')
        ]

        # Create .gitignore default file
        shutil.copyfile(os.path.join(resource_filename('gtmcore', 'dataset'), 'gitignore.default'),
                        os.path.join(dataset.root_dir, ".gitignore"))

        for d in dirs:
            p = os.path.join(dataset.root_dir, d, '.gitkeep')
            os.makedirs(os.path.dirname(p), exist_ok=True)
            with open(p, 'w') as gk:
                gk.write("This file is necessary to keep this directory tracked by Git"
                         " and archivable by compression tools. Do not delete or modify!")

        dataset._save_gigantum_data()

        # Create an empty storage.json file
        dataset.backend_config = {}

        # Commit
        dataset.git.add_all()

        # NOTE: this string is used to indicate there are no more activity records to get.
        # Changing the string will break activity paging.
        # TODO: Improve method for detecting the first activity record
        dataset.git.commit(f"Creating new empty Dataset: {dataset_name}")

        # Create Activity Record
        adr = ActivityDetailRecord(ActivityDetailType.DATASET, show=False, importance=0)
        adr.add_value('text/plain', f"Created new Dataset: {username}/{dataset_name}")
        ar = ActivityRecord(ActivityType.DATASET,
                            message=f"Created new Dataset: {username}/{dataset_name}",
                            show=True,
                            importance=255,
                            linked_commit=dataset.git.commit_hash)
        ar.add_detail_object(adr)
        store = ActivityStore(dataset)
        store.create_activity_record(ar)

        # Initialize file cache and link revision
        m = Manifest(dataset, username)
        m.link_revision()

        return dataset
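# Usage sketch (mirrors the test fixture above): create a new dataset in the working
# directory; `config_file` stands in for whatever client configuration is in play.
im = InventoryManager(config_file)
ds = im.create_dataset('default', 'default', 'test-dataset',
                       storage_type='gigantum_object_v1',
                       description='An example dataset')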
def complete_dataset_upload_transaction(logged_in_username: str, logged_in_email: str,
                                        dataset_owner: str, dataset_name: str,
                                        dispatcher, config_file: str = None) -> None:
    """Method to complete a dataset upload transaction by hashing new/updated files and updating the manifest

    Args:
        logged_in_username: username for the currently logged in user
        logged_in_email: email for the currently logged in user
        dataset_owner: Owner of the dataset
        dataset_name: Name of the dataset
        dispatcher: Reference to the dispatcher CLASS
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()
    dispatcher_obj = dispatcher()

    def update_feedback(msg: str, has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None):
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    def schedule_bg_hash_job():
        """Method to check if a bg job should get scheduled and do so"""
        num_cores = manifest.get_num_hashing_cpus()
        if sum([x.is_running for x in job_list]) < num_cores:
            for j in job_list:
                if j.is_failed is True and j.failure_count < 3:
                    # Re-schedule failed job
                    job_kwargs['file_list'] = j.file_list
                    job_key = dispatcher_obj.dispatch_task(hash_dataset_files,
                                                           kwargs=job_kwargs,
                                                           metadata=job_metadata)
                    j.job_key = job_key
                    update_feedback(f"Restarted failed file hashing job. Re-processing"
                                    f" {format_size(j.total_bytes)}...")
                    logger.info(f"(Job {p}) Restarted file hash job for"
                                f" {logged_in_username}/{dataset_owner}/{dataset_name}")
                    break

                if j.is_complete is False and j.is_running is False:
                    # Schedule new job
                    job_kwargs['file_list'] = j.file_list
                    job_key = dispatcher_obj.dispatch_task(hash_dataset_files,
                                                           kwargs=job_kwargs,
                                                           metadata=job_metadata)
                    j.job_key = job_key
                    logger.info(f"(Job {p}) Scheduled file hash job for"
                                f" {logged_in_username}/{dataset_owner}/{dataset_name}")
                    break

    p = os.getpid()
    try:
        logger.info(f"(Job {p}) Starting complete_dataset_upload_transaction(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        author = GitAuthor(name=logged_in_username, email=logged_in_email)
        ds = InventoryManager(config_file=config_file).load_dataset(logged_in_username, dataset_owner,
                                                                    dataset_name, author=author)
        manifest = Manifest(ds, logged_in_username)

        with ds.lock():
            # Detect changes
            status = manifest.status()

            # Collect filenames that need to be hashed
            filenames = copy.deepcopy(status.modified)
            filenames.extend(status.created)

            # If there are new/updated files, spread work across cores while providing reasonable feedback
            if filenames:
                job_list = generate_bg_hash_job_list(filenames, manifest, dispatcher_obj)
                total_bytes = sum([x.total_bytes for x in job_list])

                job_kwargs = {
                    'logged_in_username': logged_in_username,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'file_list': list(),
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'hash_dataset_files'
                }

                update_feedback(f"Please wait while file contents are analyzed. "
                                f"Processing {format_size(total_bytes)}...",
                                has_failures=False)
                logger.info(f"(Job {p}) Starting file hash processing for"
                            f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(job_list)} jobs")

                while True:
                    # Check if jobs need to be scheduled; schedule up to 1 job per iteration
                    schedule_bg_hash_job()

                    # Refresh all job statuses and update status feedback
                    completed_job_status = [x.refresh_status() for x in job_list]
                    completed_bytes = sum([s.total_bytes for s, c in zip(job_list, completed_job_status)
                                           if c is True])
                    update_feedback(f"Please wait while file contents are analyzed. "
                                    f"{format_size(completed_bytes)} of {format_size(total_bytes)} complete...",
                                    percent_complete=(float(completed_bytes) / float(total_bytes)) * 100)

                    # Check if all jobs have completed or permanently failed
                    completed_or_failed = sum([(x.is_complete or (x.failure_count >= 3)) for x in job_list])
                    if completed_or_failed == len(job_list):
                        break

                    # Update once per second
                    time.sleep(1)

                # Manually complete update process for updated/created files
                failed_files = list()
                for job in job_list:
                    if job.is_complete:
                        for f, h, fh in zip(job.file_list, job.get_hash_result(), job.get_fast_hash_result()):
                            if not fh or not h:
                                failed_files.append(f)
                                continue

                            _, file_bytes, mtime = fh.split("||")
                            manifest._manifest_io.add_or_update(f, h, mtime, file_bytes)
                    else:
                        failed_files.extend(job.file_list)

                # Message for hard failures
                if failed_files:
                    detail_msg = f"The following files failed to hash. Try re-uploading the files again:\n"
                    detail_file_list = " \n".join(failed_files)
                    detail_msg = f"{detail_msg}{detail_file_list}"
                    update_feedback(f"An error occurred while processing some files. Check details and re-upload.",
                                    has_failures=True, failure_detail=detail_msg)

            if status.deleted:
                manifest.hasher.delete_fast_hashes(status.deleted)
                for relative_path in status.deleted:
                    manifest._manifest_io.remove(relative_path)

            manifest._manifest_io.persist()

            # Complete sweep operation
            manifest.sweep_all_changes(status=status, upload=True)

    except Exception as err:
        logger.error(f"(Job {p}) Error in complete_dataset_upload_transaction job")
        logger.exception(err)
        raise
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None,
                           labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False,
                           keys: Optional[List[str]] = None,
                           config_file: str = None) -> None:
    """Method to download files from a dataset in the background and provide status to the UI.

    This job schedules `pull_objects` jobs after splitting up the download work into batches. At the end, the job
    removes any partially downloaded files (due to failures) and links all the files for the dataset.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    dispatcher_obj = Dispatcher()

    def update_feedback(msg: str, has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # This is a normal dataset, load repo from the working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(keys, pull_all=all_keys)

        failure_keys = list()
        if key_batches:
            # Schedule jobs for batches
            bg_jobs = list()
            for keys in key_batches:
                job_kwargs = {
                    'keys': keys,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'labbook_owner': labbook_owner,
                    'labbook_name': labbook_name,
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'pull_objects'
                }

                job_key = dispatcher_obj.dispatch_task(method_reference=pull_objects,
                                                       kwargs=job_kwargs,
                                                       metadata=job_metadata,
                                                       persist=True)
                bg_jobs.append(BackgroundDownloadJob(dispatcher_obj, keys, job_key))

            update_feedback(f"Please wait - Downloading {num_files} files ({format_size(total_bytes)}) - 0% complete",
                            percent_complete=0, has_failures=False)
            logger.info(f"(Job {p}) Starting file downloads for"
                        f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(key_batches)} jobs")

            while sum([(x.is_complete or x.is_failed) for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum([j.completed_bytes for j in bg_jobs])
                pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                update_feedback(f"Please wait - Downloading {num_files} files ({format_size(total_completed_bytes)} of "
                                f"{format_size(total_bytes)}) - {round(pc)}% complete",
                                percent_complete=pc)
                time.sleep(1)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # Whole job failed...assume entire batch should get re-uploaded for now
                    failure_keys.extend(j.keys)
                else:
                    failure_keys.extend(j.get_failed_keys())

        # Set final status for UI
        if len(failure_keys) == 0:
            update_feedback(f"Download complete!", percent_complete=100, has_failures=False)
        else:
            failure_str = ""
            for f in failure_keys:
                # If any failed files partially downloaded, remove them.
                abs_dataset_path = os.path.join(m.current_revision_dir, f)
                abs_object_path = m.dataset_to_object_path(f)
                if os.path.exists(abs_dataset_path):
                    os.remove(abs_dataset_path)
                if os.path.exists(abs_object_path):
                    os.remove(abs_object_path)
                failure_str = f"{failure_str}{f}\n"

            failure_detail_str = f"Files that failed to download:\n{failure_str}"
            update_feedback("", has_failures=True, failure_detail=failure_detail_str)

        # Link dataset files, so anything that was successfully pulled will materialize
        m.link_revision()

        if len(failure_keys) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            raise IOError(f"{len(failure_keys)} file(s) failed to download. Check message detail and try again.")

    except Exception as err:
        logger.exception(err)
        raise
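# Dispatch sketch (illustrative): the download itself is typically scheduled as a
# background job; the tokens and names below are hypothetical placeholders.
Dispatcher().dispatch_task(
    method_reference=download_dataset_files,
    kwargs={'logged_in_username': 'alice',
            'access_token': '<access-token>',
            'id_token': '<id-token>',
            'dataset_owner': 'alice',
            'dataset_name': 'my-dataset',
            'all_keys': True},
    metadata={'dataset': 'alice|alice|my-dataset', 'method': 'download_dataset_files'})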
def pull_objects(keys: List[str], logged_in_username: str, access_token: str, id_token: str,
                 dataset_owner: str, dataset_name: str,
                 labbook_owner: Optional[str] = None,
                 labbook_name: Optional[str] = None,
                 config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend.

    This runs the IOManager.pull_objects() method with `link_revision=False`. This is because this job can be run
    in parallel multiple times with different sets of keys. You don't want to link until the very end, which is
    handled in the `download_dataset_files` job, which is what scheduled this job.

    Args:
        keys: List of file keys to download
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting pull_objects(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # This is a normal dataset, load repo from the working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.pull_objects(keys=keys, progress_update_fn=progress_update_callback,
                                  link_revision=False)

        job = get_current_job()
        if job:
            job.meta['failure_keys'] = ",".join([x.dataset_path for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
def push_dataset_objects(objs: List[PushObject], logged_in_username: str, access_token: str,
                         id_token: str, dataset_owner: str, dataset_name: str,
                         config_file: str = None) -> None:
    """Method to push a collection of objects to a dataset's backend

    Args:
        objs: List of PushObject records to push
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to push
        dataset_name: Name of the dataset containing the files to push
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting push_dataset_objects(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.push_objects(objs, progress_update_fn=progress_update_callback)

        job = get_current_job()
        if job:
            job.meta['failures'] = ",".join([f"{x.object_path}|{x.dataset_path}|{x.revision}"
                                             for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
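# Consumer sketch (illustrative): failed pushes are serialized into job metadata as
# "object_path|dataset_path|revision" triples joined by commas, so a caller can recover
# records for retry. `job_status` is a hypothetical completed-job handle, and constructing
# PushObject from these three fields is an assumption for illustration.
failures = list()
if job_status.meta.get('failures'):
    for entry in job_status.meta['failures'].split(','):
        object_path, dataset_path, revision = entry.split('|')
        failures.append(PushObject(object_path=object_path,
                                   dataset_path=dataset_path,
                                   revision=revision))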