def pull_objects(keys: List[str], logged_in_username: str, access_token: str, id_token: str,
                 dataset_owner: str, dataset_name: str, labbook_owner: Optional[str] = None,
                 labbook_name: Optional[str] = None, config_file: str = None) -> None:
    """Background job that pulls a collection of objects from a dataset's backend.

    The pull goes through `IOManager.pull_objects()` with `link_revision=False`.
    Several instances of this job may run in parallel, each with its own set of
    keys, so linking is deferred to the very end; that final step is handled by
    the `download_dataset_files` job, which is what schedules this job.

    When a job context is available, the job metadata is updated with:
      - 'completed_bytes': running byte total, bumped by the progress callback
      - 'failure_keys': comma-joined `dataset_path` of every failed object
      - 'message': the message carried by the pull result

    Args:
        keys: List of file keys to download
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        config_file: config file handed to InventoryManager (used for test mocking)

    Returns:
        None

    Raises:
        Exception: anything raised while loading or pulling is logged and re-raised
    """
    logger = LMLogger.get_logger()

    def _report_progress(completed_bytes: int) -> None:
        """Add `completed_bytes` to the running total stored in the current job's metadata"""
        running_job = get_current_job()
        if running_job:
            # First report for this job: start the counter at zero
            running_job.meta.setdefault('completed_bytes', 0)
            running_job.meta['completed_bytes'] = int(running_job.meta['completed_bytes']) + completed_bytes
            running_job.save_meta()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting pull_objects(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}")

        im = InventoryManager(config_file=config_file)

        # Both labbook identifiers are required for the dataset to be treated as linked
        is_linked = labbook_owner is not None and labbook_name is not None
        if not is_linked:
            # Normal dataset: load the repository from the working directory
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)
        else:
            # Linked dataset: load the repository from inside the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)

        iom = IOManager(ds, Manifest(ds, logged_in_username))

        # link_revision=False: linking is left to the job that scheduled this one
        result = iom.pull_objects(keys=keys,
                                  progress_update_fn=_report_progress,
                                  link_revision=False)

        job = get_current_job()
        if job:
            job.meta['failure_keys'] = ",".join(failed.dataset_path for failed in result.failure)
            job.meta['message'] = result.message
            job.save_meta()
    except Exception as err:
        logger.exception(err)
        raise
def test_pull_objects(self, mock_dataset_with_manifest):
    """Pull two objects back one at a time through mocked HTTP endpoints.

    Two files are added to the dataset and swept into the manifest, their
    object files are compressed away to temporary locations, and each object is
    then pulled individually while the presigned-url and download requests are
    served by `aioresponses`.
    """
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)

    revision = manifest.dataset_revision
    cache_root = manifest.cache_mgr.cache_root
    os.makedirs(os.path.join(cache_root, revision, "other_dir"))
    helper_append_file(cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2

    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path
    # The object id is the final path component of the object path
    _, obj_id_1 = obj1_target.rsplit('/', 1)
    _, obj_id_2 = obj2_target.rsplit('/', 1)

    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
    obj2_source = os.path.join('/tmp', uuid.uuid4().hex)
    target_source_pairs = ((obj1_target, obj1_source), (obj2_target, obj2_source))

    for target, _ in target_source_pairs:
        assert os.path.exists(target) is True

    for target, source in target_source_pairs:
        helper_compress_file(target, source)

    # After compression the object files are gone and only the sources remain
    for target, _ in target_source_pairs:
        assert os.path.isfile(target) is False
    for _, source in target_source_pairs:
        assert os.path.isfile(source) is True

    with aioresponses() as mocked_responses:

        def _mock_object_download(obj_id, source_path):
            """Register the presigned-url lookup, then the download, for one object"""
            mocked_responses.get(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id}',
                                 payload={
                                     "presigned_url": f"https://dummyurl.com/{obj_id}?params=1",
                                     "namespace": ds.namespace,
                                     "obj_id": obj_id,
                                     "dataset": ds.name
                                 },
                                 status=200)

            with open(source_path, 'rb') as compressed:
                mocked_responses.get(f"https://dummyurl.com/{obj_id}?params=1",
                                     body=compressed.read(), status=200,
                                     content_type='application/octet-stream')

        _mock_object_download(obj_id_1, obj1_source)
        _mock_object_download(obj_id_2, obj2_source)

        assert len(glob.glob(f'{iom.push_dir}/*')) == 1
        iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

        # First pull: only test1.txt should be restored
        result = iom.pull_objects(keys=["test1.txt"])
        assert len(glob.glob(f'{iom.push_dir}/*')) == 1
        assert len(result.success) == 1
        assert len(result.failure) == 0
        assert result.success[0].object_path == obj_to_push[0].object_path

        assert os.path.isfile(obj1_target) is True
        assert os.path.isfile(obj2_target) is False
        with open(obj1_target, 'rt') as dd:
            assert "test content 1" == dd.read()

        # Second pull: test2.txt is restored and test1.txt is left intact
        result = iom.pull_objects(keys=["test2.txt"])
        assert len(glob.glob(f'{iom.push_dir}/*')) == 1
        assert len(result.success) == 1
        assert len(result.failure) == 0
        assert result.success[0].object_path == obj_to_push[1].object_path

        assert os.path.isfile(obj1_target) is True
        assert os.path.isfile(obj2_target) is True
        with open(obj1_target, 'rt') as dd:
            assert "test content 1" == dd.read()
        with open(obj2_target, 'rt') as dd:
            assert "test content 2" == dd.read()
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None):
    """Background job that downloads files of a dataset from its backend.

    Either every remaining file is pulled (`all_keys=True`) or only the files
    listed in `keys`. When a job context is available, the `dataset_path` of
    each successful and failed object is stored in the job metadata under
    'success_keys' and 'failure_keys', and status messages from the pull are
    appended to the 'feedback' metadata entry.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download

    Returns:
        None. If any download failed the process exits via `sys.exit(-1)`.

    Raises:
        ValueError: if `all_keys` is falsy and `keys` is empty or None
        Exception: any other error is logged and re-raised
    """
    def _append_feedback(msg):
        """Append `msg` as a new line to the 'feedback' entry of the current job's metadata"""
        job = get_current_job()
        if job:
            if 'feedback' in job.meta:
                job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
            else:
                job.meta['feedback'] = msg
            job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager()

        # Both labbook identifiers are required for the dataset to be treated as linked
        is_linked = labbook_owner is not None and labbook_name is not None
        if not is_linked:
            # Normal dataset: load the repository from the working directory
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)
        else:
            # Linked dataset: load the repository from inside the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)

        iom = IOManager(ds, Manifest(ds, logged_in_username))

        # Nothing was requested: raised inside the try so it is logged before propagating
        if not all_keys and not keys:
            raise ValueError("Must provide a list of keys or set all_keys=True")

        # all_keys takes precedence over an explicit key list
        result = (iom.pull_all(status_update_fn=_append_feedback)
                  if all_keys
                  else iom.pull_objects(keys=keys, status_update_fn=_append_feedback))

        # Record the outcome per dataset path in the job metadata
        # (the original note says the UI uses this to re-fetch as needed)
        job = get_current_job()
        if job:
            job.meta['success_keys'] = [pulled.dataset_path for pulled in result.success]
            job.meta['failure_keys'] = [failed.dataset_path for failed in result.failure]
            job.save_meta()

        failure_count = len(result.failure)
        if failure_count > 0:
            # Exit non-zero so the UI knows there was an error; SystemExit is not an
            # Exception subclass, so the handler below does not intercept it
            sys.exit(-1)
    except Exception as err:
        logger.exception(err)
        raise