def test_compute_push_batches(self, mock_dataset_with_manifest_bg_tests):
    """Test compute push batches, verifying it works OK when you've deleted some files"""
    ds, manifest, working_dir = mock_dataset_with_manifest_bg_tests
    io_mgr = IOManager(ds, manifest)
    rev = manifest.dataset_revision
    cache_root = manifest.cache_mgr.cache_root

    os.makedirs(os.path.join(cache_root, rev, "other_dir"))

    # Seed the revision with a mix of small files and one file large enough
    # to force a second push batch.
    fixture_files = [
        ("other_dir/test3.txt", "test content 3"),
        ("test1.txt", "test" * 4300000),
        ("test2.txt", "test content 2"),
        ("test4.txt", "test content 4"),
        ("test5.txt", "test content 5"),
    ]
    for rel_path, contents in fixture_files:
        helper_append_file(cache_root, rev, rel_path, contents)

    manifest.sweep_all_changes()
    assert len(manifest.manifest) == 6

    # remove a file from the manifest
    manifest.delete(['test5.txt'])
    assert len(manifest.manifest) == 5

    key_batches, total_bytes, num_files = io_mgr.compute_push_batches()

    # Deleted file is excluded; the four small files share one batch and the
    # big file lands alone in the second.
    assert num_files == 5
    assert total_bytes == (4 * 4300000) + (14 * 4)
    assert len(key_batches) == 2
    assert len(key_batches[0]) == 4
    assert len(key_batches[1]) == 1
    assert key_batches[1][0].dataset_path == 'test1.txt'
def _push_dataset_objects(self, logged_in_username: str, feedback_callback: Callable,
                          access_token, id_token) -> None:
    """Schedule background jobs to push this dataset's objects and wait for completion.

    Batches of objects are dispatched as background upload jobs, progress is
    streamed to the UI via `feedback_callback`, and any objects that fail are
    re-queued for a later push attempt.

    Args:
        logged_in_username: username of the currently logged-in user
        feedback_callback: callable used to push status/progress messages to the UI
        access_token: bearer token for the remote backend
        id_token: identity token for the remote backend

    Returns:
        None

    Raises:
        IOError: if one or more files fail to upload (so the UI sees a non-zero result)
    """
    dispatcher_obj = Dispatcher()
    try:
        self.dataset.backend.set_default_configuration(logged_in_username,
                                                       access_token, id_token)
        m = Manifest(self.dataset, logged_in_username)
        iom = IOManager(self.dataset, m)

        obj_batches, total_bytes, num_files = iom.compute_push_batches()

        # Initialized up front so the final failure check below cannot hit a
        # NameError when there is nothing to push (obj_batches is empty).
        failure_keys: List[str] = list()

        if obj_batches:
            # Notify the UI once, before scheduling. (Previously this fired
            # inside the batch loop, repeating the same message per batch.)
            feedback_callback(f"Preparing to upload {num_files} files. Please wait...")

            # Schedule jobs for batches
            bg_jobs = list()
            for objs in obj_batches:
                job_kwargs = {
                    'objs': objs,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': self.dataset.namespace,
                    'dataset_name': self.dataset.name,
                    'config_file': self.dataset.client_config.config_file,
                }
                job_metadata = {
                    'dataset': f"{logged_in_username}|{self.dataset.namespace}|{self.dataset.name}",
                    # BUGFIX: was 'pull_objects' — a copy-paste from the pull
                    # path; this job pushes objects.
                    'method': 'push_objects'
                }

                job_key = dispatcher_obj.dispatch_task(
                    method_reference=gtmcore.dispatcher.dataset_jobs.push_dataset_objects,
                    kwargs=job_kwargs, metadata=job_metadata, persist=True)
                bg_jobs.append(BackgroundUploadJob(dispatcher_obj, objs, job_key))
                logger.info(f"Schedule dataset object upload job for"
                            f" {logged_in_username}/{self.dataset.namespace}/{self.dataset.name} with"
                            f" {len(objs)} objects to upload")

            # Poll until every job is either complete or failed.
            while sum([(x.is_complete or x.is_failed) for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum([j.completed_bytes for j in bg_jobs])
                if total_completed_bytes > 0:
                    pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                    feedback_callback(
                        f"Please wait - Uploading {num_files} files ({format_size(total_completed_bytes)}"
                        f" of {format_size(total_bytes)}) - {round(pc)}% complete",
                        percent_complete=pc)
                time.sleep(1)

            # if you get here, all jobs are done or failed.
            # Remove all the push files so they can be regenerated if needed
            for f in glob.glob(f'{iom.push_dir}/*'):
                os.remove(f)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # Background job hard failed. Assume entire batch should get re-uploaded
                    for obj in j.objs:
                        failure_keys.append(f"{obj.dataset_path} at {obj.revision[0:8]}")
                        m.queue_to_push(obj.object_path, obj.dataset_path, obj.revision)
                else:
                    for obj in j.get_failed_objects():
                        # Some individual objects failed
                        failure_keys.append(f"{obj.dataset_path} at {obj.revision[0:8]}")
                        m.queue_to_push(obj.object_path, obj.dataset_path, obj.revision)

            # Set final status for UI
            if len(failure_keys) == 0:
                feedback_callback(f"Upload complete!", percent_complete=100,
                                  has_failures=False)
            else:
                failure_str = "\n".join(failure_keys)
                failure_detail_str = f"Files that failed to upload:\n{failure_str}"
                feedback_callback("", percent_complete=100, has_failures=True,
                                  failure_detail=failure_detail_str)

        # Finish up by linking everything just in case
        iom.manifest.link_revision()

        if len(failure_keys) > 0:
            # If any uploads failed, exit non-zero so the UI knows there was an error
            raise IOError(
                f"{len(failure_keys)} file(s) failed to upload. Check message detail for more information"
                " and try to sync again.")
    except Exception as err:
        logger.exception(err)
        raise