def process_linked_datasets(labbook: LabBook, logged_in_username: str) -> None:
    """Update or init linked dataset submodule references, clean up lingering
    files, and schedule background jobs to auto-import datasets if needed.

    Args:
        labbook: the labbook to analyze
        logged_in_username: the current logged in username

    Returns:
        None
    """
    inventory = InventoryManager(config_file=labbook.client_config.config_file)

    # Bring linked dataset submodules up to date (or remove stale ones)
    inventory.update_linked_datasets(labbook, logged_in_username, init=True)

    # For every dataset still linked, schedule a background auto-import check
    dispatcher = Dispatcher()
    for linked_ds in inventory.get_linked_datasets(labbook):
        task_kwargs = {'logged_in_username': logged_in_username,
                       'dataset_owner': linked_ds.namespace,
                       'dataset_name': linked_ds.name,
                       'remote_url': linked_ds.remote}
        task_metadata = {'dataset': f"{logged_in_username}|{linked_ds.namespace}|{linked_ds.name}",
                         'method': 'dataset_jobs.check_and_import_dataset'}
        dispatcher.dispatch_task(gtmcore.dispatcher.dataset_jobs.check_and_import_dataset,
                                 kwargs=task_kwargs,
                                 metadata=task_metadata)
def test_fail_dependent_job(self):
    """A job whose dependency fails must stay deferred and never run."""
    dispatcher = Dispatcher()
    parent = dispatcher.dispatch_task(bg_jobs.test_exit_fail)
    child = dispatcher.dispatch_task(bg_jobs.test_exit_success, dependent_job=parent)

    # Give the worker time to pick up and fail the parent job
    time.sleep(3)

    assert dispatcher.query_task(parent).status == 'failed'
    assert dispatcher.query_task(child).status == 'deferred'
def test_simple_dependent_job(self):
    """A dependent job stays deferred until its parent finishes, then runs."""
    dispatcher = Dispatcher()
    parent = dispatcher.dispatch_task(bg_jobs.test_sleep, args=(2, ))
    child = dispatcher.dispatch_task(bg_jobs.test_exit_success, dependent_job=parent)

    # While the parent is still sleeping, the child must be deferred
    time.sleep(0.5)
    assert dispatcher.query_task(child).status == 'deferred'

    # After the parent completes, both jobs should be finished
    time.sleep(3)
    assert dispatcher.query_task(parent).status == 'finished'
    assert dispatcher.query_task(child).status == 'finished'

    parent_status = dispatcher.query_task(parent)
    assert parent_status.meta.get('sample') == 'test_sleep metadata'
def test_basic_bursting(self):
    """Worker count should burst above the baseline under load, then shrink back."""
    service = worker.WorkerService()
    assert service.is_bursting is False

    # Baseline worker count before any load is applied
    baseline = len(service.get_all_workers(worker.QUEUE_DEFAULT))

    dispatcher = Dispatcher()
    for _ in range(9):
        dispatcher.dispatch_task(jobs.test_sleep, args=(5.2, ))

    # Poll until bursting kicks in and extra workers appear
    for _ in range(11):
        print(service.is_bursting, len(service.get_all_workers(worker.QUEUE_DEFAULT)), baseline)
        if service.is_bursting and len(service.get_all_workers(worker.QUEUE_DEFAULT)) > baseline:
            print('--- break')
            break
        pprint.pprint(service.query())
        time.sleep(1)
    else:
        assert False, "Expected to find worker bursting"

    # Wait for all BG tasks to finish.
    for _ in range(6):
        if service.is_bursting:
            time.sleep(1)

    # Assert the count of workers goes back to the original amount
    # when bursting is done.
    assert len(service.get_all_workers(worker.QUEUE_DEFAULT)) == baseline
    assert service.is_bursting is False
def mutate_and_get_payload(cls, root, info, owner, labbook_name, pull_only=False,
                           override_method="abort", client_mutation_id=None):
    """Sync a LabBook with its remote in a background job and return the job key."""
    username = get_logged_in_username()
    lb = InventoryManager().load_labbook(username, owner, labbook_name,
                                         author=get_logged_in_author())

    # Pull the Bearer token out of the request headers, if one was supplied
    token = None
    if hasattr(info.context.headers, 'environ'):
        if "HTTP_AUTHORIZATION" in info.context.headers.environ:
            token = parse_token(info.context.headers.environ["HTTP_AUTHORIZATION"])

    if not token:
        raise ValueError("Authorization header not provided. "
                         "Must have a valid session to query for collaborators")

    default_remote = lb.client_config.config['git']['default_remote']

    # Find the admin service configured for the default remote
    admin_service = None
    for remote_name in lb.client_config.config['git']['remotes']:
        if default_remote == remote_name:
            admin_service = lb.client_config.config['git']['remotes'][remote_name]['admin_service']
            break

    if not admin_service:
        raise ValueError('admin_service could not be found')

    # Configure git creds
    gitlab_mgr = GitLabManager(default_remote, admin_service, access_token=token)
    gitlab_mgr.configure_git_credentials(default_remote, username)

    override = MergeOverride(override_method)

    dispatcher = Dispatcher()
    job_key = dispatcher.dispatch_task(
        jobs.sync_repository,
        kwargs={'repository': lb,
                'pull_only': pull_only,
                'username': username,
                'override': override},
        metadata={'method': 'sync_labbook', 'labbook': lb.key})

    logger.info(f"Syncing LabBook {lb.root_dir} in background job with key {job_key.key_str}")

    return SyncLabbook(job_key=job_key.key_str)
def mutate_and_get_payload(cls, root, info, dataset_owner, dataset_name,
                           labbook_owner=None, labbook_name=None, client_mutation_id=None):
    """Schedule a background job that verifies a dataset's local contents."""
    logged_in_user = get_logged_in_username()

    # Schedule Job to clear file cache if dataset is no longer in use
    dispatcher = Dispatcher()
    job_key = dispatcher.dispatch_task(
        jobs.verify_dataset_contents,
        metadata={'method': 'verify_dataset_contents'},
        kwargs={'logged_in_username': logged_in_user,
                'access_token': flask.g.access_token,
                'id_token': flask.g.id_token,
                'dataset_owner': dataset_owner,
                'dataset_name': dataset_name,
                'labbook_owner': labbook_owner,
                'labbook_name': labbook_name})

    logger.info(f"Dispatched verify_dataset_contents({dataset_owner}/{dataset_name}) to Job {job_key}")

    return VerifyDataset(background_job_key=job_key)
def test_query_finished_task(self, fixture_working_dir):
    """Query jobStatus for a successfully finished background job."""
    dispatcher = Dispatcher()
    job_key = dispatcher.dispatch_task(jobs.test_exit_success)
    time.sleep(1)

    query = """
    {
        jobStatus(jobId: "%s") {
            result
            status
            jobMetadata
            failureMessage
            startedAt
            finishedAt
        }
    }
    """ % job_key.key_str

    r = fixture_working_dir[2].execute(query)
    assert 'errors' not in r

    status = r['data']['jobStatus']
    assert int(status['result']) == 0
    assert status['status'] == 'finished'
    assert status['startedAt'] is not None
    assert status['failureMessage'] is None
    assert status['finishedAt']
    assert status['jobMetadata'] == '{}'
def mutate_and_get_payload(cls, root, info, owner, labbook_name, no_cache=False,
                           client_mutation_id=None):
    """Schedule a background build of the Project's Docker image."""
    username = get_logged_in_username()

    # Refuse to rebuild while the project container is running
    if BuildImage.get_container_status(labbook_name, owner, username):
        raise ValueError(f'Cannot build image for running container {owner}/{labbook_name}')

    lb = InventoryManager().load_labbook(username, owner, labbook_name,
                                         author=get_logged_in_author())

    # Generate Dockerfile
    # TODO BVB - Move to build_image ??
    dockerfile_builder = ImageBuilder(lb)
    dockerfile_builder.assemble_dockerfile(write=True)

    # Kick off building in a background thread
    dispatcher = Dispatcher()
    job_key = dispatcher.dispatch_task(
        jobs.build_labbook_image,
        kwargs={'path': lb.root_dir,
                'username': username,
                'nocache': no_cache},
        metadata={'labbook': lb.key, 'method': 'build_image'})

    return BuildImage(environment=Environment(owner=owner, name=labbook_name),
                      background_job_key=job_key.key_str)
def test_query_failed_task(self, fixture_working_dir):
    """Query jobStatus for a background job that raised an exception.

    Fix: interpolate the job key string (`job_id.key_str`) into the GraphQL
    query rather than the JobKey object itself, matching the sibling
    finished-task test and avoiding reliance on JobKey.__str__.
    """
    d = Dispatcher()
    job_id = d.dispatch_task(jobs.test_exit_fail)
    time.sleep(1)

    query = """
    {
        jobStatus(jobId: "%s") {
            result
            status
            jobMetadata
            failureMessage
            startedAt
            finishedAt
        }
    }
    """ % job_id.key_str

    r = fixture_working_dir[2].execute(query)
    assert 'errors' not in r
    assert r['data']['jobStatus']['result'] is None
    assert r['data']['jobStatus']['status'] == 'failed'
    assert r['data']['jobStatus']['failureMessage'] == \
        'Exception: Intentional Exception from job `test_exit_fail`'
    assert r['data']['jobStatus']['startedAt'] is not None
    assert r['data']['jobStatus']['finishedAt'] is not None

    # Assert the following dict is empty
    assert not json.loads(r['data']['jobStatus']['jobMetadata'])
def test_abort(self):
    """Abort a running job: the job's process must die while the worker survives."""
    d = Dispatcher()
    job_ref_1 = d.dispatch_task(bg_jobs.test_sleep, args=(3, ))
    time.sleep(1.2)
    # The sleep job must already be running before we try to abort it
    assert d.query_task(job_ref_1).status == 'started'
    workers = rq.Worker.all(connection=d._redis_conn)
    wk = [w for w in workers if w.state == 'busy']
    assert len(wk) == 1, "There must be precisely one busy worker"
    # The job stores its OS pid in job.meta so it can be checked after abort
    job_pid = wk[0].get_current_job().meta['pid']
    d.abort_task(job_ref_1)
    time.sleep(0.1)
    j = d.query_task(job_ref_1)

    # There should be no result, cause it was cancelled
    assert j.result is None

    # RQ should identify the task as failed
    assert j.status == "failed"

    # Assert the JOB pid is gone (os.kill with signal 0 only probes existence)
    with pytest.raises(OSError):
        os.kill(int(job_pid), 0)

    # Now assert the worker pid is still alive (so it can be assigned something else)
    worker_pid = wk[0].pid
    try:
        os.kill(int(worker_pid), 0)
        assert True, "Worker process is still hanging around."
    except OSError:
        assert False, "Worker process is killed"
def mutate_and_get_payload(cls, root, info, owner, labbook_name, confirm, client_mutation_id=None):
    """Delete a Project (LabBook): stop its container, remove its Docker image,
    delete it from disk, and schedule cache cleanup for any linked datasets.

    When `confirm` is False this is a dry run and nothing is deleted.
    """
    username = get_logged_in_username()
    lb = InventoryManager().load_labbook(username, owner, labbook_name,
                                         author=get_logged_in_author())
    if confirm:
        logger.info(f"Deleting {str(lb)}...")
        try:
            # Best-effort stop; a failure to stop is only logged, not fatal
            lb, stopped = ContainerOperations.stop_container(labbook=lb, username=username)
        except OSError as e:
            logger.warning(e)
        lb, docker_removed = ContainerOperations.delete_image(labbook=lb, username=username)
        if not docker_removed:
            # Refuse to remove on-disk data while the image still exists
            raise ValueError(f'Cannot delete docker image for {str(lb)} - unable to delete Project from disk')
        datasets_to_schedule = InventoryManager().delete_labbook(username, owner, labbook_name)

        # Schedule jobs to clean the file cache for any linked datasets (if no other references exist)
        for cleanup_job in datasets_to_schedule:
            # Schedule Job to clear file cache if dataset is no longer in use
            job_metadata = {'method': 'clean_dataset_file_cache'}
            job_kwargs = {
                'logged_in_username': username,
                'dataset_owner': cleanup_job.namespace,
                'dataset_name': cleanup_job.name,
                'cache_location': cleanup_job.cache_root
            }
            dispatcher = Dispatcher()
            job_key = dispatcher.dispatch_task(jobs.clean_dataset_file_cache,
                                               metadata=job_metadata,
                                               kwargs=job_kwargs)
            logger.info(f"Dispatched clean_dataset_file_cache({ cleanup_job.namespace}/{cleanup_job.name})"
                        f" to Job {job_key}")

        # Verify Delete worked
        if os.path.exists(lb.root_dir):
            logger.error(f'Deleted {str(lb)} but root directory {lb.root_dir} still exists!')
            return DeleteLabbook(success=False)
        else:
            return DeleteLabbook(success=True)
    else:
        logger.info(f"Dry run in deleting {str(lb)} -- not deleted.")
        return DeleteLabbook(success=False)
def test_failing_task(self):
    """A job raising an exception is reported as failed with its message."""
    dispatcher = Dispatcher()
    key = dispatcher.dispatch_task(bg_jobs.test_exit_fail)
    time.sleep(1)

    status = dispatcher.query_task(key)
    assert status
    assert status.status == 'failed'
    assert status.failure_message == 'Exception: Intentional Exception from job `test_exit_fail`'
def test_query_failed_tasks(self):
    """A failed job appears in failed_jobs and not in finished_jobs."""
    dispatcher = Dispatcher()
    key = dispatcher.dispatch_task(bg_jobs.test_exit_fail)
    time.sleep(1)

    failed_keys = [j.job_key for j in dispatcher.failed_jobs]
    finished_keys = [j.job_key for j in dispatcher.finished_jobs]
    assert key in failed_keys
    assert key not in finished_keys

    status = dispatcher.query_task(key)
    assert status.failure_message == 'Exception: Intentional Exception from job `test_exit_fail`'
def mutate_and_get_payload(cls, root, info, dataset_owner, dataset_name,
                           labbook_name=None, labbook_owner=None, all_keys=None,
                           keys=None, client_mutation_id=None):
    """Schedule a background job that downloads dataset files (optionally for a
    dataset linked into a Project)."""
    logged_in_username = get_logged_in_username()
    im = InventoryManager()

    lb = None
    if labbook_name:
        # This is a linked dataset, load repo from the Project
        lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
        dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
        ds = im.load_dataset_from_directory(dataset_dir)
    else:
        # this is a normal dataset. Load repo from working dir
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

    # Gen unique keys for tracking jobs
    lb_key = f"{logged_in_username}|{labbook_owner}|{labbook_name}" if lb else None
    ds_key = f"{logged_in_username}|{dataset_owner}|{dataset_name}"
    if lb_key:
        ds_key = f"{lb_key}|LINKED|{ds_key}"

    dispatcher = Dispatcher()
    job_key = dispatcher.dispatch_task(
        jobs.download_dataset_files,
        kwargs={'logged_in_username': logged_in_username,
                'access_token': flask.g.access_token,
                'id_token': flask.g.id_token,
                'dataset_owner': dataset_owner,
                'dataset_name': dataset_name,
                'labbook_owner': labbook_owner,
                'labbook_name': labbook_name,
                'all_keys': all_keys,
                'keys': keys},
        metadata={'dataset': ds_key,
                  'labbook': lb_key,
                  'method': 'download_dataset_files'})

    return DownloadDatasetFiles(background_job_key=job_key.key_str)
def test_get_background_jobs_basics(self, fixture_working_dir_env_repo_scoped):
    """Query backgroundJobs and verify failed, finished, and still-running jobs all appear."""
    d = Dispatcher()
    time.sleep(0.25)
    # One job of each terminal/active state: failed, finished, and started (sleeping)
    t1 = d.dispatch_task(jobs.test_exit_fail).key_str
    t2 = d.dispatch_task(jobs.test_exit_success).key_str
    t3 = d.dispatch_task(jobs.test_sleep, args=(5,)).key_str

    query = """
            {
                backgroundJobs {
                    edges {
                        node {
                            id
                            jobKey
                            failureMessage
                            status
                            result
                        }
                    }
                }
            }
            """
    time.sleep(1)
    try:
        time1 = time.time()
        result = fixture_working_dir_env_repo_scoped[2].execute(query)
        time2 = time.time()
        tdiff = time2 - time1
        # The query reads job metadata only, so it must return quickly
        assert tdiff < 0.5, "Query should not take more than a few millis (took {}s)".format(tdiff)

        assert any([t1 == x['node']['jobKey'] and 'failed' == x['node']['status']
                    and 'Exception: ' in x['node']['failureMessage']
                    for x in result['data']['backgroundJobs']['edges']])
        assert any([t2 == x['node']['jobKey'] and "finished" == x['node']['status']
                    and x['node']['failureMessage'] is None
                    for x in result['data']['backgroundJobs']['edges']])
        assert any([t3 == x['node']['jobKey'] and "started" == x['node']['status']
                    and x['node']['failureMessage'] is None
                    for x in result['data']['backgroundJobs']['edges']])
    finally:
        # Let the 5-second sleep job drain before the next test runs
        time.sleep(2)
def test_simple_task(self):
    """A trivial job runs to completion with result 0 and no failure message."""
    dispatcher = Dispatcher()
    key = dispatcher.dispatch_task(bg_jobs.test_exit_success)
    time.sleep(1)

    status = dispatcher.query_task(key)
    assert status
    assert status.status == 'finished'
    assert status.result == 0
    assert status.failure_message is None
    assert status.finished_at is not None
def mutate_and_get_payload(cls, root, info, owner, dataset_name, transaction_id,
                           cancel=False, rollback=False, client_mutation_id=None):
    """Finalize (or sweep an aborted) batch dataset upload in a background job."""
    logged_in_username = get_logged_in_username()
    logged_in_author = get_logged_in_author()
    ds = InventoryManager().load_dataset(logged_in_username, owner, dataset_name,
                                         author=logged_in_author)

    if cancel and rollback:
        # TODO: Add ability to reset
        raise ValueError("Currently cannot rollback a canceled upload.")
    else:
        logger.info(f"Done batch upload {transaction_id}, cancelled={cancel}")
        if cancel:
            logger.warning("Sweeping aborted batch upload.")

    dispatcher = Dispatcher()
    job_kwargs = {'logged_in_username': logged_in_username,
                  'logged_in_email': logged_in_author.email,
                  'dataset_owner': owner,
                  'dataset_name': dataset_name,
                  'dispatcher': Dispatcher}

    # Gen unique keys for tracking jobs
    job_metadata = {'dataset': f"{logged_in_username}|{owner}|{dataset_name}",
                    'method': 'complete_dataset_upload_transaction'}

    job_key = dispatcher.dispatch_task(dataset_jobs.complete_dataset_upload_transaction,
                                       kwargs=job_kwargs,
                                       metadata=job_metadata)

    return CompleteDatasetUploadTransaction(background_job_key=job_key.key_str)
def mutate_and_get_payload(cls, root, info, owner, labbook_name, remote_url, client_mutation_id=None):
    """Import a LabBook from a remote git repository via a background job."""
    username = get_logged_in_username()
    logger.info(f"Importing remote labbook from {remote_url}")
    # A bare LabBook instance is only used here to reach client configuration
    lb = LabBook(author=get_logged_in_author())
    default_remote = lb.client_config.config['git']['default_remote']
    # Find the admin service configured for the default remote
    admin_service = None
    for remote in lb.client_config.config['git']['remotes']:
        if default_remote == remote:
            admin_service = lb.client_config.config['git']['remotes'][remote]['admin_service']
            break

    # Extract valid Bearer token
    if hasattr(info.context, 'headers') and "HTTP_AUTHORIZATION" in info.context.headers.environ:
        token = parse_token(info.context.headers.environ["HTTP_AUTHORIZATION"])
    else:
        raise ValueError("Authorization header not provided. Must have a valid session to query for collaborators")

    # Configure git credentials so the clone can authenticate
    gl_mgr = GitLabManager(default_remote, admin_service=admin_service, access_token=token)
    gl_mgr.configure_git_credentials(default_remote, username)

    job_metadata = {'method': 'import_labbook_from_remote'}
    job_kwargs = {'remote_url': remote_url, 'username': username}
    dispatcher = Dispatcher()
    job_key = dispatcher.dispatch_task(jobs.import_labbook_from_remote,
                                       metadata=job_metadata, kwargs=job_kwargs)
    logger.info(f"Dispatched import_labbook_from_remote({remote_url}) to Job {job_key}")
    return ImportRemoteLabbook(job_key=job_key.key_str)
def dispatcher_mock(self, function_ref, kwargs, metadata):
    """Stand-in for Dispatcher.dispatch_task while patched: verifies the kwargs the
    mutation built, then un-patches and schedules the job for real.

    NOTE(review): `mock_config_file` and `dispatcher_patch` come from the enclosing
    test's scope — this is a closure, not a standalone method.
    """
    assert kwargs['logged_in_username'] == 'other-test-user2'
    assert kwargs['dataset_owner'] == 'testuser'
    assert kwargs['dataset_name'] == 'test-ds'

    # Inject mocked config file
    kwargs['config_file'] = mock_config_file[0]

    # Stop patching so job gets scheduled for real
    dispatcher_patch.stop()

    # Call same method as in mutation
    d = Dispatcher()
    res = d.dispatch_task(gtmcore.dispatcher.dataset_jobs.check_and_import_dataset,
                          kwargs=kwargs, metadata=metadata)

    return res
def mutate_and_process_upload(cls, info, upload_file_path, upload_filename, **kwargs):
    """Import a dataset from an uploaded zip archive via a background job."""
    if not upload_file_path:
        logger.error('No file uploaded')
        raise ValueError('No file uploaded')

    username = get_logged_in_username()

    dispatcher = Dispatcher()
    job_key = dispatcher.dispatch_task(
        jobs.import_dataset_from_zip,
        kwargs={'archive_path': upload_file_path,
                'username': username,
                'owner': username},
        metadata={'method': 'import_dataset_from_zip'})

    return ImportDataset(import_job_key=job_key.key_str)
def mutate_and_get_payload(cls, root, info, dataset_owner, dataset_name,
                           labbook_name=None, labbook_owner=None, all_keys=None,
                           keys=None, client_mutation_id=None):
    """Schedule a persistent background job to download dataset files."""
    logged_in_username = get_logged_in_username()

    # Gen unique keys for tracking jobs
    lb_key = f"{logged_in_username}|{labbook_owner}|{labbook_name}" if labbook_owner else None
    ds_key = f"{logged_in_username}|{dataset_owner}|{dataset_name}"
    if lb_key:
        ds_key = f"{lb_key}|LINKED|{ds_key}"

    dispatcher = Dispatcher()
    job_key = dispatcher.dispatch_task(
        dataset_jobs.download_dataset_files,
        kwargs={'logged_in_username': logged_in_username,
                'access_token': flask.g.access_token,
                'id_token': flask.g.id_token,
                'dataset_owner': dataset_owner,
                'dataset_name': dataset_name,
                'labbook_owner': labbook_owner,
                'labbook_name': labbook_name,
                'all_keys': all_keys,
                'keys': keys},
        metadata={'dataset': ds_key,
                  'labbook': lb_key,
                  'method': 'download_dataset_files'},
        persist=True)

    return DownloadDatasetFiles(background_job_key=job_key.key_str)
def dispatcher_mock(self, function_ref, kwargs, metadata):
    """Stand-in for Dispatcher.dispatch_task while patched: verifies the kwargs the
    mutation built, then un-patches and schedules the job for real.

    NOTE(review): `mock_create_dataset` and `dispatcher_patch` come from the
    enclosing test's scope — this is a closure, not a standalone method.
    """
    assert kwargs['logged_in_username'] == 'default'
    assert kwargs['logged_in_email'] == '*****@*****.**'
    assert kwargs['dataset_owner'] == 'default'
    assert kwargs['dataset_name'] == 'dataset1'

    # Inject mocked config file
    kwargs['config_file'] = mock_create_dataset[0]

    # Stop patching so job gets scheduled for real
    dispatcher_patch.stop()

    # Call same method as in mutation
    d = Dispatcher()
    # The real job expects the Dispatcher class itself so it can schedule follow-ups
    kwargs['dispatcher'] = Dispatcher
    res = d.dispatch_task(gtmcore.dispatcher.dataset_jobs.complete_dataset_upload_transaction,
                          kwargs=kwargs, metadata=metadata)

    return res
def mutate_and_get_payload(cls, root, info, owner, dataset_name, set_public=False,
                           client_mutation_id=None):
    """Publish a Dataset to the remote server in a background job."""
    # Load Dataset
    username = get_logged_in_username()
    ds = InventoryManager().load_dataset(username, owner, dataset_name,
                                         author=get_logged_in_author())

    # Extract valid Bearer token
    if "HTTP_AUTHORIZATION" in info.context.headers.environ:
        token = parse_token(info.context.headers.environ["HTTP_AUTHORIZATION"])
    else:
        raise ValueError("Authorization header not provided. Must have a valid session to query for collaborators")

    dispatcher = Dispatcher()
    job_key = dispatcher.dispatch_task(
        jobs.publish_repository,
        kwargs={'repository': ds,
                'username': username,
                'access_token': token,
                'id_token': flask.g.id_token,
                'public': set_public},
        metadata={'method': 'publish_dataset', 'dataset': ds.key})

    logger.info(f"Publishing Dataset {ds.root_dir} in background job with key {job_key.key_str}")

    return PublishDataset(job_key=job_key.key_str)
def mutate_and_get_payload(cls, root, info, owner, dataset_name, client_mutation_id=None):
    """Export a Dataset as a zip archive in a background job."""
    username = get_logged_in_username()
    working_directory = Configuration().config['git']['working_directory']
    ds = InventoryManager().load_dataset(username, owner, dataset_name,
                                         author=get_logged_in_author())

    dispatcher = Dispatcher()
    job_key = dispatcher.dispatch_task(
        jobs.export_dataset_as_zip,
        kwargs={'dataset_path': ds.root_dir,
                'ds_export_directory': os.path.join(working_directory, 'export')},
        metadata={'method': 'export_dataset_as_zip', 'dataset': ds.key})

    return ExportDataset(job_key=job_key.key_str)
def test_query_started_task(self, fixture_working_dir):
    """Query jobStatus for a background job that is still running."""
    d = Dispatcher()
    job_id = d.dispatch_task(jobs.test_sleep, args=(2, ))
    time.sleep(1)

    query = """
    {
        jobStatus(jobId: "%s") {
            result
            status
            jobMetadata
            failureMessage
            startedAt
            finishedAt
        }
    }
    """ % job_id

    try:
        r = fixture_working_dir[2].execute(query)
        pprint.pprint(r)
        assert 'errors' not in r
        # Still sleeping: no result or failure yet, but started and carrying metadata
        assert r['data']['jobStatus']['result'] is None
        assert r['data']['jobStatus']['status'] == 'started'
        assert r['data']['jobStatus']['failureMessage'] is None
        assert r['data']['jobStatus']['startedAt'] is not None
        assert json.loads(r['data']['jobStatus']['jobMetadata'])['sample'] == 'test_sleep metadata'
    finally:
        # Make sure all the jobs finish.
        time.sleep(3)
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None,
                           labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False,
                           keys: Optional[List[str]] = None,
                           config_file: Optional[str] = None) -> None:
    """Method to download files from a dataset in the background and provide status to the UI.

    This job schedules `pull_objects` jobs after splitting up the download work into batches. At the
    end, the job removes any partially downloaded files (due to failures) and links all the files for
    the dataset.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download
        config_file: config file (used for test mocking)

    Returns:
        None

    Raises:
        IOError: if any file failed to download after all batches completed
    """
    dispatcher_obj = Dispatcher()

    def update_feedback(msg: str, has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        # Use explicit `is not None` checks so falsy-but-meaningful values
        # (has_failures=False, percent_complete=0) are still written to the
        # metadata; truthiness checks silently dropped them.
        if has_failures is not None:
            current_job.meta['has_failures'] = has_failures
        if failure_detail is not None:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete is not None:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(keys, pull_all=all_keys)

        failure_keys = list()
        if key_batches:
            # Schedule a `pull_objects` job per batch. Note: a distinct loop
            # variable is used so the `keys` parameter is not clobbered.
            bg_jobs = list()
            for key_batch in key_batches:
                job_kwargs = {
                    'keys': key_batch,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'labbook_owner': labbook_owner,
                    'labbook_name': labbook_name,
                    'config_file': config_file,
                }
                job_metadata = {'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                                'method': 'pull_objects'}

                job_key = dispatcher_obj.dispatch_task(method_reference=pull_objects,
                                                       kwargs=job_kwargs,
                                                       metadata=job_metadata,
                                                       persist=True)
                bg_jobs.append(BackgroundDownloadJob(dispatcher_obj, key_batch, job_key))

            update_feedback(f"Please wait - Downloading {num_files} files ({format_size(total_bytes)}) - 0% complete",
                            percent_complete=0,
                            has_failures=False)
            logger.info(f"(Job {p}) Starting file downloads for"
                        f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(key_batches)} jobs")

            # Poll until every batch job is either complete or failed
            while sum([(x.is_complete or x.is_failed) for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                for j in bg_jobs:
                    j.refresh_status()
                total_completed_bytes = sum([j.completed_bytes for j in bg_jobs])
                pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                update_feedback(f"Please wait - Downloading {num_files} files ({format_size(total_completed_bytes)} of "
                                f"{format_size(total_bytes)}) - {round(pc)}% complete",
                                percent_complete=pc)
                time.sleep(1)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # Whole job failed...assume entire batch should get re-uploaded for now
                    failure_keys.extend(j.keys)
                else:
                    failure_keys.extend(j.get_failed_keys())

        # Set final status for UI
        if len(failure_keys) == 0:
            update_feedback(f"Download complete!", percent_complete=100, has_failures=False)
        else:
            failure_str = ""
            for f in failure_keys:
                # If any failed files partially downloaded, remove them.
                abs_dataset_path = os.path.join(m.current_revision_dir, f)
                abs_object_path = m.dataset_to_object_path(f)
                if os.path.exists(abs_dataset_path):
                    os.remove(abs_dataset_path)
                if os.path.exists(abs_object_path):
                    os.remove(abs_object_path)
                failure_str = f"{failure_str}{f}\n"

            failure_detail_str = f"Files that failed to download:\n{failure_str}"
            update_feedback("", has_failures=True, failure_detail=failure_detail_str)

        # Link dataset files, so anything that was successfully pulled will materialize
        m.link_revision()

        if len(failure_keys) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            raise IOError(f"{len(failure_keys)} file(s) failed to download. Check message detail and try again.")

    except Exception as err:
        logger.exception(err)
        raise
def run(self, key: str, database=1) -> None:
    """Method called in a periodically scheduled async worker that should check the dev env
    and manage Activity Monitor Instances as needed

    Args:
        key(str): The unique string used as the key in redis to track this DevEnvMonitor instance
        database(int): redis database number to connect to
    """
    # Check if the runtime directory exists, and if not create it
    if not os.path.exists(os.environ['JUPYTER_RUNTIME_DIR']):
        os.makedirs(os.environ['JUPYTER_RUNTIME_DIR'])
        logger.info("Created Jupyter shared runtime dir: {}".format(os.environ['JUPYTER_RUNTIME_DIR']))

    # Get list of active Activity Monitor Instances from redis
    redis_conn = redis.Redis(db=database)
    activity_monitors = redis_conn.keys('{}:activity_monitor:*'.format(key))
    activity_monitors = [x.decode('utf-8') for x in activity_monitors]

    # Get author info
    author_name = redis_conn.hget(key, "author_name").decode()
    author_email = redis_conn.hget(key, "author_email").decode()

    # Get session info from Jupyter API
    sessions = self.get_sessions(key, redis_conn)

    # Check for exited kernels
    for am in activity_monitors:
        kernel_id = redis_conn.hget(am, "kernel_id").decode()
        if kernel_id not in sessions:
            # Only act (and log) if the monitor is still flagged as running
            if redis_conn.hget(am, 'run').decode() != 'False':
                logger.info("Detected exited JupyterLab kernel. Stopping monitoring for kernel id {}"
                            .format(kernel_id))
                # Kernel isn't running anymore. Clean up by setting run flag to `False` so worker exits
                redis_conn.hset(am, 'run', 'False')
                # TODO DC This runs again and again, persisting across dev API restarts
                # At a minimum, propose checking for 'run' before issuing a message (Trying this above, but that
                # logic doesn't work - still get messages).
                # But probably we should de-register the activity monitor?

    # Check for new kernels
    for s in sessions:
        if sessions[s]['kernel_type'] == 'notebook':
            # Monitor a notebook
            activity_monitor_key = '{}:activity_monitor:{}'.format(key, sessions[s]['kernel_id'])
            if activity_monitor_key not in activity_monitors:
                logger.info("Detected new JupyterLab kernel. Starting monitoring for kernel id {}"
                            .format(sessions[s]['kernel_id']))

                # Start new Activity Monitor
                _, user, owner, labbook_name, dev_env_name = key.split(':')

                args = {"module_name": "gtmcore.activity.monitors.monitor_jupyterlab",
                        "class_name": "JupyterLabNotebookMonitor",
                        "user": user,
                        "owner": owner,
                        "labbook_name": labbook_name,
                        "monitor_key": activity_monitor_key,
                        "author_name": author_name,
                        "author_email": author_email,
                        "session_metadata": sessions[s]}

                # Monitor runs as a persistent background job
                d = Dispatcher()
                process_id = d.dispatch_task(jobs.start_and_run_activity_monitor, kwargs=args, persist=True)
                logger.info("Started Jupyter Notebook Activity Monitor: {}".format(process_id))

                # Update redis
                redis_conn.hset(activity_monitor_key, "dev_env_monitor", key)
                redis_conn.hset(activity_monitor_key, "process_id", process_id)
                redis_conn.hset(activity_monitor_key, "path", sessions[s]["path"])
                redis_conn.hset(activity_monitor_key, "kernel_type", sessions[s]["kernel_type"])
                redis_conn.hset(activity_monitor_key, "kernel_name", sessions[s]["kernel_name"])
                redis_conn.hset(activity_monitor_key, "kernel_id", sessions[s]["kernel_id"])
                redis_conn.hset(activity_monitor_key, "run", True)
def run(self, key: str, database=1) -> None:
    """Method called in a periodically scheduled async worker that should check the dev env
    and manage Activity Monitor Instances as needed

    Args:
        key(str): The unique string used as the key in redis to track this DevEnvMonitor instance
        database(int): redis database number to connect to
    """
    redis_conn = redis.Redis(db=database)

    # Get a list of running activity monitors
    activity_monitors = self.get_activity_monitors(key, redis_conn)

    # Get author info
    # RB this is not populated until a labbook is started why running?
    author_name = redis_conn.hget(key, "author_name").decode()
    author_email = redis_conn.hget(key, "author_email").decode()

    # Get a list of active mitm_monitors from docker
    running_proxies = self.get_running_proxies()

    # Get a list of log files
    log_file_ids = self.get_log_file_ids()

    # check for exited instances
    for am in activity_monitors:
        if am not in running_proxies:
            # Kernel isn't running anymore. Clean up by setting run flag to `False` so worker exits
            activity_monitor_key = f'{key}:activity_monitor:{am}'
            redis_conn.hset(activity_monitor_key, 'run', False)
            logger.info(f"Detected exited RStudio Server project. Stopping monitoring for {activity_monitor_key}")

    for logfid in log_file_ids:
        # yes logfile, no mitmkernel -> exited kernel delete file
        if logfid not in running_proxies:
            logger.info(f"Detected defunct RStudio-Server log. Deleting log {logfid}")
            os.remove(f"/mnt/share/mitmproxy/{logfid}.rserver.dump")
        elif logfid not in activity_monitors:
            # Monitor rserver
            activity_monitor_key = f'{key}:activity_monitor:{logfid}'

            # Start new Activity Monitor
            _, user, owner, labbook_name, dev_env_name = key.split(':')

            args = {"module_name": "gtmcore.activity.monitors.monitor_rserver",
                    "class_name": "RStudioServerMonitor",
                    "user": user,
                    "owner": owner,
                    "labbook_name": labbook_name,
                    "monitor_key": activity_monitor_key,
                    "author_name": author_name,
                    "author_email": author_email,
                    # TODO DC: logfid *could* go in here... but probably a bigger refactor is needed
                    # Also captured in https://github.com/gigantum/gigantum-client/issues/434
                    "session_metadata": None}

            # Monitor runs as a persistent background job
            d = Dispatcher()
            process_id = d.dispatch_task(jobs.start_and_run_activity_monitor, kwargs=args, persist=True)
            logger.info(f"Started RStudio Server Notebook Activity Monitor: Process {process_id}")

            # Update redis
            redis_conn.hset(activity_monitor_key, "dev_env_monitor", key)
            redis_conn.hset(activity_monitor_key, "process_id", process_id)
            redis_conn.hset(activity_monitor_key, "logfile_id", logfid)
            redis_conn.hset(activity_monitor_key, "run", True)
def test_query_complete_tasks(self):
    """A finished job appears in finished_jobs and not in failed_jobs."""
    dispatcher = Dispatcher()
    key = dispatcher.dispatch_task(bg_jobs.test_exit_success)
    time.sleep(1)

    finished_keys = [j.job_key for j in dispatcher.finished_jobs]
    failed_keys = [j.job_key for j in dispatcher.failed_jobs]
    assert key in finished_keys
    assert key not in failed_keys
def mutate_and_get_payload(cls, root, info, owner, dataset_name, local=False, remote=False,
                           client_mutation_id=None):
    """Delete a Dataset locally and/or on the remote server.

    Remote deletion removes managed file contents, the GitLab repository, and the
    cloud index entry (index failures are soft). Local deletion removes the dataset
    from disk and schedules a file-cache cleanup job.
    """
    logged_in_user = get_logged_in_username()
    local_deleted = False
    remote_deleted = False
    if remote:
        logger.info(f"Deleting remote Dataset {owner}/{dataset_name}")

        # Extract valid Bearer token
        access_token = flask.g.get('access_token', None)
        id_token = flask.g.get('id_token', None)
        if not access_token or not id_token:
            raise ValueError("Deleting a remote Dataset requires a valid session.")

        try:
            ds = InventoryManager().load_dataset(logged_in_user, owner, dataset_name,
                                                 author=get_logged_in_author())
        except InventoryException:
            raise ValueError("A dataset must exist locally to delete it in the remote.")

        # Delete the dataset's files if supported
        if ds.is_managed():
            ds.backend.set_default_configuration(logged_in_user, access_token, id_token)
            ds.backend.delete_contents(ds)

        # Get remote server configuration
        config = Configuration()
        remote_config = config.get_remote_configuration()

        # Delete the repository
        mgr = GitLabManager(remote_config['git_remote'],
                            remote_config['admin_service'],
                            access_token=access_token)
        mgr.remove_repository(owner, dataset_name)
        logger.info(f"Deleted {owner}/{dataset_name} repository from the"
                    f" remote repository {remote_config['git_remote']}")

        # Call Index service to remove project from cloud index and search
        # Don't raise an exception if the index delete fails, since this can be handled relatively gracefully
        repo_id = mgr.get_repository_id(owner, dataset_name)
        response = requests.delete(f"https://{remote_config['index_service']}/index/{repo_id}",
                                   headers={"Authorization": f"Bearer {access_token}",
                                            "Identity": id_token},
                                   timeout=30)

        if response.status_code != 204:
            # Soft failure, still continue
            logger.error(f"Failed to remove {owner}/{dataset_name} from cloud index. "
                         f"Status Code: {response.status_code}")
            logger.error(response.json())
        else:
            logger.info(f"Deleted remote repository {owner}/{dataset_name} from cloud index")

        # Remove locally any references to that cloud repo that's just been deleted.
        try:
            ds.remove_remote()
        except GigantumException as e:
            logger.warning(e)

        remote_deleted = True

    if local:
        logger.info(f"Deleting local Dataset {owner}/{dataset_name}")

        # Delete the dataset
        dataset_delete_job = InventoryManager().delete_dataset(logged_in_user, owner, dataset_name)
        local_deleted = True

        # Schedule Job to clear file cache if dataset is no longer in use
        job_metadata = {'method': 'clean_dataset_file_cache'}
        job_kwargs = {
            'logged_in_username': logged_in_user,
            'dataset_owner': dataset_delete_job.namespace,
            'dataset_name': dataset_delete_job.name,
            'cache_location': dataset_delete_job.cache_root
        }
        dispatcher = Dispatcher()
        job_key = dispatcher.dispatch_task(jobs.clean_dataset_file_cache,
                                           metadata=job_metadata,
                                           kwargs=job_kwargs)
        logger.info(f"Dispatched clean_dataset_file_cache({owner}/{dataset_name}) to Job {job_key}")

    return DeleteDataset(local_deleted=local_deleted, remote_deleted=remote_deleted)