def test_none_product_patterns(mongodb_settings, agave, pipelinejobs_config,
                               pipeline_uuid, random_dir_name, admin_token):
    """Confirm that product_patterns can be set to None without failure
    """
    archive_path = '/sample/tacc-cloud/' + random_string(32)
    mpj = ManagedPipelineJob(mongodb_settings,
                             pipelinejobs_config,
                             agave=agave,
                             archive_path=archive_path,
                             experiment_id='experiment.tacc.10001',
                             product_patterns=None,
                             archive_patterns=[]).setup()
    mpj.cancel(token=admin_token)
def test_no_archive_and_product_patterns(mongodb_settings, agave,
                                         pipelinejobs_config, pipeline_uuid,
                                         random_dir_name, admin_token):
    """Confirm that archive_patterns and product_patterns can be entirely
    unspecified without failure
    """
    archive_path = '/sample/tacc-cloud/' + random_dir_name
    mpj = ManagedPipelineJob(mongodb_settings,
                             pipelinejobs_config,
                             agave=agave,
                             archive_path=archive_path,
                             experiment_id='experiment.tacc.10001').setup()
    mpj.cancel(token=admin_token)
# NOTE: the client_w_* and instanced_client_w_* helpers in this module are
# consumed as pytest fixtures; the @pytest.fixture decorators (assumed lost
# in formatting) are restored here and below.
@pytest.fixture
def client_w_sample_archive_path(mongodb_settings, pipelinejobs_config, agave,
                                 pipeline_uuid, admin_token):
    mpj = ManagedPipelineJob(
        mongodb_settings,
        pipelinejobs_config,
        agave=agave,
        archive_path='/sample/tacc-cloud',
        experiment_id='experiment.tacc.10001',
        archive_patterns=[{'patterns': ['.json$'], 'level': '2'}],
        product_patterns=[{
            'patterns': ['.json$'],
            'derived_using': [
                '1092d775-0f7c-5b4d-970f-e739711d5f36',
                'modified_ecoli_MG1655_refFlat_txt-0-1-0'
            ],
            'derived_from': [
                '105fb204-530b-5915-9fd6-caf88ca9ad8a',
                '1058868c-340e-5d8c-b66e-9739cbcf8d36',
                './672.png',
                'agave://data-sd2e-community/sample/tacc-cloud/dawnofman.jpg'
            ]
        }])
    return mpj
@pytest.fixture
def client_w_param_data(mongodb_settings, pipelinejobs_config, agave,
                        pipeline_uuid, experiment_id, job_data):
    return ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              experiment_id=experiment_id,
                              data=job_data)
@pytest.fixture
def instanced_client_w_param(mongodb_settings, pipelinejobs_config, agave,
                             pipeline_uuid, experiment_id):
    return ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              experiment_id=experiment_id,
                              instanced=True)
def test_pipejob_init_archive_path_custom(mongodb_settings,
                                          pipelinejobs_config, agave,
                                          pipeline_uuid):
    """Checks that passing archive_path=<val> overrides generated value
    """
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              archive_path='/products/v2/test123')
    assert base.archive_path.startswith('/products/v2')
    assert base.archive_path.endswith('test123')
@pytest.fixture
def client_w_sample_archive_path_missed_ref(mongodb_settings,
                                            pipelinejobs_config, agave,
                                            pipeline_uuid, admin_token):
    mpj = ManagedPipelineJob(
        mongodb_settings,
        pipelinejobs_config,
        agave=agave,
        archive_path='/sample/tacc-cloud',
        experiment_id='experiment.tacc.10001',
        archive_patterns=[{'patterns': ['.json$'], 'level': '2'}],
        product_patterns=[{
            'patterns': ['.json$'],
            'derived_using': [
                '1092d775-0f7c-5b4d-970f-e739711d5f36',
                'modified_ecoli_MG1655_refFlat_txt-0-1-0'
            ],
            'derived_from': [
                '105fb204-530b-5915-9fd6-caf88ca9ad8a',
                '1058868c-340e-5d8c-b66e-9739cbcf8d36',
                './672.png',
                'agave://data-sd2e-community/sample/tacc-cloud/dawnofman.jpg'
            ]
        }])

    def initjob():
        mpj.setup()
        mpj.run(token=admin_token)
        mpj.finish(token=admin_token)
        return mpj

    job = None
    try:
        job = initjob()
    except ManagedPipelineJobError:
        # Retry once after resetting the job, leaving its archive path intact
        mpj.reset(token=admin_token, no_clear_path=True, permissive=True)
        job = initjob()
    # print('MPJ.UUID', job.uuid)
    return job
def test_uuid_bypass_invalid_metadata(mongodb_settings, agave,
                                      pipelinejobs_config, pipeline_uuid,
                                      random_dir_name, admin_token):
    """Confirm that passing a UUID directly to the ManagedPipelineJob
    metadata binding stage bypasses identifier resolution
    """
    archive_path = '/sample/tacc-cloud/' + random_string(32)
    ident = typeduuid.catalog_uuid('ThisCanNeverEverEverWork', 'experiment')
    mpj = ManagedPipelineJob(mongodb_settings,
                             pipelinejobs_config,
                             agave=agave,
                             archive_path=archive_path,
                             experiment_id=ident,
                             product_patterns=[],
                             archive_patterns=[]).setup()
    # Clean up the job created by setup() (assumed intent; admin_token was
    # requested by this test but otherwise unused)
    mpj.cancel(token=admin_token)
def test_invalid_metadata_child_of(mongodb_settings, agave,
                                   pipelinejobs_config, pipeline_uuid,
                                   random_dir_name, admin_token):
    """Confirm that passing an unknown identifier not present in the database
    causes ManagedPipelineJob initialization to raise an exception
    """
    archive_path = '/sample/tacc-cloud/' + random_string(32)
    with pytest.raises(ValueError):
        ManagedPipelineJob(mongodb_settings,
                           pipelinejobs_config,
                           agave=agave,
                           archive_path=archive_path,
                           experiment_id='ThisCanNeverEverEverWork',
                           product_patterns=[],
                           archive_patterns=[]).setup()
def test_pipejob_inputs_list(mongodb_settings, pipelinejobs_config, agave,
                             pipeline_uuid):
    inputs = [
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.0003_4.fcs'
    ]
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              inputs=inputs,
                              experiment_id='experiment.ginkgo.10001')
    # Both inputs have resolvable UUIDs in the test data set. The
    # experiment_id resolves as well, but contributes to lineage (child_of)
    # rather than acted_on.
    assert len(base.acted_on) == 2
def test_pipejob_data_parameters_resolve(mongodb_settings,
                                         pipelinejobs_config, agave,
                                         pipeline_uuid):
    data = {
        'parameters': {
            'param1': 'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
            'param2': 'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmgdayg2yq_r1bsu7tb7bsuk/6389_0.0003_4.fcs'
        }
    }
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              data=data)
    # Both parameter values have resolvable UUIDs in the test data set
    assert len(base.acted_on) == 2
def test_pipejob_data_inputs_list_resolve(mongodb_settings,
                                          pipelinejobs_config, agave,
                                          pipeline_uuid):
    data = {
        'inputs': [
            'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
            'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmgdayg2yq_r1bsu7tb7bsuk/6389_0.0003_4.fcs',
            '/uploads/transcriptic/201808/yeast_gates/r1bsmgdayg2yq_r1bsu7tb7bsuk/6389_0.0003_4.fcs'
        ]
    }
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              data=data)
    # Only the first two inputs are resolvable, since the list context
    # expects fully-qualified URIs
    assert len(base.acted_on) == 2
def test_pipejob_inputs_expt_id(mongodb_settings, pipelinejobs_config, agave,
                                pipeline_uuid, recid, raises_exception):
    inputs = [
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.0003_4.fcs'
    ]
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              inputs=inputs,
                              experiment_id=recid)
    # The experiment_id resolves, so the job has at least one parent record
    assert len(base.child_of) > 0
    # The hashid value for measurement.tacc.0x00000000, the default, should
    # NOT be present if we resolve measurements
    assert '5pQxBkRrRe2GEPOWWZBq4LNQ' not in base._archive_path_els
    assert '/106' in base.archive_path
def test_pipejob_data_params_refs_resolve(mongodb_settings,
                                          pipelinejobs_config, agave,
                                          pipeline_uuid):
    data = {
        'parameters': {
            'structure': 'https://www.rcsb.org/structure/6N0V',
            'protein': 'https://www.uniprot.org/uniprot/G0S6G2'
        }
    }
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              data=data)
    # These references should be present in the database if
    # test_stores_reference has been run
    assert len(base.acted_on) == 0
    assert len(base.acted_using) == 2
def test_pipejob_inputs_no_link_or_data(mongodb_settings,
                                        pipelinejobs_config, agave,
                                        pipeline_uuid):
    inputs = [
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.0003_4.fcs'
    ]
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              inputs=inputs)
    # Both inputs have resolvable UUIDs in the test data set
    assert len(base.child_of) > 0
    # The hashid value for measurement.tacc.0x00000000, the default
    assert '5pQxBkRrRe2GEPOWWZBq4LNQ' in base._archive_path_els
    # The hashid value for an empty "data" dictionary
    assert 'PAVpwrObxp5YjYRvrJOd5yVp' in base._archive_path_els
    # Pipeline ID is exposed in the path because there is a 1:1 mapping
    assert '/106' in base.archive_path
def test_pipejob_agave_uri_from_data(mongodb_settings, pipelinejobs_config,
                                     agave, pipeline_uuid):
    data = {
        'inputs': [
            'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
            'agave://data-sd2e-community/products/v1/41e1dec1-2940-5b04-bd9e-54af78f30774/aaf646a5-7c05-5ab3-a144-5563fca6830d/a4609424-508a-555c-9720-5ee3df44e777/whole-shrew-20181207T220030Z/output/output.csv'
        ],
        'parameters': {
            'p1': 'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.0003_4.fcs',
            'p2': '/uploads/456.txt'
        }
    }
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              data=data)
    # Only the two inputs and the first parameter have resolvable UUIDs in
    # the test data set
    assert len(base.acted_on) == 3
def main():
    rx = Reactor()
    mes = AttrDict(rx.context.message_dict)
    mongodb_conn = rx.settings.mongodb

    # ! This code fixes an edge case in JSON serialization
    if mes == {}:
        try:
            jsonmsg = json.loads(rx.context.raw_message)
            mes = AttrDict(jsonmsg)
        except Exception as exc:
            rx.on_failure('Failed to load JSON from message', exc)

    # Check the incoming message against the default JSON schema
    try:
        rx.validate_message(mes, permissive=False)
    except Exception as exc:
        rx.on_failure('Failed to validate message against schema', exc)

    # Verify appId is known to the Agave apps API. Requires that the
    # invoking user has a tenant admin role unless the appId is public.
    agave_job = mes.get('job_definition')
    agave_appid = agave_job.get('appId')
    agave_app_details = None
    job_params = mes.get('parameters', {})
    instanced_archive_path = mes.get('instanced', True)
    rx.logger.info(
        'Received request to manage execution of {}'.format(agave_appid))
    try:
        agave_app_details = rx.client.apps.get(appId=agave_appid)
    except HTTPError as http_err:
        rx.on_failure(
            '{} is not a known Agave application'.format(agave_appid),
            http_err)
    except Exception as generic_exception:
        rx.on_failure('Failed to look up Agave application',
                      generic_exception)

    # Look up the Pipeline record for this Agave appId.
    #
    # Note that this requires a convention where the standalone Agave app is
    # registered in the Pipelines system with pipeline.id == agave.app.id
    pipeline_uuid = None
    try:
        manager_stores = Manager.init_stores(mongodb_conn)
        pipeline_rec = manager_stores['pipeline'].find_one_by_id(
            id=agave_appid)
        if pipeline_rec is None:
            raise ValueError("No 'pipelines' record found in database")
        else:
            pipeline_uuid = pipeline_rec.get('uuid')
    except Exception as generic_exception:
        rx.on_failure(
            'Failed to resolve appId {} to a Pipeline record'.format(
                agave_appid), generic_exception)

    def cancel_job(message='an error occurred', exception=None):
        """Helper function to cancel a failed job
        """
        fmt_message = 'PipelineJob {} canceled because {}'.format(
            job_uuid, message)
        try:
            job.cancel()
        except Exception as job_cancel_exception:
            rx.logger.warning(
                'Failed to cancel PipelineJob {} because {}'.format(
                    job_uuid, job_cancel_exception))
        rx.on_failure(fmt_message, exception)

    def fail_job(message='an error occurred', exception=None):
        """Helper function to fail a job
        """
        fmt_message = 'PipelineJob {} failed because {}'.format(
            job_uuid, message)
        try:
            job.fail(data={'message': message})
        except Exception as job_fail_exception:
            rx.logger.warning(
                'Unable to update PipelineJob state for {} because {}'.format(
                    job_uuid, job_fail_exception))
        rx.on_failure(fmt_message, exception)

    # Initialize the ManagedPipelineJob. It will be in the jobs collection
    # with a status of CREATED.
    job = None
    job_uuid = None
    rx.logger.info('Building initial job.data')
    # Shallow-copy the job definition so that merging string-valued
    # parameters into job.data does not mutate the Agave job definition
    # that is submitted later
    init_data = dict(agave_job)
    mes_data = mes.get('data', {})
    for k, v in job_params.items():
        if v is not None and isinstance(v, str):
            init_data[k] = v
    # init_data = {**init_data, **mes_data}

    try:
        job = ManagedPipelineJob(rx.settings.mongodb,
                                 rx.settings.pipelines.job_manager_id,
                                 rx.settings.pipelines.updates_nonce,
                                 pipeline_uuid=pipeline_uuid,
                                 data=init_data,
                                 session=rx.nickname,
                                 agent=rx.uid,
                                 task=rx.execid,
                                 instanced=instanced_archive_path,
                                 archive_path_patterns=mes.get(
                                     'index_patterns', []),
                                 **job_params)
        job.setup(mes_data)
        job_uuid = job.uuid
    except Exception as generic_exception:
        if job is not None:
            cancel_job(message='Failed to set up ManagedPipelineJob',
                       exception=generic_exception)
        else:
            rx.on_failure('Failed to set up ManagedPipelineJob',
                          generic_exception)

    # Extend the incoming Agave job definition so it updates the PipelineJob.
    # Set the archivePath and archiveSystem from the ManagedPipelineJob.
    #
    # The former is accomplished by adding custom notifications built from
    # the job's 'callback' property, which was initialized on job.setup(). Any
    # pre-existing notifications (email, other callbacks) are preserved.
    try:
        if 'notifications' not in agave_job:
            agave_job['notifications'] = list()
        # for event in ('SUBMITTING', 'STAGING_JOB', 'RUNNING', 'ARCHIVING',
        #               'ARCHIVING_FINISHED', 'FINISHED', 'FAILED'):
        # Capture all Agave job states with a wildcard subscription
        notification = {
            'event': '*',
            'persistent': True,
            'url': job.callback + '&status=${STATUS}&note=${JOB_ERROR}'
        }
        agave_job['notifications'].append(notification)
        notification = {
            'event': 'FINISHED',
            'persistent': False,
            'url': job.indexer_callback
        }
        agave_job['notifications'].append(notification)
        agave_job['archiveSystem'] = job.archive_system
        agave_job['archivePath'] = job.archive_path
        agave_job['archive'] = True
    except Exception as generic_exception:
        cancel_job(message='Failed to prepare Agave job definition',
                   exception=generic_exception)

    if rx.local:
        print(json.dumps(agave_job, indent=4))
        sys.exit(0)

    # Launch the Agave job
    agave_job_id = None
    try:
        resp = rx.client.jobs.submit(body=agave_job)
        if 'id' in resp:
            agave_job_id = resp['id']
        else:
            raise KeyError('Invalid response received from jobs.submit()')
    except HTTPError as h:
        http_err_resp = agaveutils.process_agave_httperror(h)
        fail_job(message='Encountered API error: {}'.format(http_err_resp),
                 exception=h)
    except Exception as job_submit_exception:
        fail_job(message='Failed to launch {}'.format(agave_appid),
                 exception=job_submit_exception)

    # Update the PipelineJob status
    #
    # This creates an entry in its history with an explicit link to the
    # Agave job asset. If this doesn't succeed, we don't fail the job, since
    # the expensive part (the Agave job) has already been submitted.
    try:
        job_uri = job.canonicalize_job(agave_job_id)
        job.run(data={'job_link': job_uri})
    except Exception as job_update_exception:
        rx.logger.warning(
            'Unable to update status of job {} because {}'.format(
                job_uuid, job_update_exception))

    # If no other exit state has been encountered, report success
    rx.on_success('ManagedPipelineJob {} is managing Agave job {} ({} usec)'.format(
        job_uuid, agave_job_id, rx.elapsed()))
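# For reference, a minimal sketch of the message shape main() consumes,
# assembled from the keys the function reads above ('job_definition',
# 'parameters', 'data', 'instanced', 'index_patterns'). All field values
# below are hypothetical illustrations, not values from the test data set.
EXAMPLE_MESSAGE = {
    'job_definition': {
        'appId': 'example-app-0.1.0',  # hypothetical Agave appId; must match a Pipeline record id
        'notifications': []            # optional; created by main() if absent
    },
    'parameters': {
        'sample_id': 'sample.tacc.20001'  # hypothetical; string values are merged into job.data
    },
    'data': {},            # extra metadata passed to job.setup()
    'instanced': True,     # toggles instanced archive paths (defaults to True)
    'index_patterns': []   # forwarded to ManagedPipelineJob as archive_path_patterns
}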
@pytest.fixture
def client_w_archive_path(mongodb_settings, pipelinejobs_config, agave,
                          pipeline_uuid):
    return ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              archive_path='/products/v2/test123')
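# A minimal sketch (hypothetical test name) of how the fixture above is
# consumed: pytest injects the constructed ManagedPipelineJob by parameter
# name, so the test body can assert against it directly.
def test_client_w_archive_path_sketch(client_w_archive_path):
    assert client_w_archive_path.archive_path.startswith('/products/v2')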