def test_none_product_patterns(mongodb_settings, agave, pipelinejobs_config,
                               pipeline_uuid, random_dir_name, admin_token):
    """Confirm that product_patterns can be set to None without failure
    """
    archive_path = '/sample/tacc-cloud/' + random_string(32)
    mpj = ManagedPipelineJob(mongodb_settings,
                             pipelinejobs_config,
                             agave=agave,
                             archive_path=archive_path,
                             experiment_id='experiment.tacc.10001',
                             product_patterns=None,
                             archive_patterns=[]).setup()
    mpj.cancel(token=admin_token)
def test_no_archive_and_product_patterns(mongodb_settings, agave,
                                         pipelinejobs_config, pipeline_uuid,
                                         random_dir_name, admin_token):
    """Confirm that archive_patterns and product_patterns can be entirely
    unspecified without failure
    """
    archive_path = '/sample/tacc-cloud/' + random_dir_name
    mpj = ManagedPipelineJob(mongodb_settings,
                             pipelinejobs_config,
                             agave=agave,
                             archive_path=archive_path,
                             experiment_id='experiment.tacc.10001').setup()
    mpj.cancel(token=admin_token)
# NOTE: the client_w_* and instanced_client_w_* helpers in this module are
# consumed as pytest fixtures; the @pytest.fixture decorators (assumed lost
# in formatting) are restored here and below.
@pytest.fixture
def client_w_sample_archive_path(mongodb_settings, pipelinejobs_config, agave,
                                 pipeline_uuid, admin_token):
    mpj = ManagedPipelineJob(
        mongodb_settings,
        pipelinejobs_config,
        agave=agave,
        archive_path='/sample/tacc-cloud',
        experiment_id='experiment.tacc.10001',
        archive_patterns=[{'patterns': ['.json$'], 'level': '2'}],
        product_patterns=[{
            'patterns': ['.json$'],
            'derived_using': [
                '1092d775-0f7c-5b4d-970f-e739711d5f36',
                'modified_ecoli_MG1655_refFlat_txt-0-1-0'
            ],
            'derived_from': [
                '105fb204-530b-5915-9fd6-caf88ca9ad8a',
                '1058868c-340e-5d8c-b66e-9739cbcf8d36',
                './672.png',
                'agave://data-sd2e-community/sample/tacc-cloud/dawnofman.jpg'
            ]
        }])
    return mpj
@pytest.fixture
def client_w_param_data(mongodb_settings, pipelinejobs_config, agave,
                        pipeline_uuid, experiment_id, job_data):
    return ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              experiment_id=experiment_id,
                              data=job_data)
@pytest.fixture
def instanced_client_w_param(mongodb_settings, pipelinejobs_config, agave,
                             pipeline_uuid, experiment_id):
    return ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              experiment_id=experiment_id,
                              instanced=True)
def test_pipejob_init_archive_path_custom(mongodb_settings,
                                          pipelinejobs_config, agave,
                                          pipeline_uuid):
    """Checks that passing archive_path=<val> overrides generated value
    """
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              archive_path='/products/v2/test123')
    assert base.archive_path.startswith('/products/v2')
    assert base.archive_path.endswith('test123')
@pytest.fixture
def client_w_sample_archive_path_missed_ref(mongodb_settings,
                                            pipelinejobs_config, agave,
                                            pipeline_uuid, admin_token):
    mpj = ManagedPipelineJob(
        mongodb_settings,
        pipelinejobs_config,
        agave=agave,
        archive_path='/sample/tacc-cloud',
        experiment_id='experiment.tacc.10001',
        archive_patterns=[{'patterns': ['.json$'], 'level': '2'}],
        product_patterns=[{
            'patterns': ['.json$'],
            'derived_using': [
                '1092d775-0f7c-5b4d-970f-e739711d5f36',
                'modified_ecoli_MG1655_refFlat_txt-0-1-0'
            ],
            'derived_from': [
                '105fb204-530b-5915-9fd6-caf88ca9ad8a',
                '1058868c-340e-5d8c-b66e-9739cbcf8d36',
                './672.png',
                'agave://data-sd2e-community/sample/tacc-cloud/dawnofman.jpg'
            ]
        }])

    def initjob():
        mpj.setup()
        mpj.run(token=admin_token)
        mpj.finish(token=admin_token)
        return mpj

    job = None
    try:
        job = initjob()
    except ManagedPipelineJobError:
        # Retry once after resetting the job, leaving its archive path intact
        mpj.reset(token=admin_token, no_clear_path=True, permissive=True)
        job = initjob()
    # print('MPJ.UUID', job.uuid)
    return job
def test_uuid_bypass_invalid_metadata(mongodb_settings, agave,
                                      pipelinejobs_config, pipeline_uuid,
                                      random_dir_name, admin_token):
    """Confirm that passing a UUID directly to the ManagedPipelineJob
    metadata binding stage bypasses identifier resolution
    """
    archive_path = '/sample/tacc-cloud/' + random_string(32)
    ident = typeduuid.catalog_uuid('ThisCanNeverEverEverWork', 'experiment')
    mpj = ManagedPipelineJob(mongodb_settings,
                             pipelinejobs_config,
                             agave=agave,
                             archive_path=archive_path,
                             experiment_id=ident,
                             product_patterns=[],
                             archive_patterns=[]).setup()
    # Clean up the job created by setup() (assumed intent; admin_token was
    # requested by this test but otherwise unused)
    mpj.cancel(token=admin_token)
def test_invalid_metadata_child_of(mongodb_settings, agave,
                                   pipelinejobs_config, pipeline_uuid,
                                   random_dir_name, admin_token):
    """Confirm that passing an unknown identifier not present in the database
    causes ManagedPipelineJob initialization to raise an exception
    """
    archive_path = '/sample/tacc-cloud/' + random_string(32)
    with pytest.raises(ValueError):
        ManagedPipelineJob(mongodb_settings,
                           pipelinejobs_config,
                           agave=agave,
                           archive_path=archive_path,
                           experiment_id='ThisCanNeverEverEverWork',
                           product_patterns=[],
                           archive_patterns=[]).setup()
def test_pipejob_inputs_list(mongodb_settings, pipelinejobs_config, agave,
                             pipeline_uuid):
    inputs = [
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.0003_4.fcs'
    ]
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              inputs=inputs,
                              experiment_id='experiment.ginkgo.10001')
    # Both inputs have resolvable UUIDs in the test data set. The
    # experiment_id resolves as well, but contributes to lineage (child_of)
    # rather than acted_on.
    assert len(base.acted_on) == 2
def test_pipejob_data_parameters_resolve(mongodb_settings,
                                         pipelinejobs_config, agave,
                                         pipeline_uuid):
    data = {
        'parameters': {
            'param1': 'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
            'param2': 'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmgdayg2yq_r1bsu7tb7bsuk/6389_0.0003_4.fcs'
        }
    }
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              data=data)
    # Both parameter values have resolvable UUIDs in the test data set
    assert len(base.acted_on) == 2
def test_pipejob_data_inputs_list_resolve(mongodb_settings,
                                          pipelinejobs_config, agave,
                                          pipeline_uuid):
    data = {
        'inputs': [
            'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
            'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmgdayg2yq_r1bsu7tb7bsuk/6389_0.0003_4.fcs',
            '/uploads/transcriptic/201808/yeast_gates/r1bsmgdayg2yq_r1bsu7tb7bsuk/6389_0.0003_4.fcs'
        ]
    }
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              data=data)
    # Only the first two inputs are resolvable, since the list context
    # expects fully-qualified URIs
    assert len(base.acted_on) == 2
def test_pipejob_inputs_expt_id(mongodb_settings, pipelinejobs_config, agave,
                                pipeline_uuid, recid, raises_exception):
    inputs = [
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.0003_4.fcs'
    ]
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              inputs=inputs,
                              experiment_id=recid)
    # The experiment_id resolves, so the job has at least one parent record
    assert len(base.child_of) > 0
    # The hashid value for measurement.tacc.0x00000000, the default, should
    # NOT be present if we resolve measurements
    assert '5pQxBkRrRe2GEPOWWZBq4LNQ' not in base._archive_path_els
    assert '/106' in base.archive_path
def test_pipejob_data_params_refs_resolve(mongodb_settings,
                                          pipelinejobs_config, agave,
                                          pipeline_uuid):
    data = {
        'parameters': {
            'structure': 'https://www.rcsb.org/structure/6N0V',
            'protein': 'https://www.uniprot.org/uniprot/G0S6G2'
        }
    }
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              data=data)
    # These references should be present in the database if
    # test_stores_reference has been run
    assert len(base.acted_on) == 0
    assert len(base.acted_using) == 2
def test_pipejob_inputs_no_link_or_data(mongodb_settings,
                                        pipelinejobs_config, agave,
                                        pipeline_uuid):
    inputs = [
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
        'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.0003_4.fcs'
    ]
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              inputs=inputs)
    # Both inputs have resolvable UUIDs in the test data set
    assert len(base.child_of) > 0
    # The hashid value for measurement.tacc.0x00000000, the default
    assert '5pQxBkRrRe2GEPOWWZBq4LNQ' in base._archive_path_els
    # The hashid value for an empty "data" dictionary
    assert 'PAVpwrObxp5YjYRvrJOd5yVp' in base._archive_path_els
    # Pipeline ID is exposed in the path because there is a 1:1 mapping
    assert '/106' in base.archive_path
def test_pipejob_agave_uri_from_data(mongodb_settings, pipelinejobs_config,
                                     agave, pipeline_uuid):
    data = {
        'inputs': [
            'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.00015_2.fcs',
            'agave://data-sd2e-community/products/v1/41e1dec1-2940-5b04-bd9e-54af78f30774/aaf646a5-7c05-5ab3-a144-5563fca6830d/a4609424-508a-555c-9720-5ee3df44e777/whole-shrew-20181207T220030Z/output/output.csv'
        ],
        'parameters': {
            'p1': 'agave://data-sd2e-community/uploads/transcriptic/201808/yeast_gates/r1bsmggea748b_r1bsun4yb67e7/wt-control-1_0.0003_4.fcs',
            'p2': '/uploads/456.txt'
        }
    }
    base = ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              data=data)
    # Only the two inputs and the first parameter have resolvable UUIDs in
    # the test data set
    assert len(base.acted_on) == 3
def main():
    rx = Reactor()
    mes = AttrDict(rx.context.message_dict)
    mongodb_conn = rx.settings.mongodb

    # ! This code fixes an edge case in JSON serialization
    if mes == {}:
        try:
            jsonmsg = json.loads(rx.context.raw_message)
            mes = AttrDict(jsonmsg)
        except Exception as exc:
            rx.on_failure('Failed to load JSON from message', exc)

    # Check the incoming message against the default JSON schema
    try:
        rx.validate_message(mes, permissive=False)
    except Exception as exc:
        rx.on_failure('Failed to validate message against schema', exc)

    # Verify appId is known to the Agave apps API. Requires that the
    # invoking user has a tenant admin role unless the appId is public.
    agave_job = mes.get('job_definition')
    agave_appid = agave_job.get('appId')
    agave_app_details = None
    job_params = mes.get('parameters', {})
    instanced_archive_path = mes.get('instanced', True)
    rx.logger.info(
        'Received request to manage execution of {}'.format(agave_appid))
    try:
        agave_app_details = rx.client.apps.get(appId=agave_appid)
    except HTTPError as http_err:
        rx.on_failure(
            '{} is not a known Agave application'.format(agave_appid),
            http_err)
    except Exception as generic_exception:
        rx.on_failure('Failed to look up Agave application',
                      generic_exception)

    # Look up the Pipeline record for this Agave appId.
    #
    # Note that this requires a convention where the standalone Agave app is
    # registered in the Pipelines system with pipeline.id == agave.app.id
    pipeline_uuid = None
    try:
        manager_stores = Manager.init_stores(mongodb_conn)
        pipeline_rec = manager_stores['pipeline'].find_one_by_id(
            id=agave_appid)
        if pipeline_rec is None:
            raise ValueError("No 'pipelines' record found in database")
        else:
            pipeline_uuid = pipeline_rec.get('uuid')
    except Exception as generic_exception:
        rx.on_failure(
            'Failed to resolve appId {} to a Pipeline record'.format(
                agave_appid), generic_exception)

    def cancel_job(message='an error occurred', exception=None):
        """Helper function to cancel a failed job
        """
        fmt_message = 'PipelineJob {} canceled because {}'.format(
            job_uuid, message)
        try:
            job.cancel()
        except Exception as job_cancel_exception:
            rx.logger.warning(
                'Failed to cancel PipelineJob {} because {}'.format(
                    job_uuid, job_cancel_exception))
        rx.on_failure(fmt_message, exception)

    def fail_job(message='an error occurred', exception=None):
        """Helper function to fail a job
        """
        fmt_message = 'PipelineJob {} failed because {}'.format(
            job_uuid, message)
        try:
            job.fail(data={'message': message})
        except Exception as job_fail_exception:
            rx.logger.warning(
                'Unable to update PipelineJob state for {} because {}'.format(
                    job_uuid, job_fail_exception))
        rx.on_failure(fmt_message, exception)

    # Initialize the ManagedPipelineJob. It will be in the jobs collection
    # with a status of CREATED.
    job = None
    job_uuid = None
    rx.logger.info('Building initial job.data')
    # Shallow-copy the job definition so that merging string-valued
    # parameters into job.data does not mutate the Agave job definition
    # that is submitted later
    init_data = dict(agave_job)
    mes_data = mes.get('data', {})
    for k, v in job_params.items():
        if v is not None and isinstance(v, str):
            init_data[k] = v
    # init_data = {**init_data, **mes_data}

    try:
        job = ManagedPipelineJob(rx.settings.mongodb,
                                 rx.settings.pipelines.job_manager_id,
                                 rx.settings.pipelines.updates_nonce,
                                 pipeline_uuid=pipeline_uuid,
                                 data=init_data,
                                 session=rx.nickname,
                                 agent=rx.uid,
                                 task=rx.execid,
                                 instanced=instanced_archive_path,
                                 archive_path_patterns=mes.get(
                                     'index_patterns', []),
                                 **job_params)
        job.setup(mes_data)
        job_uuid = job.uuid
    except Exception as generic_exception:
        if job is not None:
            cancel_job(message='Failed to set up ManagedPipelineJob',
                       exception=generic_exception)
        else:
            rx.on_failure('Failed to set up ManagedPipelineJob',
                          generic_exception)

    # Extend the incoming Agave job definition so it updates the PipelineJob.
    # Set the archivePath and archiveSystem from the ManagedPipelineJob.
    #
    # The former is accomplished by adding custom notifications built from
    # the job's 'callback' property, which was initialized on job.setup(). Any
    # pre-existing notifications (email, other callbacks) are preserved.
    try:
        if 'notifications' not in agave_job:
            agave_job['notifications'] = list()
        # for event in ('SUBMITTING', 'STAGING_JOB', 'RUNNING', 'ARCHIVING',
        #               'ARCHIVING_FINISHED', 'FINISHED', 'FAILED'):
        # Capture all Agave job states with a wildcard subscription
        notification = {
            'event': '*',
            'persistent': True,
            'url': job.callback + '&status=${STATUS}&note=${JOB_ERROR}'
        }
        agave_job['notifications'].append(notification)
        notification = {
            'event': 'FINISHED',
            'persistent': False,
            'url': job.indexer_callback
        }
        agave_job['notifications'].append(notification)
        agave_job['archiveSystem'] = job.archive_system
        agave_job['archivePath'] = job.archive_path
        agave_job['archive'] = True
    except Exception as generic_exception:
        cancel_job(message='Failed to prepare Agave job definition',
                   exception=generic_exception)

    if rx.local:
        print(json.dumps(agave_job, indent=4))
        sys.exit(0)

    # Launch the Agave job
    agave_job_id = None
    try:
        resp = rx.client.jobs.submit(body=agave_job)
        if 'id' in resp:
            agave_job_id = resp['id']
        else:
            raise KeyError('Invalid response received from jobs.submit()')
    except HTTPError as h:
        http_err_resp = agaveutils.process_agave_httperror(h)
        fail_job(message='Encountered API error: {}'.format(http_err_resp),
                 exception=h)
    except Exception as job_submit_exception:
        fail_job(message='Failed to launch {}'.format(agave_appid),
                 exception=job_submit_exception)

    # Update the PipelineJob status
    #
    # This creates an entry in its history with an explicit link to the
    # Agave job asset. If this doesn't succeed, we don't fail the job, since
    # the expensive part (the Agave job) has already been submitted.
    try:
        job_uri = job.canonicalize_job(agave_job_id)
        job.run(data={'job_link': job_uri})
    except Exception as job_update_exception:
        rx.logger.warning(
            'Unable to update status of job {} because {}'.format(
                job_uuid, job_update_exception))

    # If no other exit state has been encountered, report success
    rx.on_success('ManagedPipelineJob {} is managing Agave job {} ({} usec)'.format(
        job_uuid, agave_job_id, rx.elapsed()))
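# For reference, a minimal sketch of the message shape main() consumes,
# assembled from the keys the function reads above ('job_definition',
# 'parameters', 'data', 'instanced', 'index_patterns'). All field values
# below are hypothetical illustrations, not values from the test data set.
EXAMPLE_MESSAGE = {
    'job_definition': {
        'appId': 'example-app-0.1.0',  # hypothetical Agave appId; must match a Pipeline record id
        'notifications': []            # optional; created by main() if absent
    },
    'parameters': {
        'sample_id': 'sample.tacc.20001'  # hypothetical; string values are merged into job.data
    },
    'data': {},            # extra metadata passed to job.setup()
    'instanced': True,     # toggles instanced archive paths (defaults to True)
    'index_patterns': []   # forwarded to ManagedPipelineJob as archive_path_patterns
}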
@pytest.fixture
def client_w_archive_path(mongodb_settings, pipelinejobs_config, agave,
                          pipeline_uuid):
    return ManagedPipelineJob(mongodb_settings,
                              pipelinejobs_config,
                              agave=agave,
                              archive_path='/products/v2/test123')
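# A minimal sketch (hypothetical test name) of how the fixture above is
# consumed: pytest injects the constructed ManagedPipelineJob by parameter
# name, so the test body can assert against it directly.
def test_client_w_archive_path_sketch(client_w_archive_path):
    assert client_w_archive_path.archive_path.startswith('/products/v2')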