Example #1
def job_harvest_tabular_data(request,
                             org_id,
                             record_group_id,
                             hash_payload_filename=False):
    """
        Create a new static XML Harvest Job
        """

    # retrieve record group
    record_group = RecordGroup.objects.filter(id=record_group_id).first()

    # get validation scenarios
    validation_scenarios = ValidationScenario.objects.all()

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # get record identifier transformation scenarios
    rits = RecordIdentifierTransformation.objects.all()

    # get all bulk downloads
    bulk_downloads = DPLABulkDataDownload.objects.all()

    # if GET, prepare form
    if request.method == 'GET':
        # render page
        return render(
            request, 'core/job_harvest_tabular_data.html', {
                'record_group': record_group,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'bulk_downloads': bulk_downloads,
                'breadcrumbs': breadcrumb_parser(request)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=HarvestTabularDataJob,
            job_params=request.POST,
            files=request.FILES,
            hash_payload_filename=hash_payload_filename)

        # start job and update status
        job_status = cjob.start_job()

        # if job failed to start, mark job as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group',
                        org_id=org_id,
                        record_group_id=record_group.id)
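
A view like this only becomes reachable once it is wired into a URLconf. A minimal sketch, assuming the views live in core/views.py; the route string and URL name below are illustrative assumptions, not taken from the project:

# sketch only: the route string and name here are assumptions
from django.urls import path

from core import views

urlpatterns = [
    path('organization/<int:org_id>/record_group/<int:record_group_id>'
         '/job/harvest/tabular_data/new',
         views.job_harvest_tabular_data,
         name='job_harvest_tabular_data'),
]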
Example #2
def job_analysis(request):
    """
    Run new analysis job
    """

    # if GET, prepare form
    if request.method == 'GET':

        # retrieve jobs (limiting if needed)
        input_jobs = Job.objects.all()

        # limit if analysis_type set
        analysis_type = request.GET.get('type', None)
        subset = request.GET.get('subset', None)
        if analysis_type == 'published':

            # load PublishedRecords
            published = PublishedRecords(subset=subset)

            # define input_jobs
            input_jobs = published.published_jobs

        else:
            published = None

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get record identifier transformation scenarios
        rits = RecordIdentifierTransformation.objects.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # render page
        return render(
            request, 'core/job_analysis.html', {
                'job_select_type': 'multiple',
                'input_jobs': input_jobs,
                'published': published,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'analysis_type': analysis_type,
                'bulk_downloads': bulk_downloads,
                'job_lineage_json': json.dumps(job_lineage)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            # TODO: record_group=record_group,
            job_type_class=AnalysisJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # if job failed to start, mark job as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('analysis')
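
The GET branch is driven entirely by the type and subset query parameters. A quick way to exercise it with Django's test client; the 'analysis' URL name is taken from the redirect above, the rest is a sketch:

from django.test import Client
from django.urls import reverse

client = Client()

# request the analysis form scoped to published records; 'subset' is
# omitted, so the view falls back to PublishedRecords(subset=None)
response = client.get(reverse('analysis'), {'type': 'published'})
assert response.status_code == 200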
Example #3
def test_static_harvest(VO):
    '''
    Test static harvest of XML records from disk
    '''

    # copy test data to /tmp
    payload_dir = '/tmp/%s' % uuid.uuid4().hex
    shutil.copytree('/opt/combine/tests/data/static_harvest_data', payload_dir)

    # emulate request.POST
    request_dict = {
        'dbdd': '',
        'job_note': '',
        'xpath_record_id': '',
        'static_filepath': payload_dir,
        'fm_config_json':
        '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}',
        'static_payload': '',
        'job_name': '',
        'field_mapper': 'default',
        'rits': '',
        'additional_namespace_decs': 'xmlns:mods="http://www.loc.gov/mods/v3"',
        'document_element_root': 'mods:mods'
    }
    query_dict = QueryDict('', mutable=True)
    query_dict.update(request_dict)

    # init job, using Variable Object (VO)
    cjob = CombineJob.init_combine_job(user=VO.user,
                                       record_group=VO.rg,
                                       job_type_class=HarvestStaticXMLJob,
                                       job_params=query_dict,
                                       files={},
                                       hash_payload_filename=False)

    # start job and update status
    job_status = cjob.start_job()

    # if job failed to start, mark job as failed
    if job_status is False:
        cjob.job.status = 'failed'
        cjob.job.save()

    # poll until complete, up to 480 seconds
    for _ in range(480):

        # pause between polls
        time.sleep(1)

        # refresh job status
        cjob.job.update_status()

        # stop polling once the job is available
        if cjob.job.status == 'available':
            break

    # save static harvest job to VO
    VO.static_harvest_cjob = cjob

    # remove payload_dir
    shutil.rmtree(payload_dir)

    # assert job is done and available via livy
    assert VO.static_harvest_cjob.job.status == 'available'

    # assert record count is 250
    assert VO.static_harvest_cjob.job.record_count == 250

    # assert no indexing failures
    assert len(VO.static_harvest_cjob.get_indexing_failures()) == 0
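
The poll-until-available loop above reappears verbatim in the tests that follow. A refactoring sketch; the helper name is my own, not part of Combine:

import time


def poll_until_available(cjob, timeout=480, interval=1):
    '''
    Poll a CombineJob until its status is 'available' or until
    timeout seconds elapse; returns True on success, False on timeout.
    '''
    for _ in range(timeout // interval):
        time.sleep(interval)
        cjob.job.update_status()
        if cjob.job.status == 'available':
            return True
    return False

Each test's loop would then collapse to assert poll_until_available(cjob).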
Example #4
def test_merge_duplicate(VO):
    '''
    Duplicate Transform job, applying newly created validation scenarios
    '''

    # emulate request.POST
    request_dict = {
        'dbdd': '',
        'field_mapper': 'default',
        'filter_dupe_record_ids': 'true',
        'fm_config_json':
        '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}',
        'input_es_query_valve': '',
        'input_numerical_valve': '',
        'input_validity_valve': 'all',
        'job_name': '',
        'job_note': '',
        'rits': ''
    }
    query_dict = QueryDict('', mutable=True)
    query_dict.update(request_dict)

    # set input jobs with QueryDict.setlist
    query_dict.setlist(
        'input_job_id',
        [VO.static_harvest_cjob.job.id, VO.static_transform_cjob.job.id])
    # set validation scenarios with QueryDict.setlist
    query_dict.setlist('validation_scenario', [
        VO.schematron_validation_scenario.id, VO.python_validation_scenario.id
    ])

    # init job
    cjob = CombineJob.init_combine_job(user=VO.user,
                                       record_group=VO.rg,
                                       job_type_class=MergeJob,
                                       job_params=query_dict)

    # start job and update status
    job_status = cjob.start_job()

    # if job failed to start, mark job as failed
    if job_status is False:
        cjob.job.status = 'failed'
        cjob.job.save()

    # poll until complete, up to 480 seconds
    for _ in range(480):

        # pause between polls
        time.sleep(1)

        # refresh job status
        cjob.job.update_status()

        # stop polling once the job is available
        if cjob.job.status == 'available':
            break

    # save merge job to VO
    VO.merge_cjob = cjob

    # assert job is done and available via livy
    assert VO.merge_cjob.job.status == 'available'

    # assert record count is 250
    assert VO.merge_cjob.job.record_count == 250

    # assert validation scenarios applied
    job_validation_scenarios = VO.merge_cjob.job.jobvalidation_set.all()
    assert job_validation_scenarios.count() == 2

    # loop through validation scenarios and confirm that both show 232 failures
    for jv in job_validation_scenarios:
        assert jv.get_record_validation_failures().count() == 232

    # assert no indexing failures
    assert len(VO.merge_cjob.get_indexing_failures()) == 0
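
QueryDict.setlist is what carries several values under one key, mirroring a repeated form field in a real POST; plain dict assignment would keep only a single value. A quick illustration:

from django.http import QueryDict

qd = QueryDict('', mutable=True)
qd.setlist('input_job_id', [1, 2])

qd.getlist('input_job_id')  # [1, 2] -- every value stored for the key
qd['input_job_id']          # 2 -- indexing returns only the last value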
Example #5
def test_static_transform(VO):
    '''
    Test transformation of statically harvested XML records
    '''

    # prepare and capture temporary transformation scenario
    VO.transformation_scenario = prepare_transform()

    # emulate request.POST
    request_dict = {
        'dbdd': '',
        'field_mapper': 'default',
        'filter_dupe_record_ids': 'true',
        'fm_config_json':
        '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}',
        'input_es_query_valve': '',
        'input_job_id': VO.static_harvest_cjob.job.id,
        'input_numerical_valve': '',
        'input_validity_valve': 'all',
        'job_name': '',
        'job_note': '',
        'rits': '',
        'sel_trans_json':
        '[{"index":0,"trans_id":%s}]' % VO.transformation_scenario.id
    }
    query_dict = QueryDict('', mutable=True)
    query_dict.update(request_dict)

    # init job
    cjob = CombineJob.init_combine_job(user=VO.user,
                                       record_group=VO.rg,
                                       job_type_class=TransformJob,
                                       job_params=query_dict)

    # start job and update status
    job_status = cjob.start_job()

    # if job failed to start, mark job as failed
    if job_status is False:
        cjob.job.status = 'failed'
        cjob.job.save()

    # poll until complete, up to 480 seconds
    for _ in range(480):

        # pause between polls
        time.sleep(1)

        # refresh job status
        cjob.job.update_status()

        # stop polling once the job is available
        if cjob.job.status == 'available':
            break

    # save static transform job to VO
    VO.static_transform_cjob = cjob

    # assert job is done and available via livy
    assert VO.static_transform_cjob.job.status == 'available'

    # assert record count is 250
    assert VO.static_transform_cjob.job.record_count == 250

    # assert no indexing failures
    assert len(VO.static_transform_cjob.get_indexing_failures()) == 0

    # remove transformation
    assert VO.transformation_scenario.delete()[0] > 0
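
The sel_trans_json payload is assembled above by string interpolation. An equivalent sketch using json.dumps, which keeps the payload valid JSON if more fields are ever added:

import json

# equivalent to the '%s' interpolation in request_dict above
sel_trans_json = json.dumps(
    [{'index': 0, 'trans_id': VO.transformation_scenario.id}])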
Example #6
def job_merge(request, org_id, record_group_id):
    """
        Merge multiple jobs into a single job
        """

    # retrieve record group
    record_group = RecordGroup.objects.get(pk=record_group_id)

    # if GET, prepare form
    if request.method == 'GET':

        # get scope of input jobs and retrieve
        input_job_scope = request.GET.get('scope', None)

        # if all jobs, retrieve all jobs
        if input_job_scope == 'all_jobs':
            input_jobs = Job.objects.exclude(job_type='AnalysisJob')

        # else, limit to RecordGroup
        else:
            input_jobs = record_group.job_set.all()

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get record identifier transformation scenarios
        rits = RecordIdentifierTransformation.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # render page
        return render(request, 'core/job_merge.html', {
            'job_select_type': 'multiple',
            'record_group': record_group,
            'input_jobs': input_jobs,
            'input_job_scope': input_job_scope,
            'validation_scenarios': validation_scenarios,
            'rits': rits,
            'field_mappers': field_mappers,
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'job_lineage_json': json.dumps(job_lineage),
            'bulk_downloads': bulk_downloads,
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=MergeJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # if job failed to start, mark job as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group', org_id=org_id, record_group_id=record_group.id)
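
Note the contrast with Example #1: filter(...).first() returns None for an unknown id and lets the view proceed, while get(pk=...) here raises RecordGroup.DoesNotExist, surfacing as a 500. An alternative sketch that turns the miss into a 404 instead:

from django.shortcuts import get_object_or_404

# respond with 404 for an unknown record group id instead of an
# unhandled RecordGroup.DoesNotExist exception
record_group = get_object_or_404(RecordGroup, pk=record_group_id)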