def job_harvest_tabular_data(request, org_id, record_group_id, hash_payload_filename=False):

    """
    Create a new Tabular Data Harvest Job
    """

    # retrieve record group
    record_group = RecordGroup.objects.filter(id=record_group_id).first()

    # get validation scenarios
    validation_scenarios = ValidationScenario.objects.all()

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # get record identifier transformation scenarios
    rits = RecordIdentifierTransformation.objects.all()

    # get all bulk downloads
    bulk_downloads = DPLABulkDataDownload.objects.all()

    # if GET, prepare form
    if request.method == 'GET':

        # render page
        return render(
            request,
            'core/job_harvest_tabular_data.html',
            {
                'record_group': record_group,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'bulk_downloads': bulk_downloads,
                'breadcrumbs': breadcrumb_parser(request)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=HarvestTabularDataJob,
            job_params=request.POST,
            files=request.FILES,
            hash_payload_filename=hash_payload_filename)

        # start job and update status
        job_status = cjob.start_job()

        # if job failed to start, mark as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group', org_id=org_id, record_group_id=record_group.id)
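# Note: a minimal sketch of how job_harvest_tabular_data might be wired in urls.py,
# showing how the hash_payload_filename kwarg could be supplied per-route. The path
# string and URL name below are assumptions for illustration; only the view signature
# above (org_id, record_group_id, hash_payload_filename) is taken from the code.
#
#   from django.urls import path
#   from core import views
#
#   urlpatterns = [
#       path('organization/<int:org_id>/record_group/<int:record_group_id>/job/harvest/tabular_data/new',
#            views.job_harvest_tabular_data,
#            {'hash_payload_filename': True},  # extra kwarg passed through to the view
#            name='job_harvest_tabular_data'),
#   ]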
def published(request, subset=None):

    """
    Published records
    """

    # get instance of Published model
    pub_records = PublishedRecords(subset=subset)

    # get field counts
    if pub_records.records.count() > 0:

        # get count of fields for all published job indices
        field_counts = pub_records.count_indexed_fields()

    else:
        field_counts = {}

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # get published subsets with PublishedRecords static method
    subsets = PublishedRecords.get_subsets()

    # loop through subsets and enrich
    for _ in subsets:

        # add counts
        counts = mc_handle.combine.misc.find_one(
            {'_id': 'published_field_counts_%s' % _['name']})

        # if counts not yet calculated, do now
        if counts is None:
            counts = PublishedRecords(
                subset=_['name']).count_indexed_fields()

        _['counts'] = counts

    # generate hierarchy_dict
    job_hierarchy = _stateio_prepare_job_hierarchy()

    return render(request, 'core/published.html', {
        'published': pub_records,
        'field_mappers': field_mappers,
        'xml2kvp_handle': xml2kvp.XML2kvp(),
        'field_counts': field_counts,
        'es_index_str': pub_records.esi.es_index_str,
        'subsets': subsets,
        'job_hierarchy_json': json.dumps(job_hierarchy),
        'job_hierarchy_json_subset': json.dumps(
            getattr(pub_records, 'ps_doc', {}).get('hierarchy', [])
        ),
        'breadcrumbs': breadcrumb_parser(request)
    })
def job_analysis(request):

    """
    Run new analysis job
    """

    # if GET, prepare form
    if request.method == 'GET':

        # retrieve jobs (limiting if needed)
        input_jobs = Job.objects.all()

        # limit if analysis_type set
        analysis_type = request.GET.get('type', None)
        subset = request.GET.get('subset', None)
        if analysis_type == 'published':

            # load PublishedRecords
            published = PublishedRecords(subset=subset)

            # define input_jobs
            input_jobs = published.published_jobs

        else:
            published = None

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get record identifier transformation scenarios
        rits = RecordIdentifierTransformation.objects.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # render page
        return render(
            request,
            'core/job_analysis.html',
            {
                'job_select_type': 'multiple',
                'input_jobs': input_jobs,
                'published': published,
                'validation_scenarios': validation_scenarios,
                'rits': rits,
                'field_mappers': field_mappers,
                'xml2kvp_handle': xml2kvp.XML2kvp(),
                'analysis_type': analysis_type,
                'bulk_downloads': bulk_downloads,
                'job_lineage_json': json.dumps(job_lineage)
            })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            # TODO: record_group=record_group,
            job_type_class=AnalysisJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # if job failed to start, mark as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('analysis')
def job_merge(request, org_id, record_group_id):

    """
    Merge multiple jobs into a single job
    """

    # retrieve record group
    record_group = RecordGroup.objects.get(pk=record_group_id)

    # if GET, prepare form
    if request.method == 'GET':

        # get scope of input jobs and retrieve
        input_job_scope = request.GET.get('scope', None)

        # if all jobs, retrieve all jobs
        if input_job_scope == 'all_jobs':
            input_jobs = Job.objects.exclude(
                job_type='AnalysisJob').all()

        # else, limit to RecordGroup
        else:
            input_jobs = record_group.job_set.all()

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get record identifier transformation scenarios
        rits = RecordIdentifierTransformation.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get job lineage for all jobs (filtered to input jobs scope)
        job_lineage = Job.get_all_jobs_lineage(jobs_query_set=input_jobs)

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # render page
        return render(request, 'core/job_merge.html', {
            'job_select_type': 'multiple',
            'record_group': record_group,
            'input_jobs': input_jobs,
            'input_job_scope': input_job_scope,
            'validation_scenarios': validation_scenarios,
            'rits': rits,
            'field_mappers': field_mappers,
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'job_lineage_json': json.dumps(job_lineage),
            'bulk_downloads': bulk_downloads,
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, submit job
    if request.method == 'POST':

        cjob = CombineJob.init_combine_job(
            user=request.user,
            record_group=record_group,
            job_type_class=MergeJob,
            job_params=request.POST)

        # start job and update status
        job_status = cjob.start_job()

        # if job failed to start, mark as failed
        if job_status is False:
            cjob.job.status = 'failed'
            cjob.job.save()

        return redirect('record_group', org_id=org_id, record_group_id=record_group.id)
def job_details(request, org_id, record_group_id, job_id):

    """
    View details for a single Job
    """

    LOGGER.debug('details for job id: %s', job_id)

    # get CombineJob
    cjob = CombineJob.get_combine_job(job_id)

    # update status
    cjob.job.update_status()

    # detailed record count
    record_count_details = cjob.job.get_detailed_job_record_count()

    # get job lineage
    job_lineage = cjob.job.get_lineage()

    # get dpla_bulk_data_match
    dpla_bulk_data_matches = cjob.job.get_dpla_bulk_data_matches()

    # check if limiting to one, pre-existing record
    get_q = request.GET.get('q', None)

    # job details and job type specific augment
    job_detail = cjob.job.job_details_dict

    # mapped field analysis, generate if not part of job_details
    if 'mapped_field_analysis' in job_detail.keys():
        field_counts = job_detail['mapped_field_analysis']
    else:
        if cjob.job.finished:
            field_counts = cjob.count_indexed_fields()
            cjob.job.update_job_details(
                {'mapped_field_analysis': field_counts}, save=True)
        else:
            LOGGER.debug('job not finished, not setting')
            field_counts = {}

    # TODO: What is this accomplishing?
    # OAI Harvest
    if isinstance(cjob, HarvestOAIJob):
        pass

    # Static Harvest
    elif isinstance(cjob, HarvestStaticXMLJob):
        pass

    # Transform
    elif isinstance(cjob, TransformJob):
        pass

    # Merge/Duplicate
    elif isinstance(cjob, MergeJob):
        pass

    # Analysis
    elif isinstance(cjob, AnalysisJob):
        pass

    # get published records, primarily for published sets
    pub_records = PublishedRecords()

    # get OAI set counts for this Job's records
    oai_sets = Record.objects(job_id=cjob.job.id).item_frequencies(field='oai_set')

    # get published subsets with PublishedRecords static method
    published_subsets = PublishedRecords.get_subsets()

    # loop through subsets and enrich
    for _ in published_subsets:

        # add counts
        counts = mc_handle.combine.misc.find_one(
            {'_id': 'published_field_counts_%s' % _['name']})

        # if counts not yet calculated, do now
        if counts is None:
            counts = PublishedRecords(
                subset=_['name']).count_indexed_fields()

        _['counts'] = counts

    # get field mappers
    field_mappers = FieldMapper.objects.all()

    # return
    return render(request, 'core/job_details.html', {
        'cjob': cjob,
        'record_group': cjob.job.record_group,
        'record_count_details': record_count_details,
        'field_counts': field_counts,
        'field_mappers': field_mappers,
        'xml2kvp_handle': xml2kvp.XML2kvp(),
        'job_lineage_json': json.dumps(job_lineage),
        'dpla_bulk_data_matches': dpla_bulk_data_matches,
        'q': get_q,
        'job_details': job_detail,
        'pr': pub_records,
        'published_subsets': published_subsets,
        'es_index_str': cjob.esi.es_index_str,
        'breadcrumbs': breadcrumb_parser(request),
        'oai_sets': dict(oai_sets)
    })
def job_update(request, org_id, record_group_id, job_id):

    """
    Update Job in one of several ways:
        - re-map and index
        - run new / different validations
    """

    # retrieve job
    cjob = CombineJob.get_combine_job(int(job_id))

    # if GET, prepare form
    if request.method == 'GET':

        # get validation scenarios
        validation_scenarios = ValidationScenario.objects.all()

        # get field mappers
        field_mappers = FieldMapper.objects.all()

        # get current field mapper configuration as JSON
        orig_fm_config_json = cjob.job.get_fm_config_json()

        # get all bulk downloads
        bulk_downloads = DPLABulkDataDownload.objects.all()

        # get update type from GET params
        update_type = request.GET.get('update_type', None)

        # render page
        return render(request, 'core/job_update.html', {
            'cjob': cjob,
            'update_type': update_type,
            'validation_scenarios': validation_scenarios,
            'field_mappers': field_mappers,
            'bulk_downloads': bulk_downloads,
            'xml2kvp_handle': xml2kvp.XML2kvp(),
            'orig_fm_config_json': orig_fm_config_json,
            'breadcrumbs': breadcrumb_parser(request)
        })

    # if POST, handle update
    if request.method == 'POST':

        LOGGER.debug('updating job')
        LOGGER.debug(request.POST)

        # retrieve job
        cjob = CombineJob.get_combine_job(int(job_id))

        # get update type
        update_type = request.POST.get('update_type', None)
        LOGGER.debug('running job update: %s', update_type)

        # handle re-index
        if update_type == 'reindex':

            # get preferred metadata index mapper
            fm_config_json = request.POST.get('fm_config_json')

            # init re-index
            cjob.reindex_bg_task(fm_config_json=fm_config_json)

            # set gms
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Re-Indexing Job:</strong><br>%s</p>'
                        '<p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, reverse('bg_tasks')),
                'class': 'success'
            })

            return redirect('job_details',
                            org_id=cjob.job.record_group.organization.id,
                            record_group_id=cjob.job.record_group.id,
                            job_id=cjob.job.id)

        # handle new validations
        if update_type == 'validations':

            # get requested validation scenarios
            validation_scenarios = request.POST.getlist(
                'validation_scenario', [])

            # get validations
            validations = ValidationScenario.objects.filter(
                id__in=[int(vs_id) for vs_id in validation_scenarios])

            # init bg task
            cjob.new_validations_bg_task([vs.id for vs in validations])

            # set gms
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Running New Validations for Job:</strong><br>%s<br>'
                        '<br><strong>Validation Scenarios:</strong><br>%s</p>'
                        '<p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, '<br>'.join([vs.name for vs in validations]), reverse('bg_tasks')),
                'class': 'success'
            })

            return redirect('job_details',
                            org_id=cjob.job.record_group.organization.id,
                            record_group_id=cjob.job.record_group.id,
                            job_id=cjob.job.id)

        # handle validation removal
        if update_type == 'remove_validation':

            # get validation scenario to remove
            jv_id = request.POST.get('jv_id', False)

            # initiate Combine BG Task
            cjob.remove_validation_bg_task(jv_id)

            # set gms
            validation_scenario = JobValidation.objects.get(
                pk=int(jv_id)).validation_scenario
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Removing Validation for Job:</strong><br>%s<br><br>'
                        '<strong>Validation Scenario:</strong><br>%s</p><p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, validation_scenario.name, reverse('bg_tasks')),
                'class': 'success'
            })

            return redirect('job_details',
                            org_id=cjob.job.record_group.organization.id,
                            record_group_id=cjob.job.record_group.id,
                            job_id=cjob.job.id)

        # handle DPLA Bulk Data comparison
        if update_type == 'dbdm':

            # get DPLA Bulk Data Download to compare against
            dbdd_id = request.POST.get('dbdd', False)

            # initiate Combine BG Task
            cjob.dbdm_bg_task(dbdd_id)

            # set gms
            dbdd = DPLABulkDataDownload.objects.get(pk=int(dbdd_id))
            gmc = GlobalMessageClient(request.session)
            gmc.add_gm({
                'html': '<p><strong>Running DPLA Bulk Data comparison for Job:</strong><br>%s<br><br>'
                        '<strong>Bulk Data S3 key:</strong><br>%s</p><p><a href="%s"><button type="button" '
                        'class="btn btn-outline-primary btn-sm">View Background Tasks</button></a></p>' % (
                            cjob.job.name, dbdd.s3_key, reverse('bg_tasks')),
                'class': 'success'
            })

            return redirect('job_details',
                            org_id=cjob.job.record_group.organization.id,
                            record_group_id=cjob.job.record_group.id,
                            job_id=cjob.job.id)

        # handle publish set update
        if update_type == 'publish_set':
            update_body = request.POST
            if update_body.get('publish_set_id', None):
                cjob.job.publish_set_id = update_body['publish_set_id']
            if update_body.get('existing_publish_set_id', None):
                cjob.job.publish_set_id = update_body['existing_publish_set_id']
            redirect_anchor = update_body.get('redirect_anchor', '')
            cjob.job.save()
            return redirect(reverse('job_details', args=[org_id, record_group_id, job_id]) + redirect_anchor)
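# Note: a minimal sketch of exercising the 'publish_set' branch of job_update above
# with Django's test client. The form field names (update_type, publish_set_id,
# redirect_anchor) come from the view itself; the URL name 'job_update', the login
# credentials, and the ids/values used below are assumptions for illustration.
#
#   from django.test import Client
#   from django.urls import reverse
#
#   client = Client()
#   client.login(username='combine', password='combine')  # hypothetical credentials
#   resp = client.post(
#       reverse('job_update', args=[org_id, record_group_id, job_id]),  # assumed URL name
#       {
#           'update_type': 'publish_set',
#           'publish_set_id': 'my_publish_set',  # hypothetical publish set identifier
#           'redirect_anchor': '#job_publish'    # hypothetical anchor on job_details
#       })
#
#   # expect a redirect back to job_details with the anchor appended
#   assert resp.status_code == 302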