def create_validation_report(ct_id):

    '''
    Generate a Validation Report for a Job as a background task.

    Coalesces partitioned Spark output into a single report file
    (csv/tsv), optionally compresses it, and records the outcome on the
    CombineBackgroundTask.

    Args:
        ct_id (int/str): primary key of the CombineBackgroundTask driving
            this report; task_params must include job_id, report_name,
            report_format, mapped_field_include, and compression_type

    Returns:
        None: results (or error) are saved to ct.task_output_json
    '''

    # get CombineTask (ct)
    ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))

    # get CombineJob
    cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))
    logger.info(ct.task_params)

    try:

        # check for livy session
        _check_livy_session()

        # set output path
        output_path = '/tmp/%s' % uuid.uuid4().hex

        # generate spark code
        spark_code = "from console import *\ngenerate_validation_report(spark, '%(output_path)s', %(task_params)s)" % {
            'output_path': output_path,
            'task_params': ct.task_params
        }
        logger.info(spark_code)

        # submit to livy
        logger.info('submitting code to Spark')
        submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code})

        # poll until complete
        logger.info('polling for Spark job to complete...')
        results = polling.poll(
            lambda: models.LivyClient().job_status(submit.headers['Location']).json(),
            check_success=spark_job_done,
            step=5,
            poll_forever=True)
        logger.info(results)

        # set archive filename of loose XML files
        archive_filename_root = '/tmp/%s.%s' % (ct.task_params['report_name'], ct.task_params['report_format'])

        # loop through partitioned parts, coalesce and write to single file
        logger.info('coalescing output parts')

        # glob parts
        export_parts = glob.glob('%s/part*' % output_path)
        logger.info('found %s documents to group' % len(export_parts))

        # if output not found, record error and exit
        if len(export_parts) == 0:
            ct.task_output_json = json.dumps({
                'error': 'no output found',
                'spark_output': results
            })
            ct.save()

        # else, continue
        else:

            # set report_format
            report_format = ct.task_params['report_format']

            # open new file for writing and loop through files
            with open(archive_filename_root, 'w') as fout, fileinput.input(export_parts) as fin:

                # if CSV or TSV, write first line of headers
                if report_format == 'csv':
                    header_string = 'db_id,record_id,validation_scenario_id,validation_scenario_name,results_payload,fail_count'
                    if len(ct.task_params['mapped_field_include']) > 0:
                        header_string += ',' + ','.join(ct.task_params['mapped_field_include'])
                    fout.write('%s\n' % header_string)

                if report_format == 'tsv':
                    header_string = 'db_id\trecord_id\tvalidation_scenario_id\tvalidation_scenario_name\tresults_payload\tfail_count'
                    if len(ct.task_params['mapped_field_include']) > 0:
                        header_string += '\t' + '\t'.join(ct.task_params['mapped_field_include'])
                    fout.write('%s\n' % header_string)

                # loop through output and write
                for line in fin:
                    fout.write(line)

            # removing partitioned output
            logger.info('removing dir: %s' % output_path)
            shutil.rmtree(output_path)

            # optionally, compress file
            if ct.task_params['compression_type'] == 'none':
                logger.info('no compression requested, continuing')
                output_filename = archive_filename_root

            elif ct.task_params['compression_type'] == 'zip':

                logger.info('creating compressed zip archive')
                report_format = 'zip'

                # establish output archive file
                output_filename = '%s.zip' % (archive_filename_root)

                # NOTE: renamed context var from `zip` to avoid shadowing the builtin
                with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zip_archive:
                    zip_archive.write(archive_filename_root, archive_filename_root.split('/')[-1])

            # tar.gz
            elif ct.task_params['compression_type'] == 'targz':

                logger.info('creating compressed tar archive')
                report_format = 'targz'

                # establish output archive file
                output_filename = '%s.tar.gz' % (archive_filename_root)

                with tarfile.open(output_filename, 'w:gz') as tar:
                    tar.add(archive_filename_root, arcname=archive_filename_root.split('/')[-1])

            # fail fast on an unknown compression type; previously this fell
            # through and raised an opaque NameError on output_filename below
            else:
                raise ValueError('unknown compression_type: %s' % ct.task_params['compression_type'])

            # save validation report output to Combine Task output
            ct.task_output_json = json.dumps({
                'report_format': report_format,
                'mapped_field_include': ct.task_params['mapped_field_include'],
                'output_dir': output_path,
                'output_filename': output_filename,
                'results': results
            })
            ct.save()

    except Exception as e:

        logger.info(str(e))

        # attempt to capture error and return for task
        ct.task_output_json = json.dumps({
            'error': str(e)
        })
        ct.save()
def job_new_validations(ct_id):

    '''
    Run new validation scenarios against a Job as a background task.

    - submit livy job and poll until complete
    - use livy session from cjob (works, but awkward way to get this)

    Args:
        ct_id (int/str): primary key of the CombineBackgroundTask; task_params
            must include job_id and validation_scenarios

    Returns:
        None: results (or error) are saved to ct.task_output_json
    '''

    # get CombineTask (ct) BEFORE the try block: the except handler writes to
    # ct, so fetching it inside the try (e.g. after _check_livy_session raised)
    # produced a NameError that masked the original error
    ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
    logger.info('using %s' % ct)

    try:

        # check for livy session
        _check_livy_session()

        # get CombineJob
        cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

        # generate spark code
        spark_code = 'from jobs import RunNewValidationsSpark\nRunNewValidationsSpark(spark, job_id="%(job_id)s", validation_scenarios="%(validation_scenarios)s").spark_function()' % {
            'job_id': cjob.job.id,
            'validation_scenarios': str([int(vs_id) for vs_id in ct.task_params['validation_scenarios']]),
        }
        logger.info(spark_code)

        # submit to livy
        logger.info('submitting code to Spark')
        submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code})

        # poll until complete
        logger.info('polling for Spark job to complete...')
        results = polling.poll(
            lambda: models.LivyClient().job_status(submit.headers['Location']).json(),
            check_success=spark_job_done,
            step=5,
            poll_forever=True)
        logger.info(results)

        # loop through validation jobs, and remove from DB if share validation scenario
        cjob.job.remove_validation_jobs(
            validation_scenarios=[int(vs_id) for vs_id in ct.task_params['validation_scenarios']])

        # update job_details
        cjob.job.refresh_from_db()

        # remove validation results
        cjob.job.job_details = json.dumps(
            {k: v for k, v in cjob.job.job_details_dict.items() if k != 'validation_results'})
        cjob.job.save()

        # update scenarios
        validation_scenarios = cjob.job.job_details_dict['validation_scenarios']
        validation_scenarios.extend(ct.task_params['validation_scenarios'])
        cjob.job.update_job_details({
            'validation_scenarios': validation_scenarios
        }, save=True)

        # write validation links
        logger.info('writing validations job links')
        for vs_id in ct.task_params['validation_scenarios']:
            val_job = models.JobValidation(
                job=cjob.job,
                validation_scenario=models.ValidationScenario.objects.get(pk=vs_id)
            )
            val_job.save()

        # update failure counts
        logger.info('updating failure counts for new validation jobs')
        for jv in cjob.job.jobvalidation_set.filter(failure_count=None):
            jv.validation_failure_count(force_recount=True)

        # save export output to Combine Task output
        ct.refresh_from_db()
        ct.task_output_json = json.dumps({
            'run_new_validations': results
        })
        ct.save()

    except Exception as e:

        logger.info(str(e))

        # attempt to capture error and return for task
        ct.task_output_json = json.dumps({
            'error': str(e)
        })
        ct.save()
def job_reindex(ct_id):

    '''
    Background task to re-index a Job.

    - submit livy job and poll until complete
    - use livy session from cjob (works, but awkward way to get this)

    Args:
        ct_id (int/str): primary key of the CombineBackgroundTask; task_params
            must include job_id and fm_config_json

    Returns:
        None: results (or error) are saved to ct.task_output_json
    '''

    # get CombineTask (ct) BEFORE the try block: the except handler writes to
    # ct, so fetching it inside the try (e.g. after _check_livy_session raised)
    # produced a NameError that masked the original error
    ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
    logger.info('using %s' % ct)

    try:

        # check for livy session
        _check_livy_session()

        # get CombineJob
        cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

        # drop Job's ES index
        cjob.job.drop_es_index(clear_mapped_field_analysis=False)

        # drop previous index mapping failures
        cjob.job.remove_mapping_failures_from_db()

        # generate spark code
        spark_code = 'from jobs import ReindexSparkPatch\nReindexSparkPatch(spark, job_id="%(job_id)s", fm_config_json=\'\'\'%(fm_config_json)s\'\'\').spark_function()' % {
            'job_id': cjob.job.id,
            'fm_config_json': ct.task_params['fm_config_json']
        }

        # submit to livy
        logger.info('submitting code to Spark')
        submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code})

        # poll until complete
        logger.info('polling for Spark job to complete...')
        results = polling.poll(
            lambda: models.LivyClient().job_status(submit.headers['Location']).json(),
            check_success=spark_job_done,
            step=5,
            poll_forever=True)
        logger.info(results)

        # get new mapping
        mapped_field_analysis = cjob.count_indexed_fields()
        cjob.job.update_job_details({
            'field_mapper_config': json.loads(ct.task_params['fm_config_json']),
            'mapped_field_analysis': mapped_field_analysis
        }, save=True)

        # save export output to Combine Task output
        ct.refresh_from_db()
        ct.task_output_json = json.dumps({
            'reindex_results': results
        })
        ct.save()

    except Exception as e:

        logger.info(str(e))

        # attempt to capture error and return for task
        ct.task_output_json = json.dumps({
            'error': str(e)
        })
        ct.save()
def export_documents(ct_id):

    '''
    Export a Job's (or all published) documents as XML, as a background task.

    - submit livy job and poll until complete
    - use livy session from cjob (works, but awkward way to get this)
    - add wrapper element to file parts
    - rename file parts
    - tar/zip together

    Args:
        ct_id (int/str): primary key of the CombineBackgroundTask; task_params
            must include either job_id (single Job) or published (published
            records), and may include s3 export settings

    Returns:
        None: results (or error) are saved to ct.task_output_json
    '''

    # get CombineBackgroundTask
    ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
    logger.info('using %s' % ct)

    # generate spark code
    output_path = '/tmp/%s' % str(uuid.uuid4())

    # sentinel: set by exactly one of the branches below
    cjob = None

    # handle single Job
    if 'job_id' in ct.task_params.keys():

        # get CombineJob
        cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

        # set archive filename of loose XML files
        archive_filename_root = 'j_%s_documents' % cjob.job.id

        # build job_dictionary
        job_dict = {'j%s' % cjob.job.id: [cjob.job.id]}
        logger.info(job_dict)

    # handle published records
    if 'published' in ct.task_params.keys():

        # set archive filename of loose XML files
        archive_filename_root = 'published_documents'

        # get anonymous CombineJob
        cjob = models.CombineJob()

        # get published records to determine sets
        pr = models.PublishedRecords(subset=ct.task_params['subset'])

        # init job dictionary
        job_dict = {}

        # handle published jobs with publish set ids
        for publish_id, jobs in pr.sets.items():
            job_dict[publish_id] = [job.id for job in jobs]

        # handle "loose" Jobs
        job_dict['no_publish_set_id'] = [job.id for job in pr.published_jobs.filter(publish_set_id='')]

        # debug
        logger.info(job_dict)

    # fail fast with a clear message if neither branch matched; previously this
    # surfaced as an opaque NameError on archive_filename_root / job_dict below
    if cjob is None:
        raise ValueError("task_params must include either 'job_id' or 'published'")

    # update task params
    ct.refresh_from_db()
    ct.update_task_params({
        'output_path': output_path,
        'archive_filename_root': archive_filename_root,
        'job_dict': job_dict
    })

    # prepare spark code
    spark_code = "import math,uuid\nfrom console import *\nexport_records_as_xml(spark, %d)" % (int(ct_id))
    logger.info(spark_code)

    try:

        # check for livy session
        _check_livy_session()

        # submit to livy
        logger.info('submitting code to Spark')
        submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code})

        # poll until complete
        logger.info('polling for Spark job to complete...')
        results = polling.poll(
            lambda: models.LivyClient().job_status(submit.headers['Location']).json(),
            check_success=spark_job_done,
            step=5,
            poll_forever=True)
        logger.info(results)

        # handle s3 bucket
        if ct.task_params.get('s3_export', False):

            if ct.task_params.get('s3_export_type') == 'archive':

                logger.debug('writing archive file to S3')

                # create single archive file
                ct = _create_export_documents_archive(ct)

                # upload to s3
                s3 = boto3.resource(
                    's3',
                    aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
                    aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
                s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key'])\
                    .put(Body=open(ct.task_params['export_output_archive'], 'rb'))

                # delete all traces from local output
                shutil.rmtree(ct.task_params['output_path'])

            elif ct.task_params.get('s3_export_type') == 'spark_df':
                logger.debug('s3 export type was spark_df, nothing to cleanup or do')

            # save export output to Combine Task output
            ct.refresh_from_db()
            ct.task_output_json = json.dumps({
                's3_export_type': ct.task_params['s3_export_type'],
                'export_output': 's3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')),
            })
            ct.save()
            logger.info(ct.task_output_json)

        # handle local filesystem
        else:

            # create single archive file
            ct = _create_export_documents_archive(ct)

            # save export output to Combine Task output
            ct.refresh_from_db()
            ct.task_output_json = json.dumps({
                'export_output': ct.task_params['export_output_archive'],
                'name': ct.task_params['export_output_archive'].split('/')[-1],
                'content_type': ct.task_params['content_type'],
                'export_dir': "/".join(ct.task_params['export_output_archive'].split('/')[:-1])
            })
            ct.save()
            logger.info(ct.task_output_json)

    except Exception as e:

        logger.info(str(e))

        # attempt to capture error and return for task
        ct.task_output_json = json.dumps({
            'error': str(e)
        })
        ct.save()
def job_dbdm(ct_id):

    '''
    Background task to run DPLA Bulk Data Match (DBDM) against a Job.

    Resets the dbdm flag on all of the Job's Records, runs the match in
    Spark, and stamps the dbdd used into the Job's job_details.

    Args:
        ct_id (int/str): primary key of the CombineBackgroundTask; task_params
            must include job_id and dbdd_id

    Returns:
        None: results (or error) are saved to ct.task_output_json
    '''

    # get CombineTask (ct) BEFORE the try block: the except handler writes to
    # ct, so fetching it inside the try (e.g. after _check_livy_session raised)
    # produced a NameError that masked the original error
    ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
    logger.info('using %s' % ct)

    try:

        # check for livy session
        _check_livy_session()

        # get CombineJob
        cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

        # set dbdm as False for all Records in Job (result was previously
        # bound to an unused local)
        models.mc_handle.combine.record.update_many(
            {'job_id': cjob.job.id}, {'$set': {'dbdm': False}}, upsert=False)

        # generate spark code
        spark_code = 'from jobs import RunDBDM\nRunDBDM(spark, job_id="%(job_id)s", dbdd_id=%(dbdd_id)s).spark_function()' % {
            'job_id': cjob.job.id,
            'dbdd_id': int(ct.task_params['dbdd_id'])
        }
        logger.info(spark_code)

        # submit to livy
        logger.info('submitting code to Spark')
        submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code})

        # poll until complete
        logger.info('polling for Spark job to complete...')
        results = polling.poll(
            lambda: models.LivyClient().job_status(submit.headers['Location']).json(),
            check_success=spark_job_done,
            step=5,
            poll_forever=True)
        logger.info(results)

        # update job_details
        cjob.job.refresh_from_db()

        # get dbdd
        dbdd = models.DPLABulkDataDownload.objects.get(pk=int(ct.task_params['dbdd_id']))
        cjob.job.update_job_details({
            'dbdm': {
                'dbdd': int(ct.task_params['dbdd_id']),
                'dbdd_s3_key': dbdd.s3_key,
                'matches': None,
                'misses': None
            }
        })

        # save export output to Combine Task output
        ct.refresh_from_db()
        ct.task_output_json = json.dumps({
            'job_id': ct.task_params['job_id'],
            'dbdd_id': ct.task_params['dbdd_id'],
            'dbdd_results': results
        })
        ct.save()
        logger.info(ct.task_output_json)

    except Exception as e:

        logger.info(str(e))

        # attempt to capture error and return for task
        ct.task_output_json = json.dumps({
            'error': str(e)
        })
        ct.save()
def job_remove_validation(ct_id):

    '''
    Task to remove a validation, and all failures, from a Job.

    Args:
        ct_id (int/str): primary key of the CombineBackgroundTask; task_params
            must include job_id and jv_id (JobValidation to remove)

    Returns:
        None: results (or error) are saved to ct.task_output_json
    '''

    # get CombineTask (ct) BEFORE the try block: the except handler writes to
    # ct, so fetching it inside the try (e.g. after _check_livy_session raised)
    # produced a NameError that masked the original error
    ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
    logger.info('using %s' % ct)

    try:

        # check for livy session
        _check_livy_session()

        # get CombineJob
        cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

        # get Job Validation and delete
        jv = models.JobValidation.objects.get(pk=int(ct.task_params['jv_id']))

        # delete validation failures associated with Validation Scenario and Job
        delete_results = jv.delete_record_validation_failures()

        # update valid field in Records via Spark
        # generate spark code
        spark_code = 'from jobs import RemoveValidationsSpark\nRemoveValidationsSpark(spark, job_id="%(job_id)s", validation_scenarios="%(validation_scenarios)s").spark_function()' % {
            'job_id': cjob.job.id,
            'validation_scenarios': str([jv.validation_scenario.id]),
        }
        logger.info(spark_code)

        # submit to livy
        logger.info('submitting code to Spark')
        submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code})

        # poll until complete
        logger.info('polling for Spark job to complete...')
        results = polling.poll(
            lambda: models.LivyClient().job_status(submit.headers['Location']).json(),
            check_success=spark_job_done,
            step=5,
            poll_forever=True)
        logger.info(results)

        # remove Job Validation from job_details
        cjob.job.refresh_from_db()

        # remove validation results
        cjob.job.job_details = json.dumps(
            {k: v for k, v in cjob.job.job_details_dict.items() if k != 'validation_results'})
        cjob.job.save()

        validation_scenarios = cjob.job.job_details_dict['validation_scenarios']
        if jv.validation_scenario.id in validation_scenarios:
            validation_scenarios.remove(jv.validation_scenario.id)
        cjob.job.update_job_details({
            'validation_scenarios': validation_scenarios
        }, save=True)

        # save export output to Combine Task output
        ct.refresh_from_db()
        ct.task_output_json = json.dumps({
            'delete_job_validation': str(jv),
            'validation_failures_removed_': delete_results
        })
        ct.save()

        # remove job validation link
        jv.delete()

    except Exception as e:

        logger.info(str(e))

        # attempt to capture error and return for task
        ct.task_output_json = json.dumps({
            'error': str(e)
        })
        ct.save()