def process_datachecked_db(self, dc_job_id, spec):
    """Poll a datacheck job until it finishes, then react to the outcome:

    * datachecks passed -> mark progress and submit the database copy
    * datachecks found problems -> email the submitter a link to the output
    * job still running -> retry this task later
    """
    # retry indefinitely until the datacheck job reaches a terminal state
    self.max_retries = None
    src_uri = spec['src_uri']
    in_progress = 'Datachecks in progress, please see: %sjobs/%s' % (cfg.dc_uri, dc_job_id)
    log_and_publish(make_report('INFO', in_progress, spec, src_uri))
    try:
        dc_result = dc_client.retrieve_job(dc_job_id)
    except Exception as e:
        err_msg = 'Handover failed, cannot retrieve datacheck job'
        log_and_publish(make_report('ERROR', err_msg, spec, src_uri))
        raise ValueError('Handover failed, cannot retrieve datacheck job %s' % e) from e
    status = dc_result['status']
    if status in ('incomplete', 'running', 'submitted'):
        log_and_publish(make_report('DEBUG', 'Datacheck Job incomplete, checking again later', spec, src_uri))
        raise self.retry()
    # job has finished: check results
    if status == 'failed':
        # datachecks ran but reported problems: tell the submitter where to look
        prob_msg = 'Datachecks found problems, you can download the output here: %sdownload_datacheck_outputs/%s' % (cfg.dc_uri, dc_job_id)
        log_and_publish(make_report('INFO', prob_msg, spec, src_uri))
        body = """ Running datachecks on %s completed but found problems. You can download the output here %s """ % (src_uri, cfg.dc_uri + "download_datacheck_outputs/" + str(dc_job_id))
        send_email(to_address=spec['contact'], subject='Datacheck found problems',
                   body=body, smtp_server=cfg.smtp_server)
    else:
        log_and_publish(make_report('INFO', 'Datachecks successful, starting copy', spec, src_uri))
        spec['progress_complete'] = 1
        submit_copy(spec)
def process_result(self, event, process, job_id):
    """Wait for completion of an event-handler job and process its output.

    Retries indefinitely while the job is still running. Once complete,
    logs success or failure and returns the event identifier.
    """
    # allow infinite retries
    self.max_retries = None
    genome = event['genome']
    checking_msg = 'Checking %s event %s' % (process, job_id)
    log_and_publish(make_report('INFO', checking_msg, event, genome))
    result = event_client.retrieve_job(process, job_id)
    if result['status'] in ('incomplete', 'running', 'submitted'):
        log_and_publish(
            make_report('INFO', 'Job incomplete, retrying', event, genome))
        raise self.retry()
    result_msg = 'Handling result for %s' % json.dumps(event)
    # BUG FIX: this previously logged 'Job incomplete, retrying' again
    # instead of the result-handling message computed above.
    log_and_publish(
        make_report('DEBUG', result_msg, event, genome))
    result_dump = json.dumps(result)
    if result['status'] == 'failure':
        log_and_publish(
            make_report('FATAL', 'Event failed: %s' % result_dump, event, genome))
    else:
        log_and_publish(
            make_report('INFO', 'Event succeeded: %s' % result_dump, event, genome))
    # TODO
    # 1. update metadata
    # 2. schedule new events as required
    return event['event_id']
def submit_metadata_update(spec):
    """Submit the handed-over database for loading into the metadata database.

    Returns a celery task identifier for the monitoring task.
    (Docstring fixed: it previously described this as a copy submission.)
    """
    src_uri = spec['src_uri']
    try:
        metadata_job_id = metadata_client.submit_job(
            spec['tgt_uri'], None, None, None, None,
            spec['contact'], spec['comment'], 'Handover', None)
    except Exception as e:
        log_and_publish(make_report('ERROR', 'Handover failed, cannot submit metadata job', spec, src_uri))
        raise ValueError('Handover failed, cannot submit metadata job %s' % e) from e
    spec['metadata_job_id'] = metadata_job_id
    # hand off monitoring of the metadata job to a celery task
    task_id = process_db_metadata.delay(metadata_job_id, spec)
    dbg_msg = 'Submitted DB for metadata loading %s' % task_id
    log_and_publish(make_report('DEBUG', dbg_msg, spec, src_uri))
    return task_id
def submit_copy(spec):
    """Submit the source database for copying to the target.

    Returns a celery task identifier for the monitoring task.
    """
    src_uri = spec['src_uri']
    try:
        copy_job_id = db_copy_client.submit_job(
            src_uri, spec['tgt_uri'], None, None, False, True, True, None, None)
    except Exception as e:
        log_and_publish(make_report('ERROR', 'Handover failed, cannot submit copy job', spec, src_uri))
        raise ValueError('Handover failed, cannot submit copy job %s' % e) from e
    spec['copy_job_id'] = copy_job_id
    # hand off monitoring of the copy job to a celery task
    task_id = process_copied_db.delay(copy_job_id, spec)
    dbg_msg = 'Submitted DB for copying as %s' % task_id
    # BUG FIX: this previously re-logged the 'cannot submit copy job' error
    # string instead of the submission message computed above.
    log_and_publish(make_report('DEBUG', dbg_msg, spec, src_uri))
    return task_id
def jobs(process):
    """ Endpoint to retrieve all the jobs results from the database
    ---
    tags:
      - jobs
    parameters:
      - name: process
        in: path
        type: string
        required: true
        default: 1
        description: process name
    operationId: jobs
    consumes:
      - application/json
    produces:
      - application/json
    security:
      jobs_auth:
        - 'write:jobs'
        - 'read:jobs'
    schemes: ['http', 'https']
    deprecated: false
    externalDocs:
      description: Project repository
      url: http://github.com/rochacbruno/flasgger
    responses:
      200:
        description: Retrieve all the jobs results from the database
        schema:
          $ref: '#/definitions/job_id'
    """
    # NOTE: the docstring above is a flasgger/swagger spec parsed at runtime;
    # do not edit it casually.
    log_and_publish(make_report('INFO', 'Retrieving jobs'))
    # fetch every result from the hive analysis backing this process
    return jsonify(get_hive(process).get_all_results(get_analysis(process)))
def results(process, job_id):
    """Fetch the result of a single hive job and return it as JSON.

    Responds with HTTP 404 when the job id is unknown.
    """
    msg = 'Retrieving job from %s with ID %s' % (process, job_id)
    log_and_publish(make_report('INFO', msg))
    hive = get_hive(process)
    try:
        payload = hive.get_result_for_job_id(job_id)
    except ValueError as e:
        # unknown job id -> surface as a 404
        raise HTTPRequestError(str(e), 404)
    return jsonify(payload)
def drop_current_databases(current_db_list, spec):
    """Drop databases from a previous assembly or previous genebuild
    (e.g: Wormbase) off the staging MySQL server.
    """
    tgt_uri = spec['tgt_uri']
    staging_uri = spec['staging_uri']
    tgt_url = make_url(tgt_uri)
    # The new database can share its name with one already on staging when the
    # assembly was renamed or the Wormbase genebuild version changed.
    # In that case DO NOT drop anything.
    if tgt_url.database in current_db_list:
        msg = 'The assembly or genebuild has been updated but the new database %s is the same as old one' % tgt_url.database
        log_and_publish(make_report('DEBUG', msg, spec, tgt_uri))
        return
    for db_name in current_db_list:
        db_uri = staging_uri + db_name
        if database_exists(db_uri):
            log_and_publish(make_report('INFO', 'Dropping %s' % db_uri, spec, tgt_uri))
            drop_database(db_uri)
def submit_event(spec, result):
    """Forward every event found in the job output to the event handler."""
    tgt_uri = spec['tgt_uri']
    events = result['output']['events']
    logger.debug(events)
    for event in events:
        logger.debug(event)
        payload = {'type': event['type'], 'genome': event['genome']}
        event_client.submit_job(payload)
        log_and_publish(make_report('DEBUG', 'Submitted event to event handler endpoint', spec, tgt_uri))
def test_make_report(self):
    """make_report assembles type, message, params and resource into a dict."""
    report = make_report('TEST', 'test_message', {'test_param': 'test'}, 'test_resource')
    self.assertEqual(
        {
            'params': {'test_param': 'test'},
            'resource': 'test_resource',
            'report_type': 'TEST',
            'msg': 'test_message',
        },
        report,
    )
def results_email(email, process, job_id):
    """Look up a hive job and return its results tagged with *email*.

    Responds with HTTP 404 when the job id is unknown.
    """
    info = 'Retrieving job with ID %s for %s' % (job_id, email)
    log_and_publish(make_report('INFO', info))
    hive = get_hive(process)
    try:
        # fetching the job first validates the id before asking for results
        job = hive.get_job_by_id(job_id)
        payload = hive.get_result_for_job_id(job_id)
    except ValueError as e:
        raise HTTPRequestError(str(e), 404)
    # TODO
    payload['email'] = email
    return jsonify(payload)
def process_copied_db(self, copy_job_id, spec):
    """Wait for a database copy to finish and respond accordingly:

    * on success, submit the database for metadata loading
      (GRCh37 handovers stop after the copy instead)
    * on failure, notify the submitter by email
    """
    # retry indefinitely until the copy job reaches a terminal state
    self.max_retries = None
    src_uri = spec['src_uri']
    in_progress = 'Copying in progress, please see: %s%s' % (cfg.copy_web_uri, copy_job_id)
    log_and_publish(make_report('INFO', in_progress, spec, src_uri))
    try:
        copy_result = db_copy_client.retrieve_job(copy_job_id)
    except Exception as e:
        log_and_publish(make_report('ERROR', 'Handover failed, cannot retrieve copy job', spec, src_uri))
        raise ValueError('Handover failed, cannot retrieve copy job %s' % e) from e
    status = copy_result['status']
    if status in ('incomplete', 'running', 'submitted'):
        log_and_publish(make_report('DEBUG', 'Database copy job incomplete, checking again later', spec, src_uri))
        raise self.retry()
    if status == 'failed':
        failed_msg = 'Copy failed, please see: %s%s' % (cfg.copy_web_uri, copy_job_id)
        log_and_publish(make_report('INFO', failed_msg, spec, src_uri))
        body = """ Copying %s to %s failed. Please see %s """ % (src_uri, spec['tgt_uri'], cfg.copy_web_uri + str(copy_job_id))
        send_email(to_address=spec['contact'], subject='Database copy failed',
                   body=body, smtp_server=cfg.smtp_server)
        return
    if 'GRCh37' in spec:
        # GRCh37 handovers end here: no metadata load is performed
        log_and_publish(make_report('INFO', 'Copying complete, Handover successful', spec, src_uri))
        spec['progress_complete'] = 2
    else:
        log_and_publish(make_report('INFO', 'Copying complete, submitting metadata job', spec, src_uri))
        spec['progress_complete'] = 2
        submit_metadata_update(spec)
def submit_job():
    """ Endpoint to submit an event to process
    ---
    tags:
      - jobs
    parameters:
      - in: body
        name: body
        description: event
        required: false
        schema:
          $ref: '#/definitions/submit'
    operationId: jobs
    consumes:
      - application/json
    produces:
      - application/json
    security:
      submit_auth:
        - 'write:submit'
        - 'read:submit'
    schemes: ['http', 'https']
    deprecated: false
    externalDocs:
      description: Project repository
      url: http://github.com/rochacbruno/flasgger
    """
    # guard clause: only JSON payloads are accepted
    if not json_pattern.match(request.headers['Content-Type']):
        raise HTTPRequestError('Could not handle input of type %s' % request.headers['Content-Type'])
    event = request.json
    results = {"processes": [], "event": event}
    # convert event to processes
    for process in get_processes_for_event(event):
        log_and_publish(make_report('DEBUG', 'Submitting process %s' % process))
        hive = get_hive(process)
        analysis = get_analysis(process)
        try:
            job = hive.create_job(analysis, {'event': event})
        except ValueError as e:
            raise HTTPRequestError(str(e), 404)
        # monitor each hive job asynchronously via celery
        event_task = process_result.delay(event, process, job.job_id)
        results['processes'].append({
            "process": process,
            "job": job.job_id,
            "task": event_task.id
        })
    return jsonify(results)
def process_db_metadata(self, metadata_job_id, spec):
    """Wait for the metadata load to complete and respond accordingly:

    * on success, clean up superseded databases, handle output events and
      finish the handover
    * on failure, drop the target database and flag the error by email
    """
    # retry indefinitely until the metadata job reaches a terminal state
    self.max_retries = None
    tgt_uri = spec['tgt_uri']
    loading_msg = 'Loading into metadata database, please see: %sjobs/%s' % (cfg.meta_uri, metadata_job_id)
    log_and_publish(make_report('INFO', loading_msg, spec, tgt_uri))
    try:
        meta_result = metadata_client.retrieve_job(metadata_job_id)
    except Exception as e:
        err_msg = 'Handover failed, Cannot retrieve metadata job'
        log_and_publish(make_report('ERROR', err_msg, spec, tgt_uri))
        raise ValueError('Handover failed, Cannot retrieve metadata job %s' % e) from e
    if meta_result['status'] in ('incomplete', 'running', 'submitted'):
        incomplete_msg = 'Metadata load Job incomplete, checking again later'
        log_and_publish(make_report('DEBUG', incomplete_msg, spec, tgt_uri))
        raise self.retry()
    if meta_result['status'] == 'failed':
        # metadata load failed: drop the half-loaded target DB and email the submitter
        log_and_publish(make_report('INFO', 'Dropping %s' % tgt_uri, spec, tgt_uri))
        drop_database(spec['tgt_uri'])
        failed_msg = 'Metadata load failed, please see %sjobs/%s?format=failures' % (cfg.meta_uri, metadata_job_id)
        log_and_publish(make_report('INFO', failed_msg, spec, tgt_uri))
        msg = """ Metadata load of %s failed. 
Please see %s """ % (tgt_uri, cfg.meta_uri + 'jobs/' + str(metadata_job_id) + '?format=failures')
        send_email(to_address=spec['contact'],
                   subject='Metadata load failed, please see: ' + cfg.meta_uri + 'jobs/' + str(metadata_job_id) + '?format=failures',
                   body=msg, smtp_server=cfg.smtp_server)
    else:
        # Clean up old assembly or old genebuild databases for Wormbase when
        # the database suffix has changed
        events = meta_result['output'].get('events')
        if events:
            for event in events:
                details = json.loads(event['details'])
                if 'current_database_list' in details:
                    drop_current_databases(details['current_database_list'], spec)
                if event['genome'] in blat_species and event['type'] == 'new_assembly':
                    # new assembly for a BLAT species needs a manual config update
                    msg = 'The following species %s has a new assembly, please update the port number for this species here and communicate to Web: https://github.com/Ensembl/ensembl-production/blob/master/modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/DumpCore_conf.pm#L107' % event['genome']
                    send_email(to_address=cfg.production_email,
                               subject='BLAT species list needs updating in FTP Dumps config',
                               body=msg)
        log_and_publish(make_report('INFO', 'Metadata load complete, Handover successful', spec, tgt_uri))
        spec['progress_complete'] = 3
def submit_dc(spec, src_url, db_type):
    """Submit the source database for datachecking.

    The datacheck group is chosen from the database type. Returns a celery
    task identifier for the monitoring task.

    Fix: spec key lookups were previously inside the try block, so a
    missing spec key made the except handler itself fail with a NameError
    on src_uri, masking the real error.
    """
    src_uri = spec['src_uri']
    tgt_uri = spec['tgt_uri']
    handover_token = spec['handover_token']
    try:
        server_url = 'mysql://%s@%s:%s/' % (src_url.username, src_url.host, src_url.port)
        submitting_dc_msg = 'Submitting DC for %s on server: %s' % (src_url.database, server_url)
        submitting_dc_report = make_report('DEBUG', submitting_dc_msg, spec, src_uri)
        if db_type == 'compara':
            log_and_publish(submitting_dc_report)
            dc_job_id = dc_client.submit_job(server_url, src_url.database, None, None,
                                             db_type, None, db_type, 'critical', None, handover_token)
        elif db_type == 'ancestral':
            # ancestral databases are checked with the core group
            log_and_publish(submitting_dc_report)
            dc_job_id = dc_client.submit_job(server_url, src_url.database, None, None,
                                             'core', None, 'ancestral', 'critical', None, handover_token)
        elif db_type in ['rnaseq', 'cdna', 'otherfeatures']:
            division_msg = 'division: %s' % get_division(src_uri, tgt_uri, db_type)
            log_and_publish(make_report('DEBUG', division_msg, spec, src_uri))
            log_and_publish(submitting_dc_report)
            dc_job_id = dc_client.submit_job(server_url, src_url.database, None, None,
                                             db_type, None, 'corelike', 'critical', None, handover_token)
        else:
            db_msg = 'src_uri: %s dbtype %s server_url %s' % (src_uri, db_type, server_url)
            log_and_publish(make_report('DEBUG', db_msg, spec, src_uri))
            division_msg = 'division: %s' % get_division(src_uri, tgt_uri, db_type)
            log_and_publish(make_report('DEBUG', division_msg, spec, src_uri))
            log_and_publish(submitting_dc_report)
            dc_job_id = dc_client.submit_job(server_url, src_url.database, None, None,
                                             db_type, None, db_type, 'critical', None, handover_token)
    except Exception as e:
        err_msg = 'Handover failed, Cannot submit dc job'
        log_and_publish(make_report('ERROR', err_msg, spec, src_uri))
        raise ValueError('Handover failed, Cannot submit dc job %s' % e) from e
    spec['dc_job_id'] = dc_job_id
    # hand off monitoring of the datacheck job to a celery task
    task_id = process_datachecked_db.delay(dc_job_id, spec)
    submitted_dc_msg = 'Submitted DB for checking as %s' % task_id
    log_and_publish(make_report('DEBUG', submitted_dc_msg, spec, src_uri))
    return task_id
def handover_database(spec):
    """ Method to accept a new database for incorporation into the system
    Argument is a dict with the following keys:
    * src_uri - URI to database to handover (required)
    * tgt_uri - URI to copy database to (optional - generated from staging and src_uri if not set)
    * contact - email address of submitter (required)
    * comment - additional information about submission (required)
    The following keys are added during the handover process:
    * handover_token - unique identifier for this particular handover invocation
    * dc_job_id - job ID for datacheck process
    * db_job_id - job ID for database copy process
    * metadata_job_id - job ID for the metadata loading process
    * progress_total - Total number of task to do
    * progress_complete - Total number of task completed
    """
    # TODO verify dict
    src_uri = spec['src_uri']
    # create unique identifier
    spec['handover_token'] = str(uuid.uuid1())
    spec['progress_total'] = 3
    if not database_exists(src_uri):
        msg = "Handover failed, %s does not exist" % src_uri
        log_and_publish(make_report('ERROR', msg, spec, src_uri))
        raise ValueError("%s does not exist" % src_uri)
    src_url = make_url(src_uri)
    # Scan database name and retrieve species or compara name, database type,
    # release number and assembly version
    db_prefix, db_type, assembly = parse_db_infos(src_url.database)
    # Reject database types that can no longer be handed over
    if db_type not in db_types_list:
        msg = "Handover failed, %s has been handed over after deadline. Please contact the Production team" % src_uri
        log_and_publish(make_report('ERROR', msg, spec, src_uri))
        raise ValueError(msg)
    # The database release must match the handover service release
    if db_type == 'compara':
        compara_release = get_release_compara(src_uri)
        if release != compara_release:
            msg = "Handover failed, %s database release version %s does not match handover service release version %s" % (src_uri, compara_release, release)
            log_and_publish(make_report('ERROR', msg, spec, src_uri))
            raise ValueError(msg)
    else:
        db_release = get_release(src_uri)
        if release != db_release:
            msg = "Handover failed, %s database release version %s does not match handover service release version %s" % (src_uri, db_release, release)
            log_and_publish(make_report('ERROR', msg, spec, src_uri))
            raise ValueError(msg)
    # Work out which staging server the database must be copied to
    spec, staging_uri, live_uri = check_staging_server(spec, db_type, db_prefix, assembly)
    if 'tgt_uri' not in spec:
        spec['tgt_uri'] = get_tgt_uri(src_url, staging_uri)
    # The database division must match the target staging server
    if db_type in ['compara', 'ancestral']:
        db_division = db_prefix
    else:
        db_division = get_division(src_uri, spec['tgt_uri'], db_type)
    if db_division not in allowed_divisions_list:
        raise ValueError('Database division %s does not match server division list %s' % (db_division, allowed_divisions_list))
    spec['staging_uri'] = staging_uri
    spec['progress_complete'] = 0
    log_and_publish(make_report('INFO', "Handling %s" % spec, spec, src_uri))
    submit_dc(spec, src_url, db_type)
    return spec['handover_token']