def _execute_and_wait_for_job(job_data):
    """Run a job synchronously and turn its outcome into an HTTP response.

    The job is created through ``JobsPersistence``, polled until its status
    becomes "finished" or "error" (or until the polling budget derived from
    ``REQUEST_TIMEOUT`` is used up) and is then deleted again.

    Args:
        job_data: Job record, passed unchanged to ``JobsPersistence.create``.

    Returns:
        A Flask response:

        * 200 with the raw bytes of the single result object read from S3
          (mimetype taken from the result's "type" field) if the job finished;
        * 400 if the finished job did not yield exactly one result;
        * the job's own "http_code" with its error code and message if the
          job ended with status "error";
        * 408 if the job did not complete within the polling budget.
    """
    job_id = JobsPersistence.create(job_data)

    # `job` stays None when not a single check is performed (REQUEST_TIMEOUT
    # shorter than one polling period); that case is reported as a timeout
    # instead of failing with a NameError.
    job = None

    # wait for job to finish:
    with beeline.tracer("waiting for workers"):
        period = 0.5  # check every X seconds
        n_checks = int(REQUEST_TIMEOUT / period)
        for _ in range(n_checks):
            job = JobsPersistence.get_by_id(job_id)
            if job["current_status"] in ("finished", "error"):
                break
            # sleep for the same period n_checks was computed from, so the
            # total waiting time keeps matching REQUEST_TIMEOUT
            time.sleep(period)

    JobsPersistence.delete(job_id)

    status = job["current_status"] if job is not None else None

    if status == "finished":
        results = json.loads(job["results"])
        if len(results) != 1:
            return flask.make_response(
                jsonify(
                    id=None,
                    code=400,
                    message="This endpoint can only succeed if process graph yields exactly one result, instead it received: {}."
                    .format(len(results)),
                    links=[]),
                400)

        s3 = boto3.client(
            's3',
            endpoint_url=S3_LOCAL_URL,
            region_name="eu-central-1",
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        )

        result = results[0]
        # results are stored under "<job_id>/<file name>" in the bucket
        object_key = '{}/{}'.format(job_id, os.path.basename(result["filename"]))
        s3_object = s3.get_object(Bucket=RESULTS_S3_BUCKET_NAME, Key=object_key)
        content = s3_object['Body'].read()

        response = flask.make_response(content, 200)
        response.mimetype = result["type"]
        return response

    if status == "error":
        return flask.make_response(
            jsonify(id=None,
                    code=job["error_code"],
                    message=job["error_msg"],
                    links=[]),
            job["http_code"])

    return flask.make_response(
        jsonify(id=None,
                code="Timeout",
                message="Request timed out.",
                links=[]),
        408)
def api_jobs():
    """List the stored jobs (GET) or create a new job (POST).

    GET returns a ``(body, 200)`` tuple whose body holds a "jobs" list
    (id, title, description of every record from ``JobsPersistence.items()``)
    and a parallel "links" list pointing at each job.

    POST validates the JSON request body with ``PostJobsSchema``. On
    validation errors a 400 response is returned; otherwise the record is
    stored with status "submitted" and an empty 201 response is returned
    carrying the ``Location`` and ``OpenEO-Identifier`` headers.
    """
    if flask.request.method == 'GET':
        # url_root already ends with a slash; strip it so that the generated
        # hrefs do not contain "//jobs/..."
        base_url = flask.request.url_root.rstrip('/')

        jobs = []
        links = []
        for record in JobsPersistence.items():
            title = record.get("title", None)
            jobs.append({
                "id": record["id"],
                "title": title,
                "description": record.get("description", None),
            })
            links.append({
                "href": "{}/jobs/{}".format(base_url, record["id"]),
                "title": title,
            })
        return {
            "jobs": jobs,
            "links": links,
        }, 200

    elif flask.request.method == 'POST':
        data = flask.request.get_json()

        errors = PostJobsSchema().validate(data)
        if errors:
            # Response procedure for validation will depend on how openeo_pg_parser_python will work
            return flask.make_response('Invalid request: {}'.format(errors), 400)

        data["current_status"] = "submitted"
        data["should_be_cancelled"] = False

        record_id = JobsPersistence.create(data)

        # add requested headers to 201 response:
        response = flask.make_response('', 201)
        response.headers['Location'] = '/jobs/{}'.format(record_id)
        response.headers['OpenEO-Identifier'] = record_id
        return response
def worker_proc(jobs_queue, worker_number):
    """Worker process loop: execute jobs taken from ``jobs_queue``.

    Blocks on the queue, runs each job's process graph and stores the outcome
    (results, or error message / error code / HTTP code) through
    ``JobsPersistence.update_running_to_finished``. Returns when
    ``SIGNAL_QUIT_JOB`` is received from the queue.

    Args:
        jobs_queue: Queue the jobs are read from. Each job is a dict with the
            keys "job_id", "process_graph" and "variables".
        worker_number: Identifier of this worker, used in log messages only.
    """
    # worker shouldn't be concerned with signals - parent orchestrates
    # everything and will tell us if we need to quit:
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    while True:
        job = jobs_queue.get(True, None)

        # Recognise the quit sentinel before anything indexes into `job`:
        # the sentinel need not be a dict with a "job_id" key, and logging
        # it as a regular job first could crash the worker instead of
        # stopping it.
        if job == SIGNAL_QUIT_JOB:
            return

        job_id = job["job_id"]
        logger.info("Worker {} received a job [{}]: {}".format(
            worker_number, job_id, job))

        # try to execute job's process graph:
        results = None
        error_msg = None
        error_code = None
        http_code = None
        try:
            results = _execute_process_graph(job["process_graph"], job_id,
                                             job["variables"])
            logger.info("Worker {} successfully finished job [{}]".format(
                worker_number, job_id))
        except Exception as ex:
            logger.exception("Worker {}, job [{}] exec failed: {}".format(
                worker_number, job_id, str(ex)))
            # exceptions may carry their own API error details; fall back to
            # a generic internal error otherwise
            error_msg = getattr(ex, "msg", str(ex))
            error_code = getattr(ex, "error_code", "Internal")
            http_code = getattr(ex, "http_code", 500)
        finally:
            # the outcome is written in `finally` so that it is attempted no
            # matter how the execution above ended
            logger.info("Worker {} writing results for job [{}]".format(
                worker_number, job_id))
            # write results:
            try:
                JobsPersistence.update_running_to_finished(
                    job_id, results, error_msg, error_code, http_code)
                logger.info("Job {} finished.".format(job_id))
            except Exception:
                # deliberately best-effort: keep the worker alive, but make
                # the failure visible in the logs
                logger.exception(
                    "Unknown error saving results, job will hang indefinitely! {}"
                    .format(job_id))
def main():
    """Start the worker processes and dispatch jobs to them until SIGINT.

    Ten daemon worker processes running ``worker_proc`` are started on a
    shared queue. The loop then waits for a wakeup from ``JobsPersistence``
    and, on each wakeup:

    * marks new queued jobs as running and puts them on the queue,
    * sets cancelled queued jobs back to submitted,
    * marks cancelled running jobs (only those dispatched by this process)
      as canceled.

    On ``KeyboardInterrupt`` every worker is sent ``SIGNAL_QUIT_JOB`` and
    joined.
    """

    def iter_items(pages):
        # lazily walk all records of a paged query result
        for page in pages:
            for item in page["Items"]:
                yield item

    # create workers:
    jobs_queue = multiprocessing.Queue()
    workers = []
    for worker_number in range(10):
        worker = multiprocessing.Process(
            target=worker_proc,
            args=(jobs_queue, worker_number),
        )
        worker.daemon = True
        worker.start()
        workers.append(worker)

    # listen for changes on DynamoDB jobs table and dispatch any new jobs to
    # the workers:
    running_jobs = set()
    try:
        while True:
            _feed_monitoring_system()

            # Because we couldn't find a way to get notifications about DynamoDB changes (via Streams)
            # without polling, we use SQS to be notified when new jobs surface. We still query DynamoDB
            # to get them though, even though we receive the job_ids:
            logger.info("Sleeping / waiting for wakeup:")
            if not JobsPersistence.wait_for_wakeup(timeout=20):
                logger.info("Continue sleeping...")
                continue
            logger.info("Woke up!")

            # GET queued AND should_be_cancelled = False
            for record in iter_items(JobsPersistence.query_new_queued()):
                job_id = record["id"]['S']
                logger.info("Found a job: {}".format(job_id))

                if not JobsPersistence.update_queued_to_running(job_id):
                    # someone was faster than us - we were not able to mark it as running,
                    # so we shouldn't execute it:
                    logger.info(
                        "Found a job, but could not update its status to running... ignoring it."
                    )
                    continue

                running_jobs.add(job_id)
                process_graph = json.loads(record["process_graph"]['S'])
                if "variables" in record:
                    variables = json.loads(record["variables"]['S'])
                else:
                    variables = {}
                jobs_queue.put({
                    'job_id': job_id,
                    'process_graph': process_graph,
                    'variables': variables,
                })

            # GET queued AND should_be_cancelled = True
            for record in iter_items(JobsPersistence.query_cancelled_queued()):
                # Set them back to submitted:
                JobsPersistence.update_cancelled_queued_to_submitted(
                    record["id"]['S'])

            # GET running AND should_be_cancelled = True
            for record in iter_items(JobsPersistence.query_cancelled_running()):
                job_id = record["id"]['S']
                if job_id in running_jobs:
                    JobsPersistence.update_cancelled_running_to_canceled(job_id)
                    # we don't actually kill the process (though that would be nice), we just mark that the
                    # job is no longer running, so the results will not be used:
                    running_jobs.remove(job_id)

    except KeyboardInterrupt:
        logger.info("SIGINT received, exiting.")

    # clean up and quit: one quit signal per worker, then wait for all
    for _ in workers:
        jobs_queue.put(SIGNAL_QUIT_JOB)
    for worker in workers:
        worker.join()
def add_job_to_queue(job_id):
    """Queue a job (POST), list its results (GET) or cancel it (DELETE).

    Args:
        job_id: Identifier of the job, looked up via
            ``JobsPersistence.get_by_id``.

    Returns:
        A Flask response:

        * 404 ``JobNotFound`` if no job with this id exists (any method);
        * POST: 202 after setting the status to "queued", or 400
          ``JobLocked`` if the job is currently queued or running;
        * GET: 400 ``JobNotFinished`` unless the job is finished or failed,
          424 with the job's error details if it failed, otherwise 200 with
          one presigned S3 link per result;
        * DELETE: 200 after requesting cancellation of a queued or running
          job, otherwise 400 ``JobNotStarted``.
    """
    job = JobsPersistence.get_by_id(job_id)
    if job is None:
        # without this guard an unknown id fails on job["current_status"]
        # below and surfaces as an internal server error
        return flask.make_response(
            jsonify(id=job_id,
                    code="JobNotFound",
                    message="The job does not exist.",
                    links=[]),
            404)

    status = job["current_status"]

    if flask.request.method == "POST":
        if status in ("submitted", "finished", "canceled", "error"):
            JobsPersistence.update_status(job_id, "queued")
            return flask.make_response(
                'The creation of the resource has been queued successfully.',
                202)
        return flask.make_response(
            jsonify(
                id=job_id,
                code="JobLocked",
                message='Job is locked due to a queued or running batch computation.',
                links=[]),
            400)

    elif flask.request.method == "GET":
        if status not in ("finished", "error"):
            return flask.make_response(
                jsonify(
                    id=job_id,
                    code='JobNotFinished',
                    message='Job has not finished computing the results yet. Please try again later.',
                    links=[]),
                400)

        if status == "error":
            return flask.make_response(
                jsonify(id=job_id,
                        code=job["error_code"],
                        message=job["error_msg"],
                        links=[]),
                424)

        s3 = boto3.client(
            's3',
            endpoint_url=S3_LOCAL_URL,
            region_name="eu-central-1",
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        )

        links = []
        for result in json.loads(job["results"]):
            # create signed url; results are stored under
            # "<job_id>/<file name>" in the bucket:
            object_key = '{}/{}'.format(job_id,
                                        os.path.basename(result["filename"]))
            url = s3.generate_presigned_url(ClientMethod='get_object',
                                            Params={
                                                'Bucket': RESULTS_S3_BUCKET_NAME,
                                                'Key': object_key,
                                            })
            links.append({
                'href': url,
                'type': result["type"],
            })

        return flask.make_response(
            jsonify(
                id=job_id,
                title=job.get("title", None),
                description=job.get("description", None),
                updated=job["last_updated"],  # "updated" is a reserved word in DynamoDB
                links=links,
            ),
            200)

    elif flask.request.method == "DELETE":
        if status in ("queued", "running"):
            JobsPersistence.set_should_be_cancelled(job_id)
            return flask.make_response(
                'Processing the job has been successfully canceled.', 200)
        return flask.make_response(
            jsonify(id=job_id,
                    code="JobNotStarted",
                    message="Job hasn't been started yet.",
                    links=[]),
            400)
def api_batch_job(job_id):
    """Show (GET), modify (PATCH) or delete (DELETE) a single batch job.

    Args:
        job_id: Identifier of the job, looked up via
            ``JobsPersistence.get_by_id``.

    Returns:
        A Flask response:

        * 404 ``JobNotFound`` if no job with this id exists (any method);
        * GET: 200 with the job's metadata, process graph and status;
        * PATCH: 400 ``JobLocked`` if the job is queued or running, 400 with
          the validation errors if the body does not pass
          ``PatchJobsSchema``, otherwise 204 after the given keys were
          updated and the status was reset to "submitted";
        * DELETE: 204 after the job was deleted. A queued or running job is
          asked to cancel first and is waited for, up to the polling budget
          derived from ``REQUEST_TIMEOUT``.
    """
    job = JobsPersistence.get_by_id(job_id)
    if job is None:
        return flask.make_response(
            jsonify(id=job_id,
                    code="JobNotFound",
                    message="The job does not exist.",
                    links=[]),
            404)

    active_statuses = ("queued", "running")

    if flask.request.method == 'GET':
        status = job["current_status"]
        return flask.make_response(
            jsonify(
                id=job_id,
                title=job.get("title", None),
                description=job.get("description", None),
                process_graph=json.loads(job["process_graph"]),
                status=status,  # "status" is reserved word in DynamoDB
                error=job["error_msg"] if status == "error" else None,
                submitted=job["submitted"],
                updated=job["last_updated"],
            ),
            200)

    elif flask.request.method == 'PATCH':
        if job["current_status"] in active_statuses:
            return flask.make_response(
                jsonify(
                    id=job_id,
                    code="JobLocked",
                    message='Job is locked due to a queued or running batch computation.',
                    links=[]),
                400)

        data = flask.request.get_json()
        errors = PatchJobsSchema().validate(data)
        if errors:
            # Response procedure for validation will depend on how openeo_pg_parser_python will work
            return flask.make_response(
                jsonify(id=job_id, code=400, message=errors, links=[]), 400)

        for key, value in data.items():
            JobsPersistence.update_key(job_id, key, value)
        JobsPersistence.update_status(job_id, "submitted")

        return flask.make_response('Changes to the job applied successfully.',
                                   204)

    elif flask.request.method == 'DELETE':
        # Only a queued or running job has anything to cancel. Waiting for a
        # status change on any other job would just block the request for
        # the whole timeout before deleting it anyway.
        if job["current_status"] in active_statuses:
            JobsPersistence.set_should_be_cancelled(job_id)

            # Wait until the job is no longer queued/running. The status it
            # ends up in is not necessarily "canceled" (it may e.g. finish
            # or be put back to "submitted"), so any non-active status ends
            # the wait.
            period = 0.5  # check every X seconds
            n_checks = int(REQUEST_TIMEOUT / period)
            for _ in range(n_checks):
                job = JobsPersistence.get_by_id(job_id)
                if job is None or job["current_status"] not in active_statuses:
                    break
                time.sleep(period)

        JobsPersistence.delete(job_id)
        return flask.make_response('The job has been successfully deleted.',
                                   204)
def teardown_function(function):
    """Clear the process graphs, jobs and services tables, in that order.

    The ``function`` argument is accepted but not used.
    """
    persistence_classes = (
        ProcessGraphsPersistence,
        JobsPersistence,
        ServicesPersistence,
    )
    for persistence in persistence_classes:
        persistence.clear_table()
def setup_function(function):
    """Make sure the persistence tables and the jobs queue exist.

    The ``function`` argument is accepted but not used.
    """
    # order matters only in that it mirrors the original sequence of calls
    preparations = (
        ProcessGraphsPersistence.ensure_table_exists,
        JobsPersistence.ensure_table_exists,
        JobsPersistence.ensure_queue_exists,
        ServicesPersistence.ensure_table_exists,
    )
    for prepare in preparations:
        prepare()