def __init__(self, config_loc, config_override_loc, emailer,
             num_processes=1, wait_timeout_sec=60):
    """
    Record the configuration locations and create the helpers, table
    connections and condition variable this instance works with.

    :param config_loc: path of config.yaml
    :type config_loc: string

    :param config_override_loc: path of config-env-dev.yaml
    :type config_override_loc: string

    :param emailer: object kept as ``self.emailer``; its interface is not
        visible from this constructor
    :param num_processes: number of worker processes to use for sqs request
    :type num_processes: int

    :param wait_timeout_sec: timeout intended for the condition variable's
        wait call (on a timeout wake-up the thread is expected to do
        maintenance work).  Values below 60 are raised to 60.
    :type wait_timeout_sec: int
    """
    # Values taken straight from the arguments.
    self._config_loc = config_loc
    self._config_override_loc = config_override_loc
    self._num_processes = num_processes
    self.emailer = emailer
    # 60 seconds is the floor for the wait timeout.
    self._wait_timeout_sec = max(wait_timeout_sec, 60)

    # Run-state flags, both initially off.
    self._stop_requested = False
    self._run_once = False

    # Configuration lookups, helpers and table connections.
    self.max_error_retries = staticconf.read_int('max_error_retries')
    self.etl_helper = ETLStatusHelper()
    self.jobs_db = TableConnection.get_connection('ScheduledJobs')
    self.runs_db = TableConnection.get_connection('ETLRecords')

    # Condition variable built on a plain (non-reentrant) lock.
    lock = threading.Lock()
    self._cond = threading.Condition(lock)
def et_scanner_main(args):
    """
    Set up configuration for 'ETScanner', build an ETScanner instance and
    call its ``run`` method.

    :param args: parsed arguments; handed to ``setup_config`` and its
        ``run_local`` attribute is passed to ``Mailer``
    """
    setup_config(args, 'ETScanner')

    # Queue names come from configuration.
    scanner_queue_name = read_string("sqs.et_scanner_queue_name")
    scanner_queue = SQSWrapper(scanner_queue_name)
    worker_queue_name = read_string("sqs.et_queue_name")
    worker_queue = SQSWrapper(worker_queue_name)

    jobs_table = TableConnection.get_connection('ScheduledJobs')
    mailer = Mailer(args.run_local)

    ETScanner(jobs_table, scanner_queue, worker_queue, mailer).run()
def post_job(scheduled_jobs_object, et_scanner_sqs, request_body_str):
    """
    Validate a job request, store it through ``scheduled_jobs_object`` and,
    when the store succeeds, write a message to the scanner queue.

    The request body is a JSON dictionary whose entries end up as keyword
    arguments of ``scheduled_jobs_object.put``.  Documented required keys
    (checked by ``_check_required_args``):

    * redshift_id
    * log_name
    * log_schema_version
    * start_date
    * s3_path
    * contact_emails

    :param scheduled_jobs_object: the ScheduledJobs to which we post jobs
    :type scheduled_jobs_object: an instance of ScheduledJobs

    :param et_scanner_sqs: the scanner sqs to send message to
    :type et_scanner_sqs: SQSWrapper object

    :param request_body_str: a string version of the request body dict
    :type request_body_str: string

    :returns: ``{"post_accepted": {"result": <put result>, "uuid": <uuid>}}``
    :rtype: dict

    :raises ValueError: if a date is not in ``YYYY-MM-DD`` form or the
        start date is later than the end date
    """
    job = simplejson.loads(request_body_str)

    # Contact e-mails, when supplied, are stored as a set.
    emails = job["contact_emails"] if "contact_emails" in job else None
    if emails is not None:
        job["contact_emails"] = set(emails)

    _check_required_args(job, "post")

    date_format = "%Y-%m-%d"
    start = datetime.datetime.strptime(job["start_date"], date_format)
    # A missing end_date is recorded explicitly as None.
    if "end_date" not in job:
        job["end_date"] = None
    if job["end_date"] is not None:
        end = datetime.datetime.strptime(job["end_date"], date_format)
        if start > end:
            raise ValueError("start date should not be greater than end date")

    # Absent or empty log_format falls back to json.
    job["log_format"] = job.get("log_format") or "json"
    job["additional_arguments"] = _validate_additional_args(job)

    # check that redshift cluster exists, throws ItemNotFound
    clusters_table = TableConnection.get_connection("RedshiftClusters")
    list_cluster_by_name(clusters_table, job["redshift_id"])

    job["hash_key"] = _create_hash_key(job)
    job["uuid"] = _get_uuid(scheduled_jobs_object)
    job["et_status"] = NULL

    stored = scheduled_jobs_object.put(**job)
    if stored:
        # TODO: use meaningful message instead of dummy
        et_scanner_sqs.write_message_to_queue({"message": "dummy"})

    outcome = {"result": stored, "uuid": job["uuid"]}
    return {"post_accepted": outcome}
def runs_filtered(request):
    r"""
    Handle requests on the runs endpoint that carry a job_id; the runs
    looked up for that job_id are returned.

    **GET /v1/runs/**\ *{string: job_id}*

    **Query Parameters:**

    * **job_id** - the job_id of the runs we wish to review

    Example: ``/v1/runs/234332104332``

    *Example Response* ::

        [
            {
                'job_id': <string>,
                'etl_status': <string>,
                'last_updated': timestamp,
                'data_date': 'YYYY-mm-dd',
                'schema_checksum': <string>,
                's3_path': <string>,
                'et_starttime': timestamp,
                'et_runtime': int,
                'load_starttime': timestamp,
                'load_runtime': int,
                'redshift_id': <string>,
                'db_schema': <string>,
                'run_by': <string>,
                'additional_arguments': <string>
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      Invalid job_id (a ValueError from the lookup)
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    job_id = request.matchdict.get('job_id')
    try:
        runs = list_runs_by_job_id(
            job_id, TableConnection.get_connection('ETLRecords'))
    except ValueError as e:
        return 404, {'error': repr(e)}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
    else:
        return 200, runs
def jobs_update_job(request):
    """
    Handle job update requests on the jobs endpoint by passing the request
    body to ``put_job``.

    **PUT /v1/jobs/job/**

    Example: ``v1/jobs/job/``

    **Query Parameters:**

    * **request.body** -- the json string of job details

    *Example request.body* ::

        "{
            'log_name': 'ad_click',
            'log_schema_version': 'initial',
            'start_date': '2014-04-01',
            'end_date': '',
            'redshift_id': 'rs1',
            'cancel_requested': True,
        }"

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **400**      bad hash_key: redshift_id, log_name,
                 log_schema_version and start_date must all be present;
                 also returned when the body is not valid json
    **404**      invalid job parameters
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    try:
        jobs_table = TableConnection.get_connection('ScheduledJobs')
        scanner_queue = get_scanner_queue('et')
        result = put_job(jobs_table, scanner_queue, request.body)
    except PrimaryKeyError:
        return 400, {'error': 'bad hash_key'}
    except JSONDecodeError:
        # Listed ahead of ValueError so malformed json gets its own message.
        return 400, {'error': 'json decode error'}
    except ValueError as e:
        return 404, {'error': repr(e)}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
    else:
        return 200, result
def cluster_by_name(request):
    r"""
    Look up the cluster named in the route and return whatever
    ``list_cluster_by_name`` produces for it.  The documented response is
    a dictionary with key "clusters" holding a one-entry list (shape not
    verifiable from this handler).

    **GET /v1/clusters/**\ *{string: cluster_name}*

    **Query Parameters:**

    * **cluster_name** - the name of the cluster we want to see

    Example: ``/v1/clusters/cluster-1``

    *Example Response* ::

        {
            'clusters': [
                {
                    'redshift_id': 'cluster-1',
                    'port': 5439,
                    'host': 'cluster-1.account.region.redshift.amazonaws.com',
                    'db_schema': 'public',
                    'groups': ['search_infra', 'biz']
                }
            ]
        }

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      invalid cluster_name
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    cluster_name = request.matchdict.get('cluster_name')
    try:
        clusters_table = TableConnection.get_connection('RedshiftClusters')
        found = list_cluster_by_name(clusters_table, cluster_name)
    except ValueError as e:
        return 404, {'error': repr(e)}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
    else:
        return 200, found
def jobs_update_job(request):
    """
    Handle job update requests on the jobs endpoint by passing the request
    body to ``put_job``.

    **PUT /v1/jobs/job/**

    Example: ``v1/jobs/job/``

    **Query Parameters:**

    * **request.body** -- the json string of job details

    *Example request.body* ::

        "{
            'log_name': 'ad_click',
            'log_schema_version': 'initial',
            'start_date': '2014-04-01',
            'end_date': '',
            'redshift_id': 'rs1',
            'cancel_requested': True,
        }"

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **400**      bad hash_key: redshift_id, log_name,
                 log_schema_version and start_date must all be present;
                 also returned when the body is not valid json
    **404**      invalid job parameters
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    try:
        jobs_table = TableConnection.get_connection("ScheduledJobs")
        scanner_queue = get_scanner_queue("et")
        result = put_job(jobs_table, scanner_queue, request.body)
    except PrimaryKeyError:
        return 400, {"error": "bad hash_key"}
    except JSONDecodeError:
        # Listed ahead of ValueError so malformed json gets its own message.
        return 400, {"error": "json decode error"}
    except ValueError as e:
        return 404, {"error": repr(e)}
    except Exception as unknown_exception:
        return 500, {"error": repr(unknown_exception)}
    else:
        return 200, result
def _get_redshift_cluster_details(self, rs_id):
    """
    Get the host, port and schema for a particular redshift id

    :param rs_id: redshift id of a cluster e.g., cluster-1
    :type rs_id: string

    :returns: a 3 tuple ``(host, port, db_schema)`` read from the cluster
        record
    :rtype: tuple

    :raises ValueError: if the lookup yields an empty/falsy result
    """
    clusters_table = TableConnection.get_connection('RedshiftClusters')
    cluster = list_cluster_by_name(clusters_table, rs_id)
    if not cluster:
        raise ValueError("No cluster named: {0}".format(rs_id))

    port, host, schema = (
        cluster['port'], cluster['host'], cluster['db_schema'])
    return host, port, schema
def clusters(request):
    """
    Handle GET and POST requests on the clusters endpoint.  Any other
    request method falls through without a return value (``None``).

    **GET /v1/clusters/**

    Example: ``/v1/clusters/``

    *Example Response* ::

        [
            {
                'redshift_id': 'cluster-1',
                'port': 5439,
                'host': 'cluster-1.account.region.redshift.amazonaws.com',
                'db_schema': 'public',
                'groups': ['search_infra', 'biz']
            },
            {
                'redshift_id': 'cluster-1-user',
                'port': 5439,
                'host': 'cluster-1-user.account.region.redshift.amazonaws.com',
                'db_schema': 'public',
                'groups': ['search_infra', 'log_infra']
            },
            {
                'redshift_id': 'cluster-2',
                'port': 5439,
                'host': 'cluster-2.account.region.redshift.amazonaws.com',
                'db_schema': 'public',
                'groups': ['mobile', 'log_infra']
            },
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*

    **POST /v1/clusters/**

    Example: ``/v1/clusters``

    **Query Parameters:**

    * **request.body** -- the json string of cluster details

    *Example request.body* ::

        "{
            'redshift_id': 'cluster-2',
            'port': 5439,
            'host': 'cluster-2.account.region.redshift.amazonaws.com'
        }"

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **400**      bad hash_key or missing required arguments
    **404**      invalid cluster parameters, or a possible duplicate
                 cluster
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    try:
        if request.method == "POST":
            clusters_table = TableConnection.get_connection(
                'RedshiftClusters')
            return 200, post_cluster(clusters_table, request.body)
        if request.method == "GET":
            clusters_table = TableConnection.get_connection(
                'RedshiftClusters')
            return 200, list_all_clusters(clusters_table)
    except PrimaryKeyError:
        return 400, {'error': 'bad hash_key or missing required arguments'}
    except ValueError as e:
        message = repr(e)
        # A failed conditional write is reported with a friendlier message.
        if "ConditionalCheckFailedException" in message:
            message = "ConditionalCheckFailed; possible duplicate cluster"
        return 404, {'error': message}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
def jobs_filtered(request):
    r"""
    Handle requests on the jobs endpoint that carry a log_name and an
    optional version.  Without a version, jobs are looked up by log_name
    alone; with one, by the log_name and version combination.

    **GET /v1/jobs/**\ *{string: log_name}*

    **Query Parameters:**

    * **log_name** - the name of the log for which we want to see jobs

    Example: ``/v1/jobs/ad_click``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'contact_emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"load_step": ["--force-load"]}'
            },
            {'log_name': 'ad_click',
             'log_schema_version': 'minimal',
             's3_log_uri': 'http://ad_min/schema.yaml?Signature=b?Expires=b?AccessKeyId=yyy',
             'start_date': '2014-05-01',
             'end_date': '2014-05-07',
             'contact_emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"load_step": ["--force-load"]}'
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      invalid log_name
    **500**      unknown exception
    ============ ===========

    **GET /v1/jobs/**\ *{string: log_name}/{string: log_schema_version}*

    **Query Parameters:**

    * **log_name** - the name of the log for which we want to see jobs
    * **log_schema_version** - the version of the log for which we want
      to see jobs

    Example: ``/v1/jobs/ad_click/initial``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      invalid log_name or log_version
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    route = request.matchdict
    log_name = route.get('log_name')
    log_version = route.get('log_schema_version')
    try:
        jobs_table = TableConnection.get_connection('ScheduledJobs')
        if log_version is None:
            matches = list_jobs_by_name(log_name, jobs_table)
        else:
            matches = list_jobs_by_name_version(
                log_name, log_version, jobs_table)
        return 200, matches
    except ValueError as e:
        return 404, {'error': repr(e)}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
def __init__(self):
    """
    Open the 'ETLRecords' table connection and build this worker's id
    from the host name and the process id (``<hostname>:<pid>``).
    """
    self.etl_db = TableConnection.get_connection('ETLRecords')

    hostname = socket.gethostname()
    pid = os.getpid()
    self.worker_id = '{0}:{1}'.format(hostname, pid)
def _table_conn(self): if not hasattr(self, '_cached_table_conn'): self._configure_mycroft() self._cached_table_conn = TableConnection.get_connection( 'RedshiftClusters') return self._cached_table_conn
def _table_conn(self): if not hasattr(self, '_cached_table_conn'): self._configure_mycroft() self._cached_table_conn = TableConnection.get_connection('RedshiftClusters') return self._cached_table_conn
def jobs_filtered(request):
    r"""
    Handle requests on the jobs endpoint that carry a log_name and an
    optional version.  Without a version, jobs are looked up by log_name
    alone; with one, by the log_name and version combination.

    **GET /v1/jobs/**\ *{string: log_name}*

    **Query Parameters:**

    * **log_name** - the name of the log for which we want to see jobs

    Example: ``/v1/jobs/ad_click``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'contact_emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"load_step": ["--force-load"]}'
            },
            {'log_name': 'ad_click',
             'log_schema_version': 'minimal',
             's3_log_uri': 'http://ad_min/schema.yaml?Signature=b?Expires=b?AccessKeyId=yyy',
             'start_date': '2014-05-01',
             'end_date': '2014-05-07',
             'contact_emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"load_step": ["--force-load"]}'
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      invalid log_name
    **500**      unknown exception
    ============ ===========

    **GET /v1/jobs/**\ *{string: log_name}/{string: log_schema_version}*

    **Query Parameters:**

    * **log_name** - the name of the log for which we want to see jobs
    * **log_schema_version** - the version of the log for which we want
      to see jobs

    Example: ``/v1/jobs/ad_click/initial``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      invalid log_name or log_version
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    route = request.matchdict
    log_name = route.get("log_name")
    log_version = route.get("log_schema_version")
    try:
        jobs_table = TableConnection.get_connection("ScheduledJobs")
        if log_version is None:
            matches = list_jobs_by_name(log_name, jobs_table)
        else:
            matches = list_jobs_by_name_version(log_name, log_version, jobs_table)
        return 200, matches
    except ValueError as e:
        return 404, {"error": repr(e)}
    except Exception as unknown_exception:
        return 500, {"error": repr(unknown_exception)}
def jobs(request):
    """
    Handle GET and POST requests on the jobs endpoint: GET lists all
    scheduled jobs, POST submits a new job.  Any other request method
    falls through without a return value (``None``).

    **GET /v1/jobs/**

    Example: ``/v1/jobs/``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'contact_emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            },
            {'log_name': 'ad_click',
             'log_schema_version': 'minimal',
             's3_log_uri': 'http://ad_min/schema.yaml?Signature=b?Expires=b?AccessKeyId=yyy',
             'start_date': '2014-05-01',
             'end_date': '2014-05-07',
             'contact_emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            },
            {'log_name': 'bing_geocoder',
             'log_schema_version': 'bing2',
             's3_log_uri': 'http://bing/schema.yaml?Signature=b?Expires=a?AccessKeyId=zzz',
             'start_date': '2014-05-02',
             'end_date': '2014-06-07',
             'contact_emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*

    **POST /v1/jobs/**

    Example: ``v1/jobs``

    **Query Parameters:**

    * **request.body** -- the json string of job details

    *Example request.body* ::

        "{
            'log_name': 'ad_click',
            'log_schema_version': 'initial',
            's3_log_uri': 'llll',
            'start_date': '2014-04-01',
            'end_date': '',
            'contact_emails': ['user1@example.com', 'user2@example.com'],
            'redshift_id': 'rs1',
            'additional_arguments': '{"load_step": ["--force-load"]}'
        }"

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **400**      bad hash_key: redshift_id, log_name,
                 log_schema_version and start_date must all be present
    **404**      invalid job parameters, or a possible duplicate job
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    try:
        if request.method == "POST":
            return 200, post_job(
                TableConnection.get_connection('ScheduledJobs'),
                get_scanner_queue('et'),
                request.body)
        elif request.method == "GET":
            return 200, list_all_jobs(
                TableConnection.get_connection('ScheduledJobs'))
    except PrimaryKeyError:
        return 400, {'error': 'bad hash_key'}
    except ValueError as e:
        if "ConditionalCheckFailedException" in repr(e):
            # Adjacent literals are joined at compile time; a backslash
            # continuation inside the literal would leak stray characters
            # into the message sent to the client.
            return 404, {
                'error': "ConditionalCheckFailed; possible duplicate job. "
                         "Delete existing job first"
            }
        return 404, {'error': repr(e)}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
def jobs(request):
    """
    Handle GET and POST requests on the jobs endpoint: GET lists all
    scheduled jobs, POST submits a new job.  Any other request method
    falls through without a return value (``None``).

    **GET /v1/jobs/**

    Example: ``/v1/jobs/``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'contact_emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            },
            {'log_name': 'ad_click',
             'log_schema_version': 'minimal',
             's3_log_uri': 'http://ad_min/schema.yaml?Signature=b?Expires=b?AccessKeyId=yyy',
             'start_date': '2014-05-01',
             'end_date': '2014-05-07',
             'contact_emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            },
            {'log_name': 'bing_geocoder',
             'log_schema_version': 'bing2',
             's3_log_uri': 'http://bing/schema.yaml?Signature=b?Expires=a?AccessKeyId=zzz',
             'start_date': '2014-05-02',
             'end_date': '2014-06-07',
             'contact_emails': ['user1@example.com', 'user2@example.com'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*

    **POST /v1/jobs/**

    Example: ``v1/jobs``

    **Query Parameters:**

    * **request.body** -- the json string of job details

    *Example request.body* ::

        "{
            'log_name': 'ad_click',
            'log_schema_version': 'initial',
            's3_log_uri': 'llll',
            'start_date': '2014-04-01',
            'end_date': '',
            'contact_emails': ['user1@example.com', 'user2@example.com'],
            'redshift_id': 'rs1',
            'additional_arguments': '{"load_step": ["--force-load"]}'
        }"

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **400**      bad hash_key: redshift_id, log_name,
                 log_schema_version and start_date must all be present
    **404**      invalid job parameters, or a possible duplicate job
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    try:
        if request.method == "POST":
            return 200, post_job(
                TableConnection.get_connection("ScheduledJobs"),
                get_scanner_queue("et"),
                request.body,
            )
        elif request.method == "GET":
            return 200, list_all_jobs(TableConnection.get_connection("ScheduledJobs"))
    except PrimaryKeyError:
        return 400, {"error": "bad hash_key"}
    except ValueError as e:
        if "ConditionalCheckFailedException" in repr(e):
            # Adjacent literals are joined at compile time; a backslash
            # continuation inside the literal would leak stray characters
            # into the message sent to the client.
            return (
                404,
                {
                    "error": "ConditionalCheckFailed; possible duplicate job. "
                    "Delete existing job first"
                },
            )
        return 404, {"error": repr(e)}
    except Exception as unknown_exception:
        return 500, {"error": repr(unknown_exception)}
def post_job(scheduled_jobs_object, et_scanner_sqs, request_body_str):
    """
    Validate a job request, store it through ``scheduled_jobs_object`` and,
    when the store succeeds, write a message to the scanner queue.

    The request body is a JSON dictionary whose entries end up as keyword
    arguments of ``scheduled_jobs_object.put``.  Documented required keys
    (checked by ``_check_required_args``):

    * redshift_id
    * log_name
    * log_schema_version
    * start_date
    * s3_path
    * contact_emails

    :param scheduled_jobs_object: the ScheduledJobs to which we post jobs
    :type scheduled_jobs_object: an instance of ScheduledJobs

    :param et_scanner_sqs: the scanner sqs to send message to
    :type et_scanner_sqs: SQSWrapper object

    :param request_body_str: a string version of the request body dict
    :type request_body_str: string

    :returns: ``{'post_accepted': {'result': <put result>, 'uuid': <uuid>}}``
    :rtype: dict

    :raises ValueError: if a date is not in ``YYYY-MM-DD`` form or the
        start date is later than the end date
    """
    job = simplejson.loads(request_body_str)

    # Contact e-mails, when supplied, are stored as a set.
    emails = job['contact_emails'] if 'contact_emails' in job else None
    if emails is not None:
        job['contact_emails'] = set(emails)

    _check_required_args(job, 'post')

    date_format = "%Y-%m-%d"
    start = datetime.datetime.strptime(job['start_date'], date_format)
    # A missing end_date is recorded explicitly as None.
    if 'end_date' not in job:
        job['end_date'] = None
    if job['end_date'] is not None:
        end = datetime.datetime.strptime(job['end_date'], date_format)
        if start > end:
            raise ValueError("start date should not be greater than end date")

    # Absent or empty log_format falls back to json.
    job['log_format'] = job.get('log_format') or 'json'
    job['additional_arguments'] = _validate_additional_args(job)

    # check that redshift cluster exists, throws ItemNotFound
    clusters_table = TableConnection.get_connection('RedshiftClusters')
    list_cluster_by_name(clusters_table, job['redshift_id'])

    job['hash_key'] = _create_hash_key(job)
    job['uuid'] = _get_uuid(scheduled_jobs_object)
    job['et_status'] = NULL

    stored = scheduled_jobs_object.put(**job)
    if stored:
        # TODO: use meaningful message instead of dummy
        et_scanner_sqs.write_message_to_queue({'message': 'dummy'})

    outcome = {'result': stored, 'uuid': job['uuid']}
    return {'post_accepted': outcome}