def data_check(s3_engine, datastore, action):
    """
    Succeed (return None) if at least one S3 object under the configured
    prefix passes all of the action's filters; raise otherwise.

    :type s3_engine: dart.engine.s3.s3.S3Engine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    :raises Exception: when no object satisfies every filter
    """
    # record that the check has started
    action = s3_engine.dart.patch_action(action, progress=.1)
    args = action.data.args
    # BUG FIX: date_offset_in_seconds is optional, so .get() could return
    # None; timedelta(seconds=None) below then raised TypeError whenever
    # s3_file_last_modified was set.  Default the offset to 0 (no shift).
    offset = args.get('date_offset_in_seconds') or 0
    now = datetime.utcnow()
    s3_path_prefix = substitute_date_tokens(args['s3_path_prefix'], now, offset)
    bucket_name = get_bucket_name(s3_path_prefix)
    prefix = get_key_name(s3_path_prefix)
    last_modified = args.get('s3_file_last_modified')
    # paginate so prefixes with more than 1000 keys are fully scanned
    s3_paginator = boto3.client('s3').get_paginator('list_objects')
    for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # 'Contents' is absent when a page has no keys
        for element in (page.get('Contents') or []):
            path = 's3://' + bucket_name + '/' + element['Key']
            s3_path_regex = args.get('s3_path_regex')
            if s3_path_regex and not re.match(substitute_date_tokens(s3_path_regex, now, offset), path):
                continue
            if args.get('min_file_size_in_bytes') and element['Size'] < args['min_file_size_in_bytes']:
                continue
            # NOTE(review): this flag gates a comparison against now+offset
            # rather than a user-supplied timestamp, and boto3's LastModified
            # is timezone-aware while utcnow() is naive -- confirm the
            # intended cutoff semantics with the action's author.
            if last_modified and element['LastModified'] < now + timedelta(seconds=offset):
                continue
            # this key passed every filter: the data check succeeds
            return
    raise Exception('Data check failed')
def data_check(s3_engine, datastore, action):
    """
    Pass the check (return None) as soon as any S3 object under the
    configured prefix matches the action's filters; raise if none does.

    :type s3_engine: dart.engine.s3.s3.S3Engine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    :raises Exception: when no matching object is found
    """
    action = s3_engine.dart.patch_action(action, progress=.1)
    args = action.data.args

    # reference time, optionally shifted by the configured offset
    reference_time = datetime.utcnow()
    if args.get('date_offset_in_seconds'):
        reference_time = reference_time + timedelta(seconds=args['date_offset_in_seconds'])

    resolved_prefix = substitute_date_tokens(args['s3_path_prefix'], reference_time)
    bucket = get_bucket_name(resolved_prefix)
    key_prefix = get_key_name(resolved_prefix)

    # paginate so prefixes with more than 1000 keys are fully scanned
    paginator = boto3.client('s3').get_paginator('list_objects')
    for listing in paginator.paginate(Bucket=bucket, Prefix=key_prefix):
        # 'Contents' is absent on empty pages
        for obj in listing.get('Contents') or []:
            full_path = 's3://' + bucket + '/' + obj['Key']
            pattern = args.get('s3_path_regex')
            if pattern and not re.match(substitute_date_tokens(pattern, reference_time), full_path):
                continue
            min_size = args.get('min_file_size_in_bytes')
            if min_size and obj['Size'] < min_size:
                continue
            # found an object that satisfies every filter
            return
    raise Exception('Data check failed')
def _upload_s3_copy_manifests_and_create_tracking_sql_files(
        action, dataset, datastore, batch_size, s3_path_and_updated_generator):
    """
    Batch (s3_path, updated) pairs into Redshift COPY manifests uploaded to
    S3, and write one local tracking-INSERT SQL file per batch.

    Returns (manifests, tracking_sql_files): the S3 manifest paths and the
    local temp-file names (created with delete=False so callers can read
    them after this function returns).

    :type action: dart.model.action.Action
    :type dataset: dart.model.dataset.Dataset
    :type datastore: dart.model.datastore.Datastore
    """
    s3_path_and_updated_iterator = iter(s3_path_and_updated_generator)
    if dataset.data.load_type == LoadType.RELOAD_LAST:
        # RELOAD_LAST: drain the generator and keep only its final element
        last = None
        for last in s3_path_and_updated_iterator:
            pass
        s3_path_and_updated_iterator = iter([last] if last else [])
    manifests = []
    tracking_sql_files = []
    current_part = 1
    while True:
        # islice pulls at most batch_size entries; an empty batch ends the loop
        batch = list(islice(s3_path_and_updated_iterator, batch_size))
        if not batch:
            break
        with tempfile.NamedTemporaryFile() as f:
            values = (datastore.data.s3_artifacts_path, action.id, current_part)
            s3_manifest_path = '%s/load-manifests/load-manifest-for-action-%s-part-%s.json' % values
            manifests.append(s3_manifest_path)
            # http://docs.aws.amazon.com/redshift/latest/dg/loading-data-files-using-manifest.html
            data = {'entries': [{'mandatory': True, 'url': e[0]} for e in batch]}
            json.dump(data, f)
            # now rewind to the beginning of the file so it can be read
            f.seek(0)
            bucket_name = get_bucket_name(datastore.data.s3_artifacts_path)
            key_name = get_key_name(s3_manifest_path)
            boto3.client('s3').upload_file(f.name, bucket_name, key_name)
        # delete=False: the file must outlive this block so the caller can use it
        with tempfile.NamedTemporaryFile(delete=False) as f:
            tracking_sql_files.append(f.name)
            schema_name, table_name = get_tracking_schema_and_table_name(action)
            # NOTE(review): s3 paths are interpolated directly into the SQL;
            # presumably safe because paths come from a trusted listing, but
            # a quote in a key name would break the statement -- confirm.
            sql = 'INSERT INTO %s.%s (s3_path, updated) VALUES \n' % (schema_name, table_name)
            sql += ',\n'.join(["('%s', %s)" % (e[0], "'%s'" % e[1].isoformat() if e[1] else 'NULL') for e in batch])
            f.write(sql)
        current_part += 1
    return manifests, tracking_sql_files
def _get_raw_config_data(config_path):
    """Return the raw contents of the config at *config_path*.

    Supports s3:// URLs (fetched via boto3) as well as local paths;
    relative local paths are resolved against the project root.
    """
    if config_path.startswith('s3://'):
        s3_object = boto3.client('s3').get_object(
            Bucket=get_bucket_name(config_path),
            Key=get_key_name(config_path))
        return s3_object["Body"].read()
    # absolute paths are used verbatim; everything else is project-root relative
    if config_path.startswith('/'):
        local_path = config_path
    else:
        local_path = dart_root_relative_path(config_path)
    with open(local_path) as f:
        return f.read()
def _upload_s3_json_manifest(action, dataset, datastore):
    """Build a Redshift jsonpaths manifest for the dataset's columns,
    upload it to the datastore's artifacts area on S3, and return its
    S3 path.
    """
    # http://docs.aws.amazon.com/redshift/latest/dg/copy-usage_notes-copy-from-json.html
    jsonpaths_doc = {'jsonpaths': ['$.%s' % c.path for c in dataset.data.columns]}
    manifest_s3_path = '%s/json-manifests/json-manifest-for-action-%s.json' % (
        datastore.data.s3_artifacts_path, action.id)
    with tempfile.NamedTemporaryFile() as manifest_file:
        json.dump(jsonpaths_doc, manifest_file)
        # rewind so upload_file reads from the start of the file
        manifest_file.seek(0)
        boto3.client('s3').upload_file(
            manifest_file.name,
            get_bucket_name(datastore.data.s3_artifacts_path),
            get_key_name(manifest_s3_path))
    return manifest_s3_path
def create_nudge_subscription(subscription, dataset):
    """
    Register a subscription with the nudge service and return its id.

    :type subscription: dart.model.subscription.Subscription
    :type dataset: dart.model.dataset.Dataset
    :rtype str
    """
    host_url = current_app.dart_context.config.get('nudge').get('host_url')
    # prefer the subscription's explicit start prefix, falling back to the
    # dataset's own location
    if subscription.data.s3_path_start_prefix_inclusive:
        path = subscription.data.s3_path_start_prefix_inclusive
    else:
        path = dataset.data.location
    payload = {
        'Bucket': get_bucket_name(path),
        'Prefix': get_key_name(path),
        'Regex': subscription.data.s3_path_regex_filter,
        'Backfill': True
    }
    response = requests.post(url='{host_url}/Subscribe'.format(host_url=host_url), json=payload)
    return response.json()['SubscriptionId']
def _upload_s3_copy_manifests_and_create_tracking_sql_files(action, dataset, datastore, batch_size, s3_path_and_updated_generator):
    """
    Batch (s3_path, updated) pairs into Redshift COPY manifests uploaded to
    S3, and write one local tracking-INSERT SQL file per batch.

    Returns (manifest s3 paths, local tracking SQL file names); the SQL
    temp files are created with delete=False so the caller can read them.

    :type action: dart.model.action.Action
    :type dataset: dart.model.dataset.Dataset
    :type datastore: dart.model.datastore.Datastore
    """
    entries_iter = iter(s3_path_and_updated_generator)
    if dataset.data.load_type == LoadType.RELOAD_LAST:
        # RELOAD_LAST: exhaust the generator, keeping only its final entry
        final_entry = None
        for final_entry in entries_iter:
            pass
        entries_iter = iter([final_entry] if final_entry else [])
    manifest_paths = []
    sql_file_names = []
    part_number = 1
    while True:
        # pull the next batch; an empty one means the iterator is drained
        chunk = list(islice(entries_iter, batch_size))
        if not chunk:
            break
        with tempfile.NamedTemporaryFile() as manifest_file:
            manifest_s3_path = '%s/load-manifests/load-manifest-for-action-%s-part-%s.json' % (
                datastore.data.s3_artifacts_path, action.id, part_number)
            manifest_paths.append(manifest_s3_path)
            # http://docs.aws.amazon.com/redshift/latest/dg/loading-data-files-using-manifest.html
            manifest_body = {'entries': [{'mandatory': True, 'url': entry[0]} for entry in chunk]}
            json.dump(manifest_body, manifest_file)
            # rewind so upload_file reads from the start of the file
            manifest_file.seek(0)
            boto3.client('s3').upload_file(
                manifest_file.name,
                get_bucket_name(datastore.data.s3_artifacts_path),
                get_key_name(manifest_s3_path))
        # delete=False so the file survives for the caller to consume
        with tempfile.NamedTemporaryFile(delete=False) as sql_file:
            sql_file_names.append(sql_file.name)
            schema_name, table_name = get_tracking_schema_and_table_name(action)
            value_rows = []
            for entry in chunk:
                updated_literal = "'%s'" % entry[1].isoformat() if entry[1] else 'NULL'
                value_rows.append("('%s', %s)" % (entry[0], updated_literal))
            statement = 'INSERT INTO %s.%s (s3_path, updated) VALUES \n' % (schema_name, table_name)
            statement += ',\n'.join(value_rows)
            sql_file.write(statement)
        part_number += 1
    return manifest_paths, sql_file_names
def __init__(self, kms_key_arn, secrets_s3_path):
    """Remember the KMS key and derive bucket/prefix from the secrets path.

    Any trailing slash on *secrets_s3_path* is stripped before the bucket
    name and key prefix are extracted.
    """
    normalized_path = secrets_s3_path.rstrip('/')
    self._kms_key_arn = kms_key_arn
    self._secrets_s3_path = normalized_path
    self._bucket_name = get_bucket_name(normalized_path)
    self._s3_prefix = get_key_name(normalized_path)
def extract_bucket_key(s3_path):
    """Split *s3_path* into its (bucket name, key name) pair."""
    bucket = get_bucket_name(s3_path)
    key = get_key_name(s3_path)
    return bucket, key
def create_partial(self, output_config):
    """
    Provision a partial dart environment: create the CloudFormation stacks
    in dependency order, thread their outputs back into *output_config*,
    store secrets, upload the final config to S3, initialize the database,
    and register the engines.

    Mutates *output_config* in place and performs extensive AWS / HTTP /
    database side effects; blocks until each stack completes.
    """
    # --- wire the trigger queue into the events stack parameters ---
    _logger.info('updating configuration with trigger queue urls/arns')
    trigger_queue_arn, trigger_queue_url = self._ensure_queue_exists(output_config, 'trigger_queue')
    events_params = output_config['cloudformation_stacks']['events']['boto_args']['Parameters']
    _get_element(events_params, 'ParameterKey', 'TriggerQueueUrl')['ParameterValue'] = trigger_queue_url
    _get_element(events_params, 'ParameterKey', 'TriggerQueueArn')['ParameterValue'] = trigger_queue_arn

    # --- create the foundation stacks, then wait for all of them ---
    _logger.info('creating initial stacks')
    events_stack_name = self._create_stack('events', self.mode, output_config)
    rds_stack_name = self._create_stack('rds', self.mode, output_config)
    elb_stack_name = self._create_stack('elb', self.mode, output_config)
    elb_int_stack_name = self._create_stack('elb-internal', self.mode, output_config)
    engine_taskrunner_stack_name = self._create_stack('engine-taskrunner', self.mode, output_config)
    _logger.info('waiting for stack completion')
    # each call expects exactly 1 output from its stack
    events_outputs = self._wait_for_stack_completion_and_get_outputs(events_stack_name, 1)
    rds_outputs = self._wait_for_stack_completion_and_get_outputs(rds_stack_name, 1)
    elb_outputs = self._wait_for_stack_completion_and_get_outputs(elb_stack_name, 1)
    elb_int_outputs = self._wait_for_stack_completion_and_get_outputs(elb_int_stack_name, 1)
    engine_taskrunner_outputs = self._wait_for_stack_completion_and_get_outputs(engine_taskrunner_stack_name, 1)

    # --- fold stack outputs back into the configuration ---
    _logger.info('updating configuration with new cloudwatch scheduled events sns topic name')
    sns_arn = events_outputs[0]['OutputValue']
    output_config['triggers']['scheduled']['cloudwatch_scheduled_events_sns_arn'] = sns_arn

    _logger.info('updating configuration with new rds endpoint and password')
    db_uri_secret_key = 'database-uri-%s' % self.environment_name
    # the real URI lives in the secrets store; the config holds a !decrypt ref
    output_config['flask']['SQLALCHEMY_DATABASE_URI'] = '!decrypt %s' % db_uri_secret_key
    secrets_config = get_secrets_config(output_config)
    secrets_service = Secrets(secrets_config['kms_key_arn'], secrets_config['secrets_s3_path'])
    rds_pwd = os.environ['DART_RDS_PASSWORD']
    rds_host = rds_outputs[0]['OutputValue']
    # NOTE(review): the '*****:*****' in this format string looks like a
    # redaction of credential placeholders (e.g. '%s:%s'); as written the
    # template has one placeholder for two values -- confirm against the
    # unredacted source.
    secrets_service.put(db_uri_secret_key, 'postgresql://*****:*****@%s:5432/dart' % (rds_pwd, rds_host))

    _logger.info('updating configuration with new elb name')
    web_params = output_config['cloudformation_stacks']['web']['boto_args']['Parameters']
    elb_name_param = _get_element(web_params, 'ParameterKey', 'WebEcsServiceLoadBalancerName')
    elb_name = elb_outputs[0]['OutputValue']
    elb_name_param['ParameterValue'] = elb_name

    _logger.info('updating configuration with new internal elb name')
    web_int_params = output_config['cloudformation_stacks']['web-internal']['boto_args']['Parameters']
    elb_int_name_param = _get_element(web_int_params, 'ParameterKey', 'WebEcsServiceLoadBalancerName')
    elb_int_name = elb_int_outputs[0]['OutputValue']
    elb_int_name_param['ParameterValue'] = elb_int_name

    _logger.info('updating configuration with new engine taskrunner ecs cluster name')
    output_config['dart']['engine_taskrunner_ecs_cluster'] = engine_taskrunner_outputs[0]['OutputValue']

    _logger.info('updating configuration with encrypted dart email username/password')
    mailer_options = output_config['email']['mailer']
    # NOTE(review): '******' appears to be a redacted literal (presumably
    # '!decrypt email-username', mirroring 'pwd' below) -- verify.
    mailer_options['usr'] = '******'
    mailer_options['pwd'] = '!decrypt email-password'
    secrets_service.put('email-username', self.dart_email_username)
    secrets_service.put('email-password', self.dart_email_password)

    # --- serialize the config, undo yaml quoting of dart directives, upload ---
    _logger.info('uploading the output configuration to s3')
    body = yaml.dump(output_config, default_flow_style=False)
    # yaml.dump quotes the custom '!decrypt'/'!env' tags; strip the quotes
    body = re.sub(r"'!decrypt (.+?)'", r"!decrypt \1", body)
    body = re.sub(r"'!env (.+?)'", r"!env \1", body)
    # restore characters that were escaped earlier in the pipeline
    body = re.sub(r"__DARTBANG__", r"!", body)
    body = re.sub(r"__DARTQUOTE__", r"'", body)
    body = re.sub(r"__DARTDOLLAR__", r"$", body)
    boto3.client('s3').put_object(
        Bucket=get_bucket_name(self.output_config_s3_path),
        Key=get_key_name(self.output_config_s3_path),
        Body=body)

    # --- web stacks, then wait for ECS service + load balancer health ---
    _logger.info('creating and waiting for web stacks')
    web_stack_name = self._create_stack('web', self.mode, output_config)
    web_internal_stack_name = self._create_stack('web-internal', self.mode, output_config)
    web_outputs = self._wait_for_stack_completion_and_get_outputs(web_stack_name, 2)
    self._wait_for_stack_completion_and_get_outputs(web_internal_stack_name)
    _logger.info('waiting for web ecs service to stabilize')
    cluster_name = _get_element(web_outputs, 'OutputKey', 'EcsClusterResourceName')['OutputValue']
    service_name = _get_element(web_outputs, 'OutputKey', 'WebEcsServiceResourceName')['OutputValue']
    boto3.client('ecs').get_waiter('services_stable').wait(cluster=cluster_name, services=[service_name])
    _logger.info('done')
    _logger.info('waiting for web app to attach to load balancer')
    self._wait_for_web_app(elb_name)

    # --- database schema and triggers via the now-live web app ---
    time.sleep(5)
    _logger.info('initializing database schema')
    dart_host = _get_dart_host(output_config)
    response = requests.post('http://%s/admin/create_all' % dart_host)
    response.raise_for_status()
    time.sleep(5)
    _logger.info('creating database triggers')
    with open(dart_root_relative_path('src', 'database', 'triggers.sql')) as f:
        # NOTE(review): '*****:*****' redaction as above -- confirm template
        engine = sqlalchemy.create_engine('postgresql://*****:*****@%s:5432/dart' % (rds_pwd, rds_host))
        engine.execute(f.read())
    _logger.info('done')

    # --- register engines (retried) and create the worker stacks ---
    time.sleep(5)
    _logger.info('adding engines')
    self._with_retries(add_no_op_engine, output_config)
    self._with_retries(add_no_op_engine_sub_graphs, output_config)
    self._with_retries(add_emr_engine, output_config)
    self._with_retries(add_emr_engine_sub_graphs, output_config)
    self._with_retries(add_dynamodb_engine, output_config)
    self._with_retries(add_redshift_engine, output_config)
    self._with_retries(add_s3_engine, output_config)
    _logger.info('creating and waiting for remaining stacks')
    engine_worker_stack_name = self._create_stack('engine-worker', self.mode, output_config)
    trigger_worker_stack_name = self._create_stack('trigger-worker', self.mode, output_config)
    subscription_worker_stack_name = self._create_stack('subscription-worker', self.mode, output_config)
    self._wait_for_stack_completion_and_get_outputs(engine_worker_stack_name)
    self._wait_for_stack_completion_and_get_outputs(trigger_worker_stack_name)
    self._wait_for_stack_completion_and_get_outputs(subscription_worker_stack_name)
def create_partial(self, output_config):
    """
    Provision a partial dart environment: create the CloudFormation stacks
    in dependency order, thread their outputs back into *output_config*,
    store secrets, upload the final config to S3, initialize the database,
    and register the engines.

    Mutates *output_config* in place and performs extensive AWS / HTTP /
    database side effects; blocks until each stack completes.
    """
    # --- wire the trigger queue into the events stack parameters ---
    _logger.info('updating configuration with trigger queue urls/arns')
    trigger_queue_arn, trigger_queue_url = self._ensure_queue_exists(output_config, 'trigger_queue')
    events_params = output_config['cloudformation_stacks']['events']['boto_args']['Parameters']
    self._get_element(events_params, 'ParameterKey', 'TriggerQueueUrl')['ParameterValue'] = trigger_queue_url
    self._get_element(events_params, 'ParameterKey', 'TriggerQueueArn')['ParameterValue'] = trigger_queue_arn

    # --- create the foundation stacks, then wait for all of them ---
    _logger.info('creating initial stacks')
    events_stack_name = self._create_stack('events', output_config)
    rds_stack_name = self._create_stack('rds', output_config)
    elb_stack_name = self._create_stack('elb', output_config)
    elb_int_stack_name = self._create_stack('elb-internal', output_config)
    engine_taskrunner_stack_name = self._create_stack('engine-taskrunner', output_config)
    _logger.info('waiting for stack completion')
    # each call expects exactly 1 output from its stack
    events_outputs = self._wait_for_stack_completion_and_get_outputs(events_stack_name, 1)
    rds_outputs = self._wait_for_stack_completion_and_get_outputs(rds_stack_name, 1)
    elb_outputs = self._wait_for_stack_completion_and_get_outputs(elb_stack_name, 1)
    elb_int_outputs = self._wait_for_stack_completion_and_get_outputs(elb_int_stack_name, 1)
    engine_taskrunner_outputs = self._wait_for_stack_completion_and_get_outputs(engine_taskrunner_stack_name, 1)

    # --- fold stack outputs back into the configuration ---
    _logger.info('updating configuration with new cloudwatch scheduled events sns topic name')
    sns_arn = events_outputs[0]['OutputValue']
    output_config['triggers']['scheduled']['cloudwatch_scheduled_events_sns_arn'] = sns_arn

    _logger.info('updating configuration with new rds endpoint and password')
    db_uri_secret_key = 'database-uri-%s' % self.environment_name
    # the real URI lives in the secrets store; the config holds a !decrypt ref
    output_config['flask']['SQLALCHEMY_DATABASE_URI'] = '!decrypt %s' % db_uri_secret_key
    secrets_config = get_secrets_config(output_config)
    secrets_service = Secrets(secrets_config['kms_key_arn'], secrets_config['secrets_s3_path'])
    rds_pwd = os.environ['DART_RDS_PASSWORD']
    rds_host = rds_outputs[0]['OutputValue']
    # NOTE(review): the '*****:*****' in this format string looks like a
    # redaction of credential placeholders (e.g. '%s:%s'); as written the
    # template has one placeholder for two values -- confirm against the
    # unredacted source.
    secrets_service.put(db_uri_secret_key, 'postgresql://*****:*****@%s:5432/dart' % (rds_pwd, rds_host))

    _logger.info('updating configuration with new elb name')
    web_params = output_config['cloudformation_stacks']['web']['boto_args']['Parameters']
    elb_name_param = self._get_element(web_params, 'ParameterKey', 'WebEcsServiceLoadBalancerName')
    elb_name = elb_outputs[0]['OutputValue']
    elb_name_param['ParameterValue'] = elb_name

    _logger.info('updating configuration with new internal elb name')
    web_int_params = output_config['cloudformation_stacks']['web-internal']['boto_args']['Parameters']
    elb_int_name_param = self._get_element(web_int_params, 'ParameterKey', 'WebEcsServiceLoadBalancerName')
    elb_int_name = elb_int_outputs[0]['OutputValue']
    elb_int_name_param['ParameterValue'] = elb_int_name

    _logger.info('updating configuration with new engine taskrunner ecs cluster name')
    output_config['dart']['engine_taskrunner_ecs_cluster'] = engine_taskrunner_outputs[0]['OutputValue']

    _logger.info('updating configuration with encrypted dart email username/password')
    mailer_options = output_config['email']['mailer']
    # NOTE(review): '******' appears to be a redacted literal (presumably
    # '!decrypt email-username', mirroring 'pwd' below) -- verify.
    mailer_options['usr'] = '******'
    mailer_options['pwd'] = '!decrypt email-password'
    secrets_service.put('email-username', self.dart_email_username)
    secrets_service.put('email-password', self.dart_email_password)

    # --- serialize the config, undo yaml quoting of dart directives, upload ---
    _logger.info('uploading the output configuration to s3')
    body = yaml.dump(output_config, default_flow_style=False)
    # yaml.dump quotes the custom '!decrypt'/'!env' tags; strip the quotes
    body = re.sub(r"'!decrypt (.+?)'", r"!decrypt \1", body)
    body = re.sub(r"'!env (.+?)'", r"!env \1", body)
    # restore characters that were escaped earlier in the pipeline
    body = re.sub(r"__DARTBANG__", r"!", body)
    body = re.sub(r"__DARTQUOTE__", r"'", body)
    body = re.sub(r"__DARTDOLLAR__", r"$", body)
    boto3.client('s3').put_object(
        Bucket=get_bucket_name(self.output_config_s3_path),
        Key=get_key_name(self.output_config_s3_path),
        Body=body
    )

    # --- web stacks, then wait for ECS service + load balancer health ---
    _logger.info('creating and waiting for web stacks')
    web_stack_name = self._create_stack('web', output_config)
    web_internal_stack_name = self._create_stack('web-internal', output_config)
    web_outputs = self._wait_for_stack_completion_and_get_outputs(web_stack_name, 2)
    self._wait_for_stack_completion_and_get_outputs(web_internal_stack_name)
    _logger.info('waiting for web ecs service to stabilize')
    cluster_name = self._get_element(web_outputs, 'OutputKey', 'EcsClusterResourceName')['OutputValue']
    service_name = self._get_element(web_outputs, 'OutputKey', 'WebEcsServiceResourceName')['OutputValue']
    boto3.client('ecs').get_waiter('services_stable').wait(cluster=cluster_name, services=[service_name])
    _logger.info('done')
    _logger.info('waiting for web app to attach to load balancer')
    self._wait_for_web_app(elb_name)

    # --- database schema and triggers via the now-live web app ---
    time.sleep(5)
    _logger.info('initializing database schema')
    dart_host = self._get_dart_host(output_config)
    response = requests.post('http://%s/admin/create_all' % dart_host)
    response.raise_for_status()
    time.sleep(5)
    _logger.info('creating database triggers')
    with open(dart_root_relative_path('src', 'database', 'triggers.sql')) as f:
        # NOTE(review): '*****:*****' redaction as above -- confirm template
        engine = sqlalchemy.create_engine('postgresql://*****:*****@%s:5432/dart' % (rds_pwd, rds_host))
        engine.execute(f.read())
    _logger.info('done')

    # --- register engines and create the worker stacks ---
    time.sleep(5)
    _logger.info('adding engines')
    add_no_op_engine(output_config)
    add_no_op_engine_sub_graphs(output_config)
    add_emr_engine(output_config)
    add_emr_engine_sub_graphs(output_config)
    add_redshift_engine(output_config)
    _logger.info('creating and waiting for remaining stacks')
    engine_worker_stack_name = self._create_stack('engine-worker', output_config)
    trigger_worker_stack_name = self._create_stack('trigger-worker', output_config)
    subscription_worker_stack_name = self._create_stack('subscription-worker', output_config)
    self._wait_for_stack_completion_and_get_outputs(engine_worker_stack_name)
    self._wait_for_stack_completion_and_get_outputs(trigger_worker_stack_name)
    self._wait_for_stack_completion_and_get_outputs(subscription_worker_stack_name)