def _get_metadata(self, url):
    response = self._get(url)
    if response.status_code != 200:
        log_add(metadata_error_code=response.status_code, metadata_url=url)
        response.raise_for_status()
    data = response.json()
    return data

def write_to_kinesis(events, stream_name):
    records = [
        {"Data": json.dumps(event) + "\n", "PartitionKey": str(uuid.uuid4())}
        for event in events
    ]
    log_add(num_records=len(records))
    kinesis_client.put_records(StreamName=stream_name, Records=records)

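# Note: the Kinesis PutRecords API accepts at most 500 records per call, so
# callers with larger batches would need to chunk first. A minimal sketch;
# `chunked` is an illustrative helper, not part of the original module:
def chunked(items, size=500):
    # Yield successive slices of at most `size` items.
    for i in range(0, len(items), size):
        yield items[i : i + size]

# for batch in chunked(events):
#     write_to_kinesis(batch, stream_name)
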
def say_hello(event, context):
    log_add(relevant_information="Hello from Python blueprint!")
    return {
        "statusCode": 200,
        "headers": {},
        "body": json.dumps({"hello": "world!"}),
    }

def validate(self, data):
    raw_errors = self.validator.iter_errors(data)
    log_add(raw_errors=raw_errors)
    errors = []
    for e in raw_errors:
        error = {"message": e.message, "row": "root"}
        path_len = len(e.path)
        if path_len > 0:
            error["row"] = e.path[0]
        if path_len > 1:
            error["col"] = e.path[1]
        errors.append(error)
    return errors

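# A hedged usage sketch, assuming `self.validator` is a jsonschema validator
# (e.g. Draft7Validator) built from the step schema; the data below is
# illustrative. For a schema requiring an array of integers, validating
# [1, "two"] yields an error whose path points at index 1, producing:
# [{"message": "'two' is not of type 'integer'", "row": 1}]
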
def resolve_s3_sources(self, source_prefix: str):
    source_objects = self.list_objects_contents(source_prefix)
    log_add(num_source_objects=len(source_objects))
    if not source_objects:
        raise Exception(f"No source files found at: {source_prefix}")
    s3_sources = []
    for obj in source_objects:
        source_key = obj["Key"]
        filename = source_key.split("/")[-1]
        s3_sources.append(S3Source(filename=filename, key=source_key))
    return s3_sources

def invoke_lambda(event, context):
    config = Config.from_lambda_event(event)
    function_arn = config.payload.pipeline.task_config.get(config.task).get("arn")
    log_add(function_arn=function_arn)
    log_add(event=event)
    response = lambda_client.invoke(
        FunctionName=function_arn,
        Payload=json.dumps(event),
        InvocationType="RequestResponse",
    )
    result = read_result(response)
    return result

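# `read_result` is defined elsewhere in the module. With boto3's Lambda
# client it would plausibly look like this sketch (an assumption, not the
# original implementation):
def read_result(response):
    # The "Payload" entry is a StreamingBody holding the invoked function's
    # JSON-serialized return value.
    return json.loads(response["Payload"].read())
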
def validate_json(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_dict(config.task_config)
    step_data = config.payload.step_data
    log_add(
        dataset_id=config.payload.output_dataset.id,
        version=config.payload.output_dataset.version,
        edition=config.payload.output_dataset.edition,
    )
    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ", step_data.input_count)
    if step_config.schema is None:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_SUCCESS",
                errors=[],
            )
        )
    input_data = resolve_input_data(step_data)
    validation_errors = JsonSchemaValidator(step_config.schema).validate_list(
        input_data
    )
    if validation_errors:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_FAILED",
                errors=validation_errors[:100],
            )
        )
    return asdict(
        StepData(
            input_events=step_data.input_events,
            s3_input_prefixes=step_data.s3_input_prefixes,
            status="VALIDATION_SUCCESS",
            errors=[],
        )
    )

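# Shape of the dict returned on success, derived from the StepData fields
# used above (values illustrative; any defaulted fields StepData defines
# would appear here too, since asdict serializes the whole dataclass):
# {"input_events": [...], "s3_input_prefixes": {...},
#  "status": "VALIDATION_SUCCESS", "errors": []}
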
def write_kinesis(event, context):
    pipeline_config = Config.from_lambda_event(event)
    dataset_id = pipeline_config.payload.output_dataset.id
    version = pipeline_config.payload.output_dataset.version
    log_add(dataset_id=dataset_id, version=version)
    dataset = dataset_client.get_dataset(dataset_id, retries=3)
    access_rights = dataset["accessRights"]
    confidentiality = CONFIDENTIALITY_MAP[access_rights]
    output_stream_name = f"dp.{confidentiality}.{dataset_id}.processed.{version}.json"
    log_add(output_stream_name=output_stream_name)
    input_events = pipeline_config.payload.step_data.input_events
    write_to_kinesis(events=input_events, stream_name=output_stream_name)
    return asdict(StepData(input_events=input_events, status="OK", errors=[]))

def delete_from_prefix(self, s3_prefix):
    objects_to_delete = [
        {"Key": obj["Key"]} for obj in self.list_objects_contents(s3_prefix)
    ]
    if not objects_to_delete:
        return
    self.client.delete_objects(
        Bucket=self.bucket,
        Delete={"Objects": objects_to_delete, "Quiet": True},
    )
    log_add(deleted_from_s3_path=objects_to_delete)

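# Note: S3 DeleteObjects removes at most 1000 keys per request. If a prefix
# may hold more objects than that, the call would need batching; a minimal
# sketch (the batching is an addition, not part of the original method):
# for i in range(0, len(objects_to_delete), 1000):
#     self.client.delete_objects(
#         Bucket=self.bucket,
#         Delete={"Objects": objects_to_delete[i : i + 1000], "Quiet": True},
#     )
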
def read_csv(self):
    s3_objects = self._list_s3_objects()
    schema = self.task_config.schema
    log_add(schema=schema)
    log_add(s3_keys=[obj["Key"] for obj in s3_objects])
    files = []
    for s3_object in s3_objects:
        key = self.s3fs_prefix + s3_object["Key"]
        dtype = Exporter.get_dtype(schema, key)
        df = self._read_csv_data(
            key,
            delimiter=self.task_config.delimiter,
            chunksize=self.task_config.chunksize,
            dtype=dtype,
        )
        filename = key.split("/")[-1]
        filename = Exporter.remove_suffix(filename)
        files.append((filename, df))
    return files

def validate_csv(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_task_config(config.task_config)
    s3_prefix = config.payload.output_dataset.s3_prefix
    log_add(
        header_row=step_config.header_row,
        delimiter=step_config.delimiter,
        quote=step_config.quote,
        schema=step_config.schema,
        output_prefix=s3_prefix,
    )
    if not step_config.schema:
        log_add(notice="No schema provided for validation")
        config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
        # 2020.06: Validation is optional - we return OK when no schema is
        # supplied for the validation step.
        return asdict(config.payload.step_data)
    input_prefix = next(iter(config.payload.step_data.s3_input_prefixes.values()))
    log_add(s3_input_prefix=input_prefix)
    objects = s3.list_objects_v2(Bucket=BUCKET, Prefix=input_prefix)
    s3_path = next(iter(objects["Contents"]))["Key"]
    log_add(s3_input_path=s3_path)
    response = s3.get_object(Bucket=BUCKET, Key=s3_path)
    reader = csv.reader(
        string_reader.from_response(response),
        dialect="unix",
        delimiter=step_config.delimiter,
        quotechar=step_config.quote,
    )
    header = None
    if step_config.header_row:
        header = next(reader)
    try:
        csv_data = parse_csv(reader, step_config.schema, header)
    except ParseErrors as p:
        return _with_error(config, p.errors)
    validation_errors = JsonSchemaValidator(step_config.schema).validate(csv_data)
    if validation_errors:
        return _with_error(config, errors=validation_errors)
    config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
    return asdict(config.payload.step_data)

def export(self):
    inputs = self.read_csv()
    s3_prefix = self.s3_prefix()
    outputs = []
    schema = self.task_config.schema
    errors = []
    try:
        for filename, source in inputs:
            out_prefix = f"s3://{BUCKET}/{s3_prefix}{filename}"
            if self.task_config.chunksize:
                outputs.extend(
                    self._parallel_export(filename, source, schema, out_prefix)
                )
            else:
                outputs.append(self._export(source, schema, out_prefix))
    except OutOfBoundsDatetime as e:
        errors.append({"error": "OutOfBoundsDatetime", "message": str(e)})
    except ValueError as e:
        errors.append({"error": "ValueError", "message": str(e)})
    if errors:
        log_add(errors=errors)
        return asdict(
            StepData(
                status="CONVERSION_FAILED",
                errors=errors,
                s3_input_prefixes={self.config.payload.output_dataset.id: s3_prefix},
            )
        )
    log_add(parquetfiles=outputs)
    return asdict(
        StepData(
            status="CONVERSION_SUCCESS",
            errors=[],
            s3_input_prefixes={self.config.payload.output_dataset.id: s3_prefix},
        )
    )

def generate_signed_url_public(event, context):
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient()
    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(
            500, "Could not complete request, please try again later"
        )
    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")
    if dataset["accessRights"] != "public":
        return error_response(403, "Forbidden")
    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)
    return response(200, json.dumps(signed_urls))

def generate_signed_urls(bucket, dataset, edition):
    access_rights = dataset["accessRights"]
    dataset_id, version, edition_id = edition["Id"].split("/")
    confidentiality = CONFIDENTIALITY_MAP[access_rights]
    common_prefix = f"processed/{confidentiality}/"
    parent_id = dataset.get("parent_id")
    dataset_prefix = f"{dataset_id}/version={version}/edition={edition_id}/"
    if parent_id:
        dataset_prefix = f"{parent_id}/{dataset_prefix}"
    prefix = common_prefix + dataset_prefix
    log_add(
        dataset_access_rights=access_rights,
        s3_bucket=bucket,
        s3_prefix=prefix,
    )
    session = boto3.Session()
    s3 = session.client("s3")
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    signed_urls = [
        {
            "key": obj["Key"],
            "url": s3.generate_presigned_url(
                "get_object",
                Params={"Bucket": bucket, "Key": obj["Key"]},
                ExpiresIn=60 * 5,
            ),
        }
        for obj in resp["Contents"]
    ]
    return signed_urls

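# Illustration of the prefix construction above, assuming CONFIDENTIALITY_MAP
# maps "public" to "green" (the exact mapping lives elsewhere): for an
# edition with Id "my-dataset/1/20200601T090000" and no parent_id, objects
# are listed under
#   processed/green/my-dataset/version=1/edition=20200601T090000/
# and each object gets a presigned GET URL valid for five minutes (60 * 5).
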
def generate_signed_url(event, context):
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient.with_access_token_from_event(event)
    if not client:
        return error_response(403, "Forbidden")
    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(
            500, "Could not complete request, please try again later"
        )
    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")
    # Only users with read access may download non-public datasets.
    if (
        dataset["accessRights"] != "public"
        and ENABLE_AUTH
        and not resource_authorizer.has_access(
            client.access_token,
            scope="okdata:dataset:read",
            resource_name=f"okdata:dataset:{dataset_id}",
        )
    ):
        log_add(has_access=False)
        return error_response(403, "Forbidden")
    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)
    return response(200, json.dumps(signed_urls))

def get_dtype(schema, input):
    """Try to resolve the dtype of each column before reading a CSV file.

    The dtype is resolved from `taskConfig.TASK_NAME.schema`. If no schema is
    available, we instead read the first line (the column headers) of the
    file that is about to be read by pandas, and set each column to the
    default type, `object`.
    """
    log_add(dtype_source="jsonschema")
    dtype = Exporter.jsonschema_to_dtypes(schema)
    if dtype is None:
        log_add(dtype_source=f"input:{input}")
        dtype = Exporter.get_dtype_from_input(input)
    log_add(dtype=dtype)
    return dtype

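# A hedged illustration of the schema-to-dtype resolution described in the
# docstring; the exact mapping lives in `Exporter.jsonschema_to_dtypes` and
# the values below are assumptions:
# {"properties": {"id": {"type": "integer"}, "name": {"type": "string"}}}
#   -> {"id": "int64", "name": "object"}
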
def _with_error(config: Config, errors):
    log_add(errors=errors)
    log_add(status=Status.VALIDATION_FAILED.value)
    config.payload.step_data.status = Status.VALIDATION_FAILED.value
    config.payload.step_data.errors = errors[:100]
    return asdict(config.payload.step_data)

def _dataset_components_from_event(event):
    pp = event["pathParameters"]
    dataset_id, version_id, edition_id = pp["dataset"], pp["version"], pp["edition"]
    log_add(dataset_id=dataset_id, version_id=version_id, edition_id=edition_id)
    return dataset_id, version_id, edition_id

def validate_schema_version(self, schema):
    schema_version = schema["$schema"]
    log_add(schema_version=schema_version)
    if schema_version not in SCHEMA_SUPPORTED_VERSIONS:
        raise ValueError(f"Schema version: {schema_version} is not supported")

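# Usage sketch: SCHEMA_SUPPORTED_VERSIONS is defined elsewhere and might,
# for example, include "http://json-schema.org/draft-07/schema#". A schema
# whose "$schema" value is not in that list raises ValueError:
# validator.validate_schema_version({"$schema": "http://example.com/v0"})
# -> ValueError: Schema version: http://example.com/v0 is not supported
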
def __init__(self, event):
    self.s3 = boto3.client("s3")
    self.s3fs_prefix = f"s3://{BUCKET}/"
    self.config = Config.from_lambda_event(event)
    self.task_config = TaskConfig.from_config(self.config)
    log_add(input_config=asdict(self.task_config))

def read_root():
    log_add(hello="world")
    return {"hello": "world"}

def read_error():
    log_add(hello="error")
    raise Exception("This is wrong!")

def is_latest_edition(dataset_id, version, edition):
    latest_edition = dataset_client.get_latest_edition(dataset_id, version)
    is_latest = [dataset_id, version, edition] == latest_edition["Id"].split("/")
    log_add(is_latest_edition=is_latest)
    return is_latest

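# Usage sketch, assuming `get_latest_edition` returns a dict whose "Id" has
# the form "<dataset_id>/<version>/<edition>" (values are illustrative):
# is_latest_edition("my-dataset", "1", "20200601T090000")
# -> True when the latest edition Id is "my-dataset/1/20200601T090000"
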
def write_s3(event, context):
    config = Config.from_lambda_event(event)
    task_config = TaskConfig.from_dict(config.task_config)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    content_type = task_config.content_type
    log_add(
        dataset_id=output_dataset.id,
        version=output_dataset.version,
        edition_id=output_dataset.edition,
        source_prefixes=step_data.s3_input_prefixes,
        write_to_latest=task_config.write_to_latest,
        output_stage=task_config.output_stage,
    )
    if content_type:
        log_add(content_type=content_type)
    status_add(
        domain="dataset",
        domain_id=f"{output_dataset.id}/{output_dataset.version}",
        operation=config.task,
    )
    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ", step_data.input_count)
    source_prefix = next(iter(step_data.s3_input_prefixes.values()))
    output_prefix = config.payload.output_dataset.s3_prefix.replace(
        "%stage%", task_config.output_stage
    )
    s3_sources = s3_service.resolve_s3_sources(source_prefix)
    copied_files = copy_data(s3_sources, output_prefix)
    if task_config.output_stage == "processed":
        try:
            create_distribution_with_retries(
                output_dataset, copied_files, content_type
            )
        except Exception as e:
            s3_service.delete_from_prefix(output_prefix)
            log_exception(e)
            raise DistributionNotCreated
    if task_config.write_to_latest and is_latest_edition(
        output_dataset.id, output_dataset.version, output_dataset.edition
    ):
        write_data_to_latest(s3_sources, output_prefix)
    output_prefixes = {output_dataset.id: output_prefix}
    response = StepData(s3_input_prefixes=output_prefixes, status="OK", errors=[])
    # TODO: This is just to verify that we have a correct implementation of
    # the status API. Temporary: if we are in a /latest write, set the run to
    # complete. Once this is up and we see what the status API can return to
    # the CLI, we will update it with more information.
    status_body = {
        "files": [s3_source.key for s3_source in s3_sources],
        "latest": task_config.write_to_latest,
    }
    status_add(status_body=status_body)
    return asdict(response)

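# Illustration of the "%stage%" substitution above, assuming an s3_prefix
# template like "%stage%/green/my-dataset/version=1/edition=20200601T090000/"
# (the template shape is an assumption): with output_stage "processed", the
# output prefix becomes
#   processed/green/my-dataset/version=1/edition=20200601T090000/
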
def __init__(self):
    self.client = boto3.client("s3")
    # `self.bucket` is assumed to be set as a class attribute elsewhere.
    log_add(s3_bucket=self.bucket)