def _get_metadata(self, url):
    response = self._get(url)
    if response.status_code != 200:
        log_add(metadata_error_code=response.status_code, metadata_url=url)
        response.raise_for_status()
    data = response.json()
    return data

def write_to_kinesis(events, stream_name):
    records = [
        {"Data": json.dumps(event) + "\n", "PartitionKey": str(uuid.uuid4())}
        for event in events
    ]
    log_add(num_records=len(records))
    kinesis_client.put_records(StreamName=stream_name, Records=records)

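# Note: the Kinesis PutRecords API accepts at most 500 records per call, so
# callers with larger batches would need to chunk first. A minimal sketch;
# `chunked` is an illustrative helper, not part of the original module:
def chunked(items, size=500):
    # Yield successive slices of at most `size` items.
    for i in range(0, len(items), size):
        yield items[i : i + size]

# for batch in chunked(events):
#     write_to_kinesis(batch, stream_name)
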
def say_hello(event, context):
    log_add(relevant_information="Hello from Python blueprint!")
    return {
        "statusCode": 200,
        "headers": {},
        "body": json.dumps({"hello": "world!"}),
    }

def validate(self, data):
    raw_errors = self.validator.iter_errors(data)
    log_add(raw_errors=raw_errors)
    errors = []
    for e in raw_errors:
        error = {"message": e.message, "row": "root"}
        path_len = len(e.path)
        if path_len > 0:
            error["row"] = e.path[0]
        if path_len > 1:
            error["col"] = e.path[1]
        errors.append(error)
    return errors

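# A hedged usage sketch, assuming `self.validator` is a jsonschema validator
# (e.g. Draft7Validator) built from the step schema; the data below is
# illustrative. For a schema requiring an array of integers, validating
# [1, "two"] yields an error whose path points at index 1, producing:
# [{"message": "'two' is not of type 'integer'", "row": 1}]
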
def resolve_s3_sources(self, source_prefix: str):
    source_objects = self.list_objects_contents(source_prefix)
    log_add(num_source_objects=len(source_objects))
    if not source_objects:
        raise Exception(f"No source files found at: {source_prefix}")
    s3_sources = []
    for obj in source_objects:
        source_key = obj["Key"]
        filename = source_key.split("/")[-1]
        s3_sources.append(S3Source(filename=filename, key=source_key))
    return s3_sources

def invoke_lambda(event, context):
    config = Config.from_lambda_event(event)
    function_arn = config.payload.pipeline.task_config.get(config.task).get("arn")
    log_add(function_arn=function_arn)
    log_add(event=event)
    response = lambda_client.invoke(
        FunctionName=function_arn,
        Payload=json.dumps(event),
        InvocationType="RequestResponse",
    )
    result = read_result(response)
    return result

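# `read_result` is defined elsewhere in the module. With boto3's Lambda
# client it would plausibly look like this sketch (an assumption, not the
# original implementation):
def read_result(response):
    # The "Payload" entry is a StreamingBody holding the invoked function's
    # JSON-serialized return value.
    return json.loads(response["Payload"].read())
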
def validate_json(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_dict(config.task_config)
    step_data = config.payload.step_data
    log_add(
        dataset_id=config.payload.output_dataset.id,
        version=config.payload.output_dataset.version,
        edition=config.payload.output_dataset.edition,
    )
    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ", step_data.input_count)
    if step_config.schema is None:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_SUCCESS",
                errors=[],
            )
        )
    input_data = resolve_input_data(step_data)
    validation_errors = JsonSchemaValidator(step_config.schema).validate_list(
        input_data
    )
    if validation_errors:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_FAILED",
                errors=validation_errors[:100],
            )
        )
    return asdict(
        StepData(
            input_events=step_data.input_events,
            s3_input_prefixes=step_data.s3_input_prefixes,
            status="VALIDATION_SUCCESS",
            errors=[],
        )
    )

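# Shape of the dict returned on success, derived from the StepData fields
# used above (values illustrative; any defaulted fields StepData defines
# would appear here too, since asdict serializes the whole dataclass):
# {"input_events": [...], "s3_input_prefixes": {...},
#  "status": "VALIDATION_SUCCESS", "errors": []}
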
def write_kinesis(event, context):
    pipeline_config = Config.from_lambda_event(event)
    dataset_id = pipeline_config.payload.output_dataset.id
    version = pipeline_config.payload.output_dataset.version
    log_add(dataset_id=dataset_id, version=version)
    dataset = dataset_client.get_dataset(dataset_id, retries=3)
    access_rights = dataset["accessRights"]
    confidentiality = CONFIDENTIALITY_MAP[access_rights]
    output_stream_name = f"dp.{confidentiality}.{dataset_id}.processed.{version}.json"
    log_add(output_stream_name=output_stream_name)
    input_events = pipeline_config.payload.step_data.input_events
    write_to_kinesis(events=input_events, stream_name=output_stream_name)
    return asdict(StepData(input_events=input_events, status="OK", errors=[]))

def delete_from_prefix(self, s3_prefix):
    objects_to_delete = [
        {"Key": obj["Key"]} for obj in self.list_objects_contents(s3_prefix)
    ]
    if not objects_to_delete:
        return
    self.client.delete_objects(
        Bucket=self.bucket,
        Delete={"Objects": objects_to_delete, "Quiet": True},
    )
    log_add(deleted_from_s3_path=objects_to_delete)

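# Note: S3 DeleteObjects removes at most 1000 keys per request. If a prefix
# may hold more objects than that, the call would need batching; a minimal
# sketch (the batching is an addition, not part of the original method):
# for i in range(0, len(objects_to_delete), 1000):
#     self.client.delete_objects(
#         Bucket=self.bucket,
#         Delete={"Objects": objects_to_delete[i : i + 1000], "Quiet": True},
#     )
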
def read_csv(self):
    s3_objects = self._list_s3_objects()
    schema = self.task_config.schema
    log_add(schema=schema)
    log_add(s3_keys=[obj["Key"] for obj in s3_objects])
    files = []
    for s3_object in s3_objects:
        key = self.s3fs_prefix + s3_object["Key"]
        dtype = Exporter.get_dtype(schema, key)
        df = self._read_csv_data(
            key,
            delimiter=self.task_config.delimiter,
            chunksize=self.task_config.chunksize,
            dtype=dtype,
        )
        filename = key.split("/")[-1]
        filename = Exporter.remove_suffix(filename)
        files.append((filename, df))
    return files

def validate_csv(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_task_config(config.task_config)
    s3_prefix = config.payload.output_dataset.s3_prefix
    log_add(
        header_row=step_config.header_row,
        delimiter=step_config.delimiter,
        quote=step_config.quote,
        schema=step_config.schema,
        output_prefix=s3_prefix,
    )
    if not step_config.schema:
        log_add(notice="No schema provided for validation")
        config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
        # 2020.06: Validation is optional - we return OK when no schema is
        # supplied for the validation step.
        return asdict(config.payload.step_data)
    input_prefix = next(iter(config.payload.step_data.s3_input_prefixes.values()))
    log_add(s3_input_prefix=input_prefix)
    objects = s3.list_objects_v2(Bucket=BUCKET, Prefix=input_prefix)
    s3_path = next(iter(objects["Contents"]))["Key"]
    log_add(s3_input_path=s3_path)
    response = s3.get_object(Bucket=BUCKET, Key=s3_path)
    reader = csv.reader(
        string_reader.from_response(response),
        dialect="unix",
        delimiter=step_config.delimiter,
        quotechar=step_config.quote,
    )
    header = None
    if step_config.header_row:
        header = next(reader)
    try:
        csv_data = parse_csv(reader, step_config.schema, header)
    except ParseErrors as p:
        return _with_error(config, p.errors)
    validation_errors = JsonSchemaValidator(step_config.schema).validate(csv_data)
    if validation_errors:
        return _with_error(config, errors=validation_errors)
    config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
    return asdict(config.payload.step_data)

def export(self):
    inputs = self.read_csv()
    s3_prefix = self.s3_prefix()
    outputs = []
    schema = self.task_config.schema
    errors = []
    try:
        for filename, source in inputs:
            out_prefix = f"s3://{BUCKET}/{s3_prefix}{filename}"
            if self.task_config.chunksize:
                outputs.extend(
                    self._parallel_export(filename, source, schema, out_prefix)
                )
            else:
                outputs.append(self._export(source, schema, out_prefix))
    except OutOfBoundsDatetime as e:
        errors.append({"error": "OutOfBoundsDatetime", "message": str(e)})
    except ValueError as e:
        errors.append({"error": "ValueError", "message": str(e)})
    if errors:
        log_add(errors=errors)
        return asdict(
            StepData(
                status="CONVERSION_FAILED",
                errors=errors,
                s3_input_prefixes={self.config.payload.output_dataset.id: s3_prefix},
            )
        )
    log_add(parquetfiles=outputs)
    return asdict(
        StepData(
            status="CONVERSION_SUCCESS",
            errors=[],
            s3_input_prefixes={self.config.payload.output_dataset.id: s3_prefix},
        )
    )

def generate_signed_url_public(event, context):
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient()
    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(
            500, "Could not complete request, please try again later"
        )
    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")
    if dataset["accessRights"] != "public":
        return error_response(403, "Forbidden")
    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)
    return response(200, json.dumps(signed_urls))

def generate_signed_urls(bucket, dataset, edition):
    access_rights = dataset["accessRights"]
    dataset_id, version, edition_id = edition["Id"].split("/")
    confidentiality = CONFIDENTIALITY_MAP[access_rights]
    common_prefix = f"processed/{confidentiality}/"
    parent_id = dataset.get("parent_id")
    dataset_prefix = f"{dataset_id}/version={version}/edition={edition_id}/"
    if parent_id:
        dataset_prefix = f"{parent_id}/{dataset_prefix}"
    prefix = common_prefix + dataset_prefix
    log_add(
        dataset_access_rights=access_rights,
        s3_bucket=bucket,
        s3_prefix=prefix,
    )
    session = boto3.Session()
    s3 = session.client("s3")
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    signed_urls = [
        {
            "key": obj["Key"],
            "url": s3.generate_presigned_url(
                "get_object",
                Params={"Bucket": bucket, "Key": obj["Key"]},
                ExpiresIn=60 * 5,
            ),
        }
        for obj in resp["Contents"]
    ]
    return signed_urls

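# Illustration of the prefix construction above, assuming CONFIDENTIALITY_MAP
# maps "public" to "green" (the exact mapping lives elsewhere): for an
# edition with Id "my-dataset/1/20200601T090000" and no parent_id, objects
# are listed under
#   processed/green/my-dataset/version=1/edition=20200601T090000/
# and each object gets a presigned GET URL valid for five minutes (60 * 5).
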
def generate_signed_url(event, context):
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient.with_access_token_from_event(event)
    if not client:
        return error_response(403, "Forbidden")
    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(
            500, "Could not complete request, please try again later"
        )
    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")
    # Only users with read access may download non-public datasets.
    if (
        dataset["accessRights"] != "public"
        and ENABLE_AUTH
        and not resource_authorizer.has_access(
            client.access_token,
            scope="okdata:dataset:read",
            resource_name=f"okdata:dataset:{dataset_id}",
        )
    ):
        log_add(has_access=False)
        return error_response(403, "Forbidden")
    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)
    return response(200, json.dumps(signed_urls))

def get_dtype(schema, input):
    """Try to resolve the dtype of each column before reading a CSV file.

    The dtype is resolved from `taskConfig.TASK_NAME.schema`. If no schema is
    available, we instead read the first line (the column headers) of the
    file that is about to be read by pandas, and set each column to the
    default type, `object`.
    """
    log_add(dtype_source="jsonschema")
    dtype = Exporter.jsonschema_to_dtypes(schema)
    if dtype is None:
        log_add(dtype_source=f"input:{input}")
        dtype = Exporter.get_dtype_from_input(input)
    log_add(dtype=dtype)
    return dtype

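# A hedged illustration of the schema-to-dtype resolution described in the
# docstring; the exact mapping lives in `Exporter.jsonschema_to_dtypes` and
# the values below are assumptions:
# {"properties": {"id": {"type": "integer"}, "name": {"type": "string"}}}
#   -> {"id": "int64", "name": "object"}
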
def _with_error(config: Config, errors):
    log_add(errors=errors)
    log_add(status=Status.VALIDATION_FAILED.value)
    config.payload.step_data.status = Status.VALIDATION_FAILED.value
    config.payload.step_data.errors = errors[:100]
    return asdict(config.payload.step_data)

def _dataset_components_from_event(event):
    pp = event["pathParameters"]
    dataset_id, version_id, edition_id = pp["dataset"], pp["version"], pp["edition"]
    log_add(dataset_id=dataset_id, version_id=version_id, edition_id=edition_id)
    return dataset_id, version_id, edition_id

def validate_schema_version(self, schema):
    schema_version = schema["$schema"]
    log_add(schema_version=schema_version)
    if schema_version not in SCHEMA_SUPPORTED_VERSIONS:
        raise ValueError(f"Schema version: {schema_version} is not supported")

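# Usage sketch: SCHEMA_SUPPORTED_VERSIONS is defined elsewhere and might,
# for example, include "http://json-schema.org/draft-07/schema#". A schema
# whose "$schema" value is not in that list raises ValueError:
# validator.validate_schema_version({"$schema": "http://example.com/v0"})
# -> ValueError: Schema version: http://example.com/v0 is not supported
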
def __init__(self, event):
    self.s3 = boto3.client("s3")
    self.s3fs_prefix = f"s3://{BUCKET}/"
    self.config = Config.from_lambda_event(event)
    self.task_config = TaskConfig.from_config(self.config)
    log_add(input_config=asdict(self.task_config))

def read_root():
    log_add(hello="world")
    return {"hello": "world"}

def read_error():
    log_add(hello="error")
    raise Exception("This is wrong!")

def is_latest_edition(dataset_id, version, edition):
    latest_edition = dataset_client.get_latest_edition(dataset_id, version)
    is_latest = [dataset_id, version, edition] == latest_edition["Id"].split("/")
    log_add(is_latest_edition=is_latest)
    return is_latest

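# Usage sketch, assuming `get_latest_edition` returns a dict whose "Id" has
# the form "<dataset_id>/<version>/<edition>" (values are illustrative):
# is_latest_edition("my-dataset", "1", "20200601T090000")
# -> True when the latest edition Id is "my-dataset/1/20200601T090000"
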
def write_s3(event, context):
    config = Config.from_lambda_event(event)
    task_config = TaskConfig.from_dict(config.task_config)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    content_type = task_config.content_type
    log_add(
        dataset_id=output_dataset.id,
        version=output_dataset.version,
        edition_id=output_dataset.edition,
        source_prefixes=step_data.s3_input_prefixes,
        write_to_latest=task_config.write_to_latest,
        output_stage=task_config.output_stage,
    )
    if content_type:
        log_add(content_type=content_type)
    status_add(
        domain="dataset",
        domain_id=f"{output_dataset.id}/{output_dataset.version}",
        operation=config.task,
    )
    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ", step_data.input_count)
    source_prefix = next(iter(step_data.s3_input_prefixes.values()))
    output_prefix = config.payload.output_dataset.s3_prefix.replace(
        "%stage%", task_config.output_stage
    )
    s3_sources = s3_service.resolve_s3_sources(source_prefix)
    copied_files = copy_data(s3_sources, output_prefix)
    if task_config.output_stage == "processed":
        try:
            create_distribution_with_retries(
                output_dataset, copied_files, content_type
            )
        except Exception as e:
            s3_service.delete_from_prefix(output_prefix)
            log_exception(e)
            raise DistributionNotCreated
    if task_config.write_to_latest and is_latest_edition(
        output_dataset.id, output_dataset.version, output_dataset.edition
    ):
        write_data_to_latest(s3_sources, output_prefix)
    output_prefixes = {output_dataset.id: output_prefix}
    response = StepData(s3_input_prefixes=output_prefixes, status="OK", errors=[])
    # TODO: This is just to verify that we have a correct implementation of
    # the status API. Temporary: if we are in a /latest write, set the run to
    # complete. Once this is up and we see what the status API can return to
    # the CLI, we will update it with more information.
    status_body = {
        "files": [s3_source.key for s3_source in s3_sources],
        "latest": task_config.write_to_latest,
    }
    status_add(status_body=status_body)
    return asdict(response)

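# Illustration of the "%stage%" substitution above, assuming an s3_prefix
# template like "%stage%/green/my-dataset/version=1/edition=20200601T090000/"
# (the template shape is an assumption): with output_stage "processed", the
# output prefix becomes
#   processed/green/my-dataset/version=1/edition=20200601T090000/
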
def __init__(self):
    self.client = boto3.client("s3")
    # `self.bucket` is assumed to be set as a class attribute elsewhere.
    log_add(s3_bucket=self.bucket)