Example #1
    @classmethod
    def with_access_token_from_event(cls, event):
        """Build an authenticated client from a Lambda proxy event."""
        try:
            auth_type, access_token = event["headers"]["Authorization"].split(" ")
            if auth_type.lower() == "bearer":
                return cls(access_token)
            raise ValueError(f'Expected auth type "Bearer", got "{auth_type}"')
        except (KeyError, ValueError) as e:
            # A missing Authorization header (KeyError) is handled the same
            # way as a malformed one: log it and fall through to None.
            log_exception(e)
        return None
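
Example #4 below shows how this constructor is used in practice. As a minimal
sketch, assuming an API Gateway proxy event (the token value is a placeholder):

event = {"headers": {"Authorization": "Bearer <access-token>"}}

client = APIClient.with_access_token_from_event(event)
if client is None:
    # The header was missing or malformed; the error has already been logged.
    ...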
Example #2
    def copy(self, s3_sources, output_prefix, retries=3):
        """Copy each source object to `output_prefix`, retrying failures.

        Sources that fail are collected and retried as a batch, up to
        `retries` extra rounds; anything still failing after that raises
        IncompleteTransaction.
        """
        failed_s3_sources = []
        for s3_source in s3_sources:
            try:
                self.client.copy_object(
                    Bucket=self.bucket,
                    Key=output_prefix + s3_source.filename,
                    CopySource={"Key": s3_source.key, "Bucket": self.bucket},
                )
            except Exception as e:
                failed_s3_sources.append(s3_source)
                log_exception(e)

        if failed_s3_sources:
            if retries > 0:
                self.copy(failed_s3_sources, output_prefix, retries - 1)
            else:
                raise IncompleteTransaction
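
A hypothetical call sketch. The real source type isn't shown above, so a
stand-in exposing the two attributes the method reads (.key and .filename) is
assumed, along with an s3_service-style instance as in example #5:

from collections import namedtuple

# Stand-in for the real S3 source type; only these attributes are used.
S3Source = namedtuple("S3Source", ["key", "filename"])

sources = [S3Source(key="raw/2024/data.csv", filename="data.csv")]
s3_service.copy(sources, "processed/2024/")
# Sources still failing after all retry rounds raise IncompleteTransaction.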
Example #3
def generate_signed_url_public(event, context):
    """Lambda handler: generate pre-signed URLs for a public dataset edition."""
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient()

    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(500, "Could not complete request, please try again later")

    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")

    if dataset["accessRights"] != "public":
        return error_response(403, "Forbidden")

    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)
    return response(200, json.dumps(signed_urls))
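
A hedged invocation sketch. The event layout depends on
_dataset_components_from_event, assumed here to read API Gateway path
parameters; all identifier values are illustrative:

event = {
    "pathParameters": {
        "dataset": "my-dataset",
        "version": "1",
        "edition": "20240101T120000",
    }
}

result = generate_signed_url_public(event, context=None)
# response()/error_response() are assumed to return Lambda proxy dicts, so
# result["statusCode"] would be 200 on success and 403/404/500 otherwise.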
Example #4
def generate_signed_url(event, context):
    """Lambda handler: generate pre-signed URLs, enforcing dataset access."""
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient.with_access_token_from_event(event)

    if not client:
        return error_response(403, "Forbidden")

    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(500, "Could not complete request, please try again later")

    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")

    # Only users with read access may download non-public datasets.
    if (
        dataset["accessRights"] != "public"
        and ENABLE_AUTH
        and not resource_authorizer.has_access(
            client.access_token,
            scope="okdata:dataset:read",
            resource_name=f"okdata:dataset:{dataset_id}",
        )
    ):
        log_add(has_access=False)
        return error_response(403, "Forbidden")

    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)
    return response(200, json.dumps(signed_urls))
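
The authenticated variant differs from example #3 in two ways: the client is
built from the caller's bearer token (example #1), and non-public datasets
additionally require the okdata:dataset:read scope. A sketch under the same
event-shape assumptions as above:

event = {
    "headers": {"Authorization": "Bearer <access-token>"},
    "pathParameters": {
        "dataset": "restricted-set",
        "version": "1",
        "edition": "latest",
    },
}

result = generate_signed_url(event, context=None)
# Expect 403 when the token is missing or invalid, or, with ENABLE_AUTH set,
# when the caller lacks okdata:dataset:read on okdata:dataset:restricted-set.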
Example #5
def write_s3(event, context):
    """Pipeline task: copy a dataset step's input data to the output stage."""
    config = Config.from_lambda_event(event)
    task_config = TaskConfig.from_dict(config.task_config)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    content_type = task_config.content_type

    log_add(
        dataset_id=output_dataset.id,
        version=output_dataset.version,
        edition_id=output_dataset.edition,
        source_prefixes=step_data.s3_input_prefixes,
        write_to_latest=task_config.write_to_latest,
        output_stage=task_config.output_stage,
    )
    if content_type:
        log_add(content_type=content_type)

    status_add(
        domain="dataset",
        domain_id=f"{output_dataset.id}/{output_dataset.version}",
        operation=config.task,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ",
                           step_data.input_count)

    source_prefix = next(iter(step_data.s3_input_prefixes.values()))
    output_prefix = config.payload.output_dataset.s3_prefix.replace(
        "%stage%", task_config.output_stage)

    s3_sources = s3_service.resolve_s3_sources(source_prefix)
    copied_files = copy_data(s3_sources, output_prefix)

    if task_config.output_stage == "processed":
        try:
            create_distribution_with_retries(output_dataset, copied_files,
                                             content_type)
        except Exception as e:
            # Roll back already-copied objects before propagating the failure.
            s3_service.delete_from_prefix(output_prefix)
            log_exception(e)
            raise DistributionNotCreated from e

    if task_config.write_to_latest and is_latest_edition(
            output_dataset.id, output_dataset.version, output_dataset.edition):
        write_data_to_latest(s3_sources, output_prefix)

    output_prefixes = {output_dataset.id: output_prefix}
    response = StepData(s3_input_prefixes=output_prefixes,
                        status="OK",
                        errors=[])

    # TODO: Temporary, to verify that the status API implementation is
    # correct: when writing to /latest, mark the run as complete. Extend
    # with more information once we see what the status API can return to
    # the CLI.
    status_body = {
        "files": [s3_source.key for s3_source in s3_sources],
        "latest": task_config.write_to_latest,
    }
    status_add(status_body=status_body)
    return asdict(response)
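
create_distribution_with_retries is not shown above. A plausible sketch,
assuming it wraps a single create_distribution call (hypothetical name) in a
retry loop, mirroring the retry pattern of example #2:

import time

def create_distribution_with_retries(dataset, files, content_type, retries=3):
    # Hypothetical sketch: retry transient failures with exponential backoff.
    for attempt in range(retries + 1):
        try:
            return create_distribution(dataset, files, content_type)
        except Exception:
            if attempt == retries:
                raise
            time.sleep(2 ** attempt)

Whatever the real implementation looks like, write_s3 treats any failure here
as fatal: it deletes everything already copied under output_prefix before
raising DistributionNotCreated, so the "processed" stage is never left
half-written.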