@classmethod
def with_access_token_from_event(cls, event):
    """Build a client from the Bearer token in a Lambda proxy event, or return None."""
    try:
        auth_type, access_token = event["headers"]["Authorization"].split(" ")
        if auth_type.lower() == "bearer":
            return cls(access_token)
        raise ValueError(f'Expected auth type "Bearer", got "{auth_type}"')
    except ValueError as e:
        # Covers both a malformed header (wrong number of parts) and a
        # non-Bearer scheme; the caller treats None as "not authenticated".
        log_exception(e)
        return None
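
# Hypothetical usage sketch (not part of the original module): the event shape
# follows the Lambda proxy-integration format, and `APIClient` is assumed to be
# the class this classmethod lives on, based on the call in generate_signed_url.
def _example_client_from_event():
    event = {"headers": {"Authorization": "Bearer eyJhbGciOi..."}}
    client = APIClient.with_access_token_from_event(event)
    # -> an APIClient instance when the scheme is "Bearer"

    bad_event = {"headers": {"Authorization": "Basic dXNlcjpwYXNz"}}
    assert APIClient.with_access_token_from_event(bad_event) is None
    return client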
def copy(self, s3_sources, output_prefix, retries=3):
    """Copy each source object to `output_prefix`, retrying the ones that fail."""
    failed_s3_sources = []

    for s3_source in s3_sources:
        try:
            self.client.copy_object(
                Bucket=self.bucket,
                Key=output_prefix + s3_source.filename,
                CopySource={"Key": s3_source.key, "Bucket": self.bucket},
            )
        except Exception as e:
            failed_s3_sources.append(s3_source)
            log_exception(e)

    if failed_s3_sources:
        if retries > 0:
            # Retry only the sources that failed, with one fewer retry left.
            self.copy(failed_s3_sources, output_prefix, retries - 1)
        else:
            raise IncompleteTransaction
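
# Hypothetical usage sketch exercising the retry path with a stubbed boto3
# client. `S3Service` (a wrapper owning `self.client` and `self.bucket`) and
# the `S3Source(key=..., filename=...)` constructor are assumptions inferred
# from the call sites in this section.
from unittest.mock import MagicMock

def _example_copy_retries():
    service = S3Service()  # assumed constructor
    service.client = MagicMock()
    service.client.copy_object.side_effect = Exception("transient S3 error")
    sources = [S3Source(key="raw/green/data.csv", filename="data.csv")]  # assumed signature

    try:
        service.copy(sources, "processed/green/")
    except IncompleteTransaction:
        # Every attempt failed, so once the retries are exhausted the
        # incomplete transaction is surfaced to the caller.
        pass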
def generate_signed_url_public(event, context):
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient()

    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(
            500, "Could not complete request, please try again later"
        )

    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")

    # The unauthenticated endpoint only serves datasets marked as public.
    if dataset["accessRights"] != "public":
        return error_response(403, "Forbidden")

    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)

    return response(200, json.dumps(signed_urls))
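
# The `response` and `error_response` helpers are not shown in this section; a
# minimal sketch of what they are assumed to look like, building standard
# Lambda proxy responses:
import json

def response(status_code, body):
    return {
        "statusCode": status_code,
        "headers": {"Content-Type": "application/json"},
        "body": body,
    }

def error_response(status_code, message):
    return response(status_code, json.dumps({"message": message}))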
def generate_signed_url(event, context):
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)

    client = APIClient.with_access_token_from_event(event)
    if not client:
        return error_response(403, "Forbidden")

    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(
            500, "Could not complete request, please try again later"
        )

    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")

    # Only users with read access may download non-public datasets.
    if (
        dataset["accessRights"] != "public"
        and ENABLE_AUTH
        and not resource_authorizer.has_access(
            client.access_token,
            scope="okdata:dataset:read",
            resource_name=f"okdata:dataset:{dataset_id}",
        )
    ):
        log_add(has_access=False)
        return error_response(403, "Forbidden")

    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)

    return response(200, json.dumps(signed_urls))
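
# Hypothetical invocation sketch (not from the original code): a request whose
# Authorization header does not carry a Bearer token is rejected with 403
# before any dataset lookup happens. The path parameter names are assumptions
# about what _dataset_components_from_event consumes, and the asserted response
# shape follows the error_response sketch above.
def _example_rejects_non_bearer_auth():
    event = {
        "headers": {"Authorization": "Basic dXNlcjpwYXNz"},
        "pathParameters": {
            "dataset": "my-dataset",
            "version": "1",
            "edition": "20230101T120000",
        },
    }
    result = generate_signed_url(event, context=None)
    assert result["statusCode"] == 403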
def write_s3(event, context):
    config = Config.from_lambda_event(event)
    task_config = TaskConfig.from_dict(config.task_config)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    content_type = task_config.content_type

    log_add(
        dataset_id=output_dataset.id,
        version=output_dataset.version,
        edition_id=output_dataset.edition,
        source_prefixes=step_data.s3_input_prefixes,
        write_to_latest=task_config.write_to_latest,
        output_stage=task_config.output_stage,
    )
    if content_type:
        log_add(content_type=content_type)

    status_add(
        domain="dataset",
        domain_id=f"{output_dataset.id}/{output_dataset.version}",
        operation=config.task,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ", step_data.input_count)

    source_prefix = next(iter(step_data.s3_input_prefixes.values()))
    output_prefix = config.payload.output_dataset.s3_prefix.replace(
        "%stage%", task_config.output_stage
    )

    s3_sources = s3_service.resolve_s3_sources(source_prefix)
    copied_files = copy_data(s3_sources, output_prefix)

    if task_config.output_stage == "processed":
        try:
            create_distribution_with_retries(output_dataset, copied_files, content_type)
        except Exception as e:
            # Roll back the copied objects if the distribution can't be registered.
            s3_service.delete_from_prefix(output_prefix)
            log_exception(e)
            raise DistributionNotCreated

    if task_config.write_to_latest and is_latest_edition(
        output_dataset.id, output_dataset.version, output_dataset.edition
    ):
        write_data_to_latest(s3_sources, output_prefix)

    output_prefixes = {output_dataset.id: output_prefix}

    response = StepData(s3_input_prefixes=output_prefixes, status="OK", errors=[])

    # TODO: This is just to verify that we have a correct implementation of the
    # status API. Temporary: if we are in a /latest write, set the run to complete.
    # Once we get this up and see what the status API can return to the CLI, we
    # will update with more information.
    status_body = {
        "files": [s3_source.key for s3_source in s3_sources],
        "latest": task_config.write_to_latest,
    }
    status_add(status_body=status_body)

    return asdict(response)
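
# Hedged sketch (not from the original code) of the dict shape write_s3 returns,
# assuming StepData is a dataclass with exactly the three fields set above. The
# dataset id and prefix values are illustrative only.
def _example_write_s3_result():
    result = asdict(
        StepData(
            s3_input_prefixes={"my-dataset": "processed/green/my-dataset/1/20230101T120000/"},
            status="OK",
            errors=[],
        )
    )
    # result == {
    #     "s3_input_prefixes": {"my-dataset": "processed/green/my-dataset/1/20230101T120000/"},
    #     "status": "OK",
    #     "errors": [],
    # }
    return result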