def generate_download(download_job: DownloadJob, origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message=f"Generating {file_name}", download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)

        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(
                    source, download_job, working_dir, piid, assistance_id, zip_file_path, file_format
                )
            else:
                download_job.number_of_columns += source_column_count
                parse_source(
                    source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, file_format
                )

        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)

        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description = file_description.replace("[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description
            )
            append_files_to_zip_file([file_description_path], zip_file_path)

        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
            write_to_log(
                message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", download_job=download_job
            )
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)
        _kill_spawned_processes(download_job)

    return finish_download(download_job)
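
# For context: a hypothetical sketch of how a queue worker could drive
# generate_download(). The project's real dispatcher is download_sqs_worker.handle()
# (referenced in the comments above); the queue client, message shape, and the
# download_job_id lookup below are illustrative assumptions, not project code.
def _example_worker_iteration(queue):
    message = queue.receive()  # hypothetical queue client call
    if message is None:
        return
    download_job = DownloadJob.objects.get(download_job_id=int(message.body))
    try:
        # generate_download() calls fail_download() itself before re-raising,
        # so the worker only decides whether to retry or surrender the message
        generate_download(download_job)
    except Exception:
        message.release()  # hypothetical: make the job visible again for retry
        raise
    message.delete()  # hypothetical: acknowledge successful completion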
def download(self, award_type, agency="all", generate_since=None):
    """Create a delta file based on award_type and agency_code (or all agencies)"""
    logger.info(
        "Starting generation. {}, Agency: {}".format(award_type, agency if agency == "all" else agency["name"])
    )
    award_map = AWARD_MAPPINGS[award_type]

    # Create Source and update fields to include correction_delete_ind
    source = DownloadSource(
        "transaction",
        award_map["letter_name"].lower(),
        "transactions",
        "all" if agency == "all" else agency["toptier_agency_id"],
    )
    source.query_paths.update({"correction_delete_ind": award_map["correction_delete_ind"]})
    if award_type == "Contracts":
        # Add the agency_id column to the mappings
        source.query_paths.update({"agency_id": "transaction__contract_data__agency_id"})
        source.query_paths.move_to_end("agency_id", last=False)
    source.query_paths.move_to_end("correction_delete_ind", last=False)
    source.human_names = list(source.query_paths.keys())

    # Apply filters to the queryset
    filters, agency_code = self.parse_filters(award_map["award_types"], agency)
    source.queryset = VALUE_MAPPINGS["transactions"]["filter_function"](filters)

    if award_type == "Contracts":
        source.queryset = source.queryset.annotate(
            correction_delete_ind=Case(
                When(transaction__contract_data__created_at__lt=generate_since, then=Value("C")),
                default=Value(""),
                output_field=CharField(),
            )
        )
    else:
        indicator_field = F("transaction__assistance_data__correction_delete_indicatr")
        source.queryset = source.queryset.annotate(
            correction_delete_ind=Case(
                When(transaction__assistance_data__updated_at__gt=generate_since, then=indicator_field),
                When(transaction__transactiondelta__isnull=False, then=Value("C")),
                default=indicator_field,
                output_field=CharField(),
            )
        )

    transaction_delta_queryset = source.queryset

    _filter = {"transaction__{}__{}__gte".format(award_map["model"], award_map["date_filter"]): generate_since}
    if self.debugging_end_date:
        _filter[
            "transaction__{}__{}__lt".format(award_map["model"], award_map["date_filter"])
        ] = self.debugging_end_date
    source.queryset = source.queryset.filter(**_filter)

    # UNION the normal results to the transaction_delta results.
    source.queryset = source.queryset.union(
        transaction_delta_queryset.filter(transaction__transactiondelta__isnull=False)
    )

    # Generate file
    file_path = self.create_local_file(award_type, source, agency_code, generate_since)
    if file_path is None:
        logger.info("No new, modified, or deleted data; discarding file")
    elif not settings.IS_LOCAL:
        # Upload file to S3 and delete local version
        logger.info("Uploading file to S3 bucket and deleting local copy")
        multipart_upload(
            settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME,
            settings.USASPENDING_AWS_REGION,
            file_path,
            os.path.basename(file_path),
        )
        os.remove(file_path)

    logger.info(
        "Finished generation. {}, Agency: {}".format(award_type, agency if agency == "all" else agency["name"])
    )
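
# The correction_delete_ind annotations above use Django's standard conditional
# expressions (Case/When). Below is a self-contained toy illustration of the same
# idiom; the Ticket model and its fields are invented for this example and are
# not part of the project.
from django.db import models
from django.db.models import Case, CharField, F, Value, When


class Ticket(models.Model):
    updated_at = models.DateTimeField()
    is_dirty = models.BooleanField(default=False)
    stored_indicator = models.CharField(max_length=1, blank=True)

    class Meta:
        app_label = "example"  # required for a model defined outside an app


def annotate_change_flag(cutoff):
    return Ticket.objects.annotate(
        change_flag=Case(
            # Rows updated after the cutoff keep their stored indicator value
            When(updated_at__gt=cutoff, then=F("stored_indicator")),
            # Rows explicitly marked dirty are flagged as corrections ("C")
            When(is_dirty=True, then=Value("C")),
            # Everything else also falls back to the stored indicator
            default=F("stored_indicator"),
            output_field=CharField(),
        )
    )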
def upload_placeholder(self, file_name, empty_file):
    bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
    region = settings.USASPENDING_AWS_REGION

    logger.info("Uploading {}".format(file_name))
    multipart_upload(bucket, region, empty_file, file_name)
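
# multipart_upload() is imported from elsewhere in the project. A plausible
# boto3-based implementation is sketched below as an assumption, not the
# project's actual helper: boto3's upload_file() switches to a multipart
# upload automatically once the file exceeds TransferConfig.multipart_threshold.
import boto3
from boto3.s3.transfer import TransferConfig


def multipart_upload_sketch(bucket, region, file_path, file_name):
    client = boto3.client("s3", region_name=region)
    config = TransferConfig(
        multipart_threshold=8 * 1024 * 1024,  # go multipart above 8 MiB
        multipart_chunksize=8 * 1024 * 1024,  # upload in 8 MiB parts
    )
    client.upload_file(file_path, bucket, file_name, Config=config)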
def generate_download(download_job: DownloadJob, origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")
    request_type = json_request.get("request_type")

    span = tracer.current_span()
    if span and request_type:
        span.resource = request_type

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message=f"Generating {file_name}", download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)

        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(
                    source, download_job, working_dir, piid, assistance_id, zip_file_path, file_format
                )
            else:
                download_job.number_of_columns += source_column_count
                parse_source(
                    source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, file_format
                )

        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)

        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description = file_description.replace("[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description
            )
            append_files_to_zip_file([file_description_path], zip_file_path)

        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    # push file to S3 bucket, if not local
    if not settings.IS_LOCAL:
        with tracer.trace(
            name=f"job.{JOB_TYPE}.download.s3",
            service="bulk-download",
            resource=f"s3://{settings.BULK_DOWNLOAD_S3_BUCKET_NAME}",
            span_type=SpanTypes.WORKER,
        ) as span, tracer.trace(
            name="s3.command",
            service="aws.s3",
            resource=".".join(
                [multipart_upload.__module__, (multipart_upload.__qualname__ or multipart_upload.__name__)]
            ),
            span_type=SpanTypes.WEB,
        ) as s3_span:
            # NOTE: Traces still not auto-picking-up aws.s3 service upload activity
            # Could be that the patches for boto and botocore don't cover the newer boto3 S3Transfer upload approach
            span.set_tag("file_name", file_name)
            try:
                bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
                region = settings.USASPENDING_AWS_REGION
                s3_span.set_tags({"bucket": bucket, "region": region, "file": zip_file_path})
                start_uploading = time.perf_counter()
                multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
                write_to_log(
                    message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", download_job=download_job
                )
            except Exception as e:
                # Set error message; job_status_id will be set in download_sqs_worker.handle()
                exc_msg = "An exception was raised while attempting to upload the file"
                fail_download(download_job, e, exc_msg)
                if isinstance(e, InvalidParameterException):
                    raise InvalidParameterException(e)
                else:
                    raise Exception(download_job.error_message) from e
            finally:
                # Remove generated file
                if os.path.exists(zip_file_path):
                    os.remove(zip_file_path)
                _kill_spawned_processes(download_job)

    return finish_download(download_job)
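
# The nested spans above follow ddtrace's context-manager idiom. A minimal
# standalone sketch of the same pattern follows; the span/service/resource
# names and do_upload() are illustrative assumptions, not the project's
# conventions or actual code.
from ddtrace import tracer


def upload_with_trace(path):
    with tracer.trace(name="job.download.s3", service="bulk-download", resource=path) as span:
        span.set_tag("file_name", path)
        do_upload(path)  # hypothetical upload call timed by the enclosing span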