def generate_download(download_job: DownloadJob, origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message=f"Generating {file_name}", download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)

        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(
                    source, download_job, working_dir, piid, assistance_id, zip_file_path, file_format
                )
            else:
                download_job.number_of_columns += source_column_count
                parse_source(
                    source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, file_format
                )

        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)

        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description = file_description.replace("[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description
            )
            append_files_to_zip_file([file_description_path], zip_file_path)

        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
            write_to_log(
                message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", download_job=download_job
            )
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)
        _kill_spawned_processes(download_job)

    return finish_download(download_job)
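
# For context: a hypothetical sketch of how a queue worker could drive
# generate_download(). The project's real dispatcher is download_sqs_worker.handle()
# (referenced in the comments above); the queue client, message shape, and the
# download_job_id lookup below are illustrative assumptions, not project code.
def _example_worker_iteration(queue):
    message = queue.receive()  # hypothetical queue client call
    if message is None:
        return
    download_job = DownloadJob.objects.get(download_job_id=int(message.body))
    try:
        # generate_download() calls fail_download() itself before re-raising,
        # so the worker only decides whether to retry or surrender the message
        generate_download(download_job)
    except Exception:
        message.release()  # hypothetical: make the job visible again for retry
        raise
    message.delete()  # hypothetical: acknowledge successful completion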
def download(self, award_type, agency="all", generate_since=None):
    """Create a delta file based on award_type and agency_code (or all agencies)"""
    logger.info(
        "Starting generation. {}, Agency: {}".format(award_type, agency if agency == "all" else agency["name"])
    )
    award_map = AWARD_MAPPINGS[award_type]

    # Create Source and update fields to include correction_delete_ind
    source = DownloadSource(
        "transaction",
        award_map["letter_name"].lower(),
        "transactions",
        "all" if agency == "all" else agency["toptier_agency_id"],
    )
    source.query_paths.update({"correction_delete_ind": award_map["correction_delete_ind"]})
    if award_type == "Contracts":
        # Add the agency_id column to the mappings
        source.query_paths.update({"agency_id": "transaction__contract_data__agency_id"})
        source.query_paths.move_to_end("agency_id", last=False)
    source.query_paths.move_to_end("correction_delete_ind", last=False)
    source.human_names = list(source.query_paths.keys())

    # Apply filters to the queryset
    filters, agency_code = self.parse_filters(award_map["award_types"], agency)
    source.queryset = VALUE_MAPPINGS["transactions"]["filter_function"](filters)

    if award_type == "Contracts":
        source.queryset = source.queryset.annotate(
            correction_delete_ind=Case(
                When(transaction__contract_data__created_at__lt=generate_since, then=Value("C")),
                default=Value(""),
                output_field=CharField(),
            )
        )
    else:
        indicator_field = F("transaction__assistance_data__correction_delete_indicatr")
        source.queryset = source.queryset.annotate(
            correction_delete_ind=Case(
                When(transaction__assistance_data__updated_at__gt=generate_since, then=indicator_field),
                When(transaction__transactiondelta__isnull=False, then=Value("C")),
                default=indicator_field,
                output_field=CharField(),
            )
        )

    transaction_delta_queryset = source.queryset

    _filter = {"transaction__{}__{}__gte".format(award_map["model"], award_map["date_filter"]): generate_since}
    if self.debugging_end_date:
        _filter[
            "transaction__{}__{}__lt".format(award_map["model"], award_map["date_filter"])
        ] = self.debugging_end_date
    source.queryset = source.queryset.filter(**_filter)

    # UNION the normal results to the transaction_delta results.
    source.queryset = source.queryset.union(
        transaction_delta_queryset.filter(transaction__transactiondelta__isnull=False)
    )

    # Generate file
    file_path = self.create_local_file(award_type, source, agency_code, generate_since)
    if file_path is None:
        logger.info("No new, modified, or deleted data; discarding file")
    elif not settings.IS_LOCAL:
        # Upload file to S3 and delete local version
        logger.info("Uploading file to S3 bucket and deleting local copy")
        multipart_upload(
            settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME,
            settings.USASPENDING_AWS_REGION,
            file_path,
            os.path.basename(file_path),
        )
        os.remove(file_path)

    logger.info(
        "Finished generation. {}, Agency: {}".format(award_type, agency if agency == "all" else agency["name"])
    )
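
# The correction_delete_ind annotations above use Django's standard conditional
# expressions (Case/When). Below is a self-contained toy illustration of the same
# idiom; the Ticket model and its fields are invented for this example and are
# not part of the project.
from django.db import models
from django.db.models import Case, CharField, F, Value, When


class Ticket(models.Model):
    updated_at = models.DateTimeField()
    is_dirty = models.BooleanField(default=False)
    stored_indicator = models.CharField(max_length=1, blank=True)

    class Meta:
        app_label = "example"  # required for a model defined outside an app


def annotate_change_flag(cutoff):
    return Ticket.objects.annotate(
        change_flag=Case(
            # Rows updated after the cutoff keep their stored indicator value
            When(updated_at__gt=cutoff, then=F("stored_indicator")),
            # Rows explicitly marked dirty are flagged as corrections ("C")
            When(is_dirty=True, then=Value("C")),
            # Everything else also falls back to the stored indicator
            default=F("stored_indicator"),
            output_field=CharField(),
        )
    )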
def upload_placeholder(self, file_name, empty_file):
    bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
    region = settings.USASPENDING_AWS_REGION

    logger.info("Uploading {}".format(file_name))
    multipart_upload(bucket, region, empty_file, file_name)
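
# multipart_upload() is imported from elsewhere in the project. A plausible
# boto3-based implementation is sketched below as an assumption, not the
# project's actual helper: boto3's upload_file() switches to a multipart
# upload automatically once the file exceeds TransferConfig.multipart_threshold.
import boto3
from boto3.s3.transfer import TransferConfig


def multipart_upload_sketch(bucket, region, file_path, file_name):
    client = boto3.client("s3", region_name=region)
    config = TransferConfig(
        multipart_threshold=8 * 1024 * 1024,  # go multipart above 8 MiB
        multipart_chunksize=8 * 1024 * 1024,  # upload in 8 MiB parts
    )
    client.upload_file(file_path, bucket, file_name, Config=config)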
def generate_download(download_job: DownloadJob, origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")
    request_type = json_request.get("request_type")

    span = tracer.current_span()
    if span and request_type:
        span.resource = request_type

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message=f"Generating {file_name}", download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)

        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(
                    source, download_job, working_dir, piid, assistance_id, zip_file_path, file_format
                )
            else:
                download_job.number_of_columns += source_column_count
                parse_source(
                    source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, file_format
                )

        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)

        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description = file_description.replace("[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description
            )
            append_files_to_zip_file([file_description_path], zip_file_path)

        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    # push file to S3 bucket, if not local
    if not settings.IS_LOCAL:
        with tracer.trace(
            name=f"job.{JOB_TYPE}.download.s3",
            service="bulk-download",
            resource=f"s3://{settings.BULK_DOWNLOAD_S3_BUCKET_NAME}",
            span_type=SpanTypes.WORKER,
        ) as span, tracer.trace(
            name="s3.command",
            service="aws.s3",
            resource=".".join(
                [multipart_upload.__module__, (multipart_upload.__qualname__ or multipart_upload.__name__)]
            ),
            span_type=SpanTypes.WEB,
        ) as s3_span:
            # NOTE: Traces still not auto-picking-up aws.s3 service upload activity
            # Could be that the patches for boto and botocore don't cover the newer boto3 S3Transfer upload approach
            span.set_tag("file_name", file_name)
            try:
                bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
                region = settings.USASPENDING_AWS_REGION
                s3_span.set_tags({"bucket": bucket, "region": region, "file": zip_file_path})
                start_uploading = time.perf_counter()
                multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
                write_to_log(
                    message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", download_job=download_job
                )
            except Exception as e:
                # Set error message; job_status_id will be set in download_sqs_worker.handle()
                exc_msg = "An exception was raised while attempting to upload the file"
                fail_download(download_job, e, exc_msg)
                if isinstance(e, InvalidParameterException):
                    raise InvalidParameterException(e)
                else:
                    raise Exception(download_job.error_message) from e
            finally:
                # Remove generated file
                if os.path.exists(zip_file_path):
                    os.remove(zip_file_path)
                _kill_spawned_processes(download_job)

    return finish_download(download_job)
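
# The nested spans above follow ddtrace's context-manager idiom. A minimal
# standalone sketch of the same pattern follows; the span/service/resource
# names and do_upload() are illustrative assumptions, not the project's
# conventions or actual code.
from ddtrace import tracer


def upload_with_trace(path):
    with tracer.trace(name="job.download.s3", service="bulk-download", resource=path) as span:
        span.set_tag("file_name", path)
        do_upload(path)  # hypothetical upload call timed by the enclosing span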