def test_save_file_description():
    with NamedTemporaryFile() as f:
        working_dir, file_name = os.path.split(f.name)
        file_path = save_file_description(working_dir, file_name, "this is a test")
        assert file_path == f.name
        assert f.read().decode("utf-8") == "this is a test"

    with NamedTemporaryFile() as f:
        working_dir, file_name = os.path.split(f.name)
        file_path = save_file_description(working_dir, file_name, "this\ris\na\r\ntest")
        assert file_path == f.name
        assert f.read().decode("utf-8") == "this\r\nis\r\na\r\ntest"
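# The second assertion above implies that save_file_description normalizes bare "\r"
# and "\n" line endings to "\r\n" before writing. A minimal sketch of an implementation
# consistent with that behavior (hypothetical, inferred from the test alone; the
# underscored name marks it as an illustration, not the repo's actual code):
import os
import re

def _save_file_description_sketch(working_dir, file_name, file_description):
    # Collapse \r\n, lone \r, and lone \n down to a uniform \r\n
    normalized = re.sub(r"\r\n|\r|\n", "\r\n", file_description)
    file_path = os.path.join(working_dir, file_name)
    # Write bytes so the platform's text-mode newline translation can't alter \r\n
    with open(file_path, "wb") as f:
        f.write(normalized.encode("utf-8"))
    return file_path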
def finalize_zip_contents(self):
    self.filepaths_to_delete.append(self.working_dir_path / "Data_Dictionary_Crosswalk.xlsx")
    add_data_dictionary_to_zip(str(self.zip_file_path.parent), str(self.zip_file_path))
    file_description = build_file_description(str(self.readme_path), dict())
    file_description_path = save_file_description(
        str(self.zip_file_path.parent), self.readme_path.name, file_description
    )
    self.filepaths_to_delete.append(Path(file_description_path))
    append_files_to_zip_file([file_description_path], str(self.zip_file_path))
    self.total_download_size = self.zip_file_path.stat().st_size
def generate_download(download_job: DownloadJob, origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)
        write_to_log(message=f"Generating {file_name}", download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)

        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(
                    source, download_job, working_dir, piid, assistance_id, zip_file_path, file_format
                )
            else:
                download_job.number_of_columns += source_column_count
                parse_source(
                    source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, file_format
                )
        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)
        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description = file_description.replace("[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description
            )
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
            write_to_log(
                message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", download_job=download_job
            )
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)
        _kill_spawned_processes(download_job)

    return finish_download(download_job)
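# append_files_to_zip_file, used above to add the file description to the finished
# archive, can be read as a thin wrapper over the stdlib zipfile module. A minimal
# sketch under that assumption (hypothetical helper, not necessarily the repo's
# exact implementation):
import os
import zipfile

def _append_files_to_zip_file_sketch(file_paths, zip_file_path):
    # Mode "a" appends to an existing archive; ZIP_DEFLATED compresses each entry
    with zipfile.ZipFile(zip_file_path, "a", compression=zipfile.ZIP_DEFLATED) as zip_file:
        for file_path in file_paths:
            # Store entries under their base names so the archive layout stays flat
            zip_file.write(file_path, arcname=os.path.basename(file_path))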
def generate_csvs(download_job):
    """Derive the relevant file location and write CSVs to it"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get('columns', None)
    limit = json_request.get('limit', None)
    piid = json_request.get('piid', None)

    file_name = start_download(download_job)
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)
        write_to_log(message='Generating {}'.format(file_name), download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_csv_sources(json_request)

        for source in sources:
            # Parse and write data to the file
            download_job.number_of_columns = max(download_job.number_of_columns, len(source.columns(columns)))
            parse_source(source, columns, download_job, working_dir, piid, zip_file_path, limit)
        include_file_description = json_request.get('include_file_description')
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description
            )
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if os.path.exists(working_dir):
            shutil.rmtree(working_dir)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
            write_to_log(
                message='Uploading took {} seconds'.format(time.perf_counter() - start_uploading),
                download_job=download_job,
            )
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)

    return finish_download(download_job)
def generate_download(download_job: DownloadJob, origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")
    request_type = json_request.get("request_type")

    span = tracer.current_span()
    if span and request_type:
        span.resource = request_type

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)
        write_to_log(message=f"Generating {file_name}", download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)

        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(
                    source, download_job, working_dir, piid, assistance_id, zip_file_path, file_format
                )
            else:
                download_job.number_of_columns += source_column_count
                parse_source(
                    source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, file_format
                )
        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)
        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description = file_description.replace("[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description
            )
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    # push file to S3 bucket, if not local
    if not settings.IS_LOCAL:
        with tracer.trace(
            name=f"job.{JOB_TYPE}.download.s3",
            service="bulk-download",
            resource=f"s3://{settings.BULK_DOWNLOAD_S3_BUCKET_NAME}",
            span_type=SpanTypes.WORKER,
        ) as span, tracer.trace(
            name="s3.command",
            service="aws.s3",
            resource=".".join([multipart_upload.__module__, (multipart_upload.__qualname__ or multipart_upload.__name__)]),
            span_type=SpanTypes.WEB,
        ) as s3_span:
            # NOTE: Traces still not auto-picking-up aws.s3 service upload activity.
            # Could be that the patches for boto and botocore don't cover the newer boto3 S3Transfer upload approach
            span.set_tag("file_name", file_name)
            try:
                bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
                region = settings.USASPENDING_AWS_REGION
                s3_span.set_tags({"bucket": bucket, "region": region, "file": zip_file_path})
                start_uploading = time.perf_counter()
                multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
                write_to_log(
                    message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", download_job=download_job
                )
            except Exception as e:
                # Set error message; job_status_id will be set in download_sqs_worker.handle()
                exc_msg = "An exception was raised while attempting to upload the file"
                fail_download(download_job, e, exc_msg)
                if isinstance(e, InvalidParameterException):
                    raise InvalidParameterException(e)
                else:
                    raise Exception(download_job.error_message) from e
            finally:
                # Remove generated file
                if os.path.exists(zip_file_path):
                    os.remove(zip_file_path)

    _kill_spawned_processes(download_job)
    return finish_download(download_job)
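# The NOTE above points at boto3's S3Transfer-based upload path. For context, the
# multipart_upload call can be read as a wrapper over boto3's managed transfer, which
# switches to multipart uploads automatically past a size threshold. A minimal sketch
# under that assumption (hypothetical; not the repo's actual helper):
import os
import boto3
from boto3.s3.transfer import TransferConfig

def _multipart_upload_sketch(bucket, region, source_path, key_name):
    s3_client = boto3.client("s3", region_name=region)
    # Files larger than multipart_threshold are split into multipart chunks
    transfer_config = TransferConfig(multipart_threshold=8 * 1024 * 1024)
    s3_client.upload_file(source_path, bucket, key_name, Config=transfer_config)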