def download_csv(count_sql, copy_sql, filename, job_id, skip_counts, verbose):
    # Execute Copy SQL to download records to CSV
    # It is preferable to not use shell=True, but this command works. Limited user-input so risk is low
    subprocess.Popen("psql {} -c {}".format(get_database_dsn_string(), copy_sql), shell=True).wait()
    download_count = count_rows_in_delimited_file(filename, has_header=True, safe=False)
    printf({"msg": "Wrote {} to this file: {}".format(download_count, filename), "job": job_id, "f": "Download"})

    # If --skip_counts is disabled, execute count_sql and compare this count to the download_count
    if not skip_counts:
        sql_count = execute_sql_statement(count_sql, True, verbose)[0]["count"]
        if sql_count != download_count:
            msg = "Mismatch between CSV and DB rows! Expected: {} | Actual {} in: {}"
            printf({"msg": msg.format(sql_count, download_count, filename), "job": job_id, "f": "Download"})
            raise SystemExit(1)
    else:
        printf({"msg": "Skipping count comparison checks (sql vs download)", "job": job_id, "f": "Download"})

    return download_count
def download_csv(count_sql, copy_sql, filename, job_id, skip_counts, verbose):
    # Execute Copy SQL to download records to CSV
    # It is preferable to not use shell=True, but this command works. Limited user-input so risk is low
    subprocess.Popen(f"psql {get_database_dsn_string()} -c {copy_sql}", shell=True).wait()
    download_count = count_rows_in_delimited_file(filename, has_header=True, safe=False)
    logger.info(format_log(f"Wrote {download_count:,} to this file: {filename}", job=job_id, process="Download"))

    # If --skip_counts is disabled, execute count_sql and compare this count to the download_count
    if not skip_counts:
        sql_count = execute_sql_statement(count_sql, True, verbose)[0]["count"]
        if sql_count != download_count:
            msg = f'Mismatch between CSV "{filename}" and DB!!! Expected: {sql_count:,} | Actual: {download_count:,}'
            logger.error(format_log(msg, job=job_id, process="Download"))
            raise SystemExit(1)
    else:
        logger.info(format_log("Skipping count comparison checks (sql vs download)", job=job_id, process="Download"))

    return download_count
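# The comments above note that shell=True is undesirable. A minimal sketch (not part of the
# original module) of a shell-free equivalent, assuming copy_sql is a single psql meta-command
# such as a \copy statement and that get_database_dsn_string from above is importable. Passing
# the arguments as a list avoids shell interpolation entirely; the function name is hypothetical.
import subprocess


def run_copy_without_shell_sketch(copy_sql):
    # psql receives the DSN and the \copy command as discrete arguments, so no shell quoting is needed
    return subprocess.run(["psql", get_database_dsn_string(), "-c", copy_sql], check=True)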
def create_local_file(self, award_type, source, agency_code, generate_since):
    """ Generate complete file from SQL query and S3 bucket deletion files, then zip it locally """
    logger.info("Generating CSV file with creations and modifications")

    # Create file paths and working directory
    timestamp = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S%f")
    working_dir = f"{settings.CSV_LOCAL_PATH}_{agency_code}_delta_gen_{timestamp}/"
    if not os.path.exists(working_dir):
        os.mkdir(working_dir)
    agency_str = "All" if agency_code == "all" else agency_code
    source_name = f"FY(All)_{agency_str}_{award_type}_Delta_{datetime.strftime(date.today(), '%Y%m%d')}"
    source_path = os.path.join(working_dir, "{}.csv".format(source_name))

    # Create a unique temporary file with the raw query
    raw_quoted_query = generate_raw_quoted_query(source.row_emitter(None))  # None requests all headers

    # The raw query is a union of two other queries, each in parentheses. To do replacement we need to split out
    # each query, apply annotations to each of those, then recombine in a UNION
    csv_query_annotated = (
        "("
        + apply_annotations_to_sql(_top_level_split(raw_quoted_query, "UNION")[0].strip()[1:-1], source.human_names)
        + ") UNION ("
        + apply_annotations_to_sql(_top_level_split(raw_quoted_query, "UNION")[1].strip()[1:-1], source.human_names)
        + ")"
    )

    (temp_sql_file, temp_sql_file_path) = tempfile.mkstemp(prefix="bd_sql_", dir="/tmp")

    with open(temp_sql_file_path, "w") as file:
        file.write("\\copy ({}) To STDOUT with CSV HEADER".format(csv_query_annotated))

    logger.info("Generated temp SQL file {}".format(temp_sql_file_path))

    # Generate the csv with \copy
    cat_command = subprocess.Popen(["cat", temp_sql_file_path], stdout=subprocess.PIPE)
    try:
        subprocess.check_output(
            ["psql", "-o", source_path, os.environ["DOWNLOAD_DATABASE_URL"], "-v", "ON_ERROR_STOP=1"],
            stdin=cat_command.stdout,
            stderr=subprocess.STDOUT,
        )
    except subprocess.CalledProcessError as e:
        logger.exception(e.output)
        raise e

    # Append deleted rows to the end of the file
    if not self.debugging_skip_deleted:
        self.add_deletion_records(source_path, working_dir, award_type, agency_code, source, generate_since)

    if count_rows_in_delimited_file(source_path, has_header=True, safe=True) > 0:
        # Split the CSV into multiple files and zip it up
        zipfile_path = "{}{}.zip".format(settings.CSV_LOCAL_PATH, source_name)
        logger.info("Creating compressed file: {}".format(os.path.basename(zipfile_path)))
        split_and_zip_data_files(zipfile_path, source_path, source_name, "csv")
    else:
        zipfile_path = None

    os.close(temp_sql_file)
    os.remove(temp_sql_file_path)
    shutil.rmtree(working_dir)

    return zipfile_path
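# _top_level_split is used above but not defined in this section. A minimal sketch of such a
# helper, assuming it splits a SQL string into two pieces on the first occurrence of the keyword
# that sits outside any parentheses (so a UNION inside a subquery is ignored). The name
# _top_level_split_sketch is hypothetical; this illustrates the idea, not the project's implementation.
def _top_level_split_sketch(sql, splitter):
    depth = 0
    for i, char in enumerate(sql):
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
        elif depth == 0 and sql[i : i + len(splitter)] == splitter:
            # Return the text before and after the keyword; the caller strips the outer parentheses
            return [sql[:i], sql[i + len(splitter) :]]
    raise ValueError(f"'{splitter}' not found at the top level of the statement")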
def download_csv(count_sql, copy_sql, filename, job_id, skip_counts, verbose):
    if skip_counts:
        count = None
        printf({"msg": "Skipping count checks. Writing file: {}".format(filename), "job": job_id, "f": "Download"})
    else:
        count = execute_sql_statement(count_sql, True, verbose)[0]["count"]
        printf({"msg": "Writing {} to this file: {}".format(count, filename), "job": job_id, "f": "Download"})

    # It is preferable to not use shell=True, but this command works. Limited user-input so risk is low
    subprocess.Popen('psql "${{DATABASE_URL}}" -c {}'.format(copy_sql), shell=True).wait()

    if not skip_counts:
        download_count = count_rows_in_delimited_file(filename, has_header=True, safe=False)
        if count != download_count:
            msg = "Mismatch between CSV and DB rows! Expected: {} | Actual {} in: {}"
            printf({"msg": msg.format(count, download_count, filename), "job": job_id, "f": "Download"})
            raise SystemExit(1)

    return count
def download_to_csv(self, sql_filepath, destination_path, intermediate_data_filename):
    start_time = time.perf_counter()
    logger.info(f"Downloading data to {destination_path}")
    options = FILE_FORMATS[self.file_format]["options"]
    export_query = r"\COPY ({}) TO STDOUT {}".format(read_sql_file(sql_filepath), options)
    count = None  # initialized so the return below is safe even if the row count cannot be obtained
    try:
        temp_file, temp_file_path = generate_export_query_temp_file(export_query, None, self.working_dir_path)

        # Create a separate process to run the PSQL command; wait
        psql_process = multiprocessing.Process(
            target=execute_psql, args=(temp_file_path, intermediate_data_filename, None)
        )
        psql_process.start()
        wait_for_process(psql_process, start_time, None)

        delim = FILE_FORMATS[self.file_format]["delimiter"]

        # Log how many rows we have
        logger.info(f"Counting rows in delimited text file {intermediate_data_filename}")
        try:
            count = count_rows_in_delimited_file(filename=intermediate_data_filename, has_header=True, delimiter=delim)
            logger.info(f"{destination_path} contains {count:,} rows of data")
            self.total_download_count += count
        except Exception:
            logger.exception("Unable to obtain delimited text file line count")

        start_time = time.perf_counter()
        zip_process = multiprocessing.Process(
            target=split_and_zip_data_files,
            args=(str(self.zip_file_path), intermediate_data_filename, str(destination_path), self.file_format, None),
        )
        zip_process.start()
        wait_for_process(zip_process, start_time, None)
    except Exception as e:
        raise e
    finally:
        Path(temp_file_path).unlink()

    return destination_path, count
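# FILE_FORMATS is referenced above but not defined in this section. A hedged sketch of its shape,
# inferred from how the "extension", "delimiter", and "options" keys are used in download_to_csv
# and parse_source; the exact entries and the tsv option string are assumptions for illustration.
FILE_FORMATS_SKETCH = {
    "csv": {"extension": "csv", "delimiter": ",", "options": "WITH CSV HEADER"},
    "tsv": {"extension": "tsv", "delimiter": "\t", "options": r"WITH CSV DELIMITER E'\t' HEADER"},
}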
def parse_source(source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, file_format):
    """Write to delimited text file(s) and zip file(s) using the source data"""
    data_file_name = build_data_file_name(source, download_job, piid, assistance_id)

    source_query = source.row_emitter(columns)
    extension = FILE_FORMATS[file_format]["extension"]
    source.file_name = f"{data_file_name}.{extension}"
    source_path = os.path.join(working_dir, source.file_name)

    write_to_log(message=f"Preparing to download data as {source.file_name}", download_job=download_job)

    # Generate the query file; values, limits, dates fixed
    export_query = generate_export_query(source_query, limit, source, columns, file_format)
    temp_file, temp_file_path = generate_export_query_temp_file(export_query, download_job)

    start_time = time.perf_counter()
    try:
        # Create a separate process to run the PSQL command; wait
        psql_process = multiprocessing.Process(target=execute_psql, args=(temp_file_path, source_path, download_job))
        psql_process.start()
        wait_for_process(psql_process, start_time, download_job)

        delim = FILE_FORMATS[file_format]["delimiter"]

        # Log how many rows we have
        write_to_log(message="Counting rows in delimited text file", download_job=download_job)
        try:
            download_job.number_of_rows += count_rows_in_delimited_file(
                filename=source_path, has_header=True, delimiter=delim
            )
        except Exception:
            write_to_log(
                message="Unable to obtain delimited text file line count", is_error=True, download_job=download_job
            )
        download_job.save()

        # Create a separate process to split the large data files into smaller file and write to zip; wait
        zip_process = multiprocessing.Process(
            target=split_and_zip_data_files,
            args=(zip_file_path, source_path, data_file_name, file_format, download_job),
        )
        zip_process.start()
        wait_for_process(zip_process, start_time, download_job)
        download_job.save()
    except Exception as e:
        raise e
    finally:
        # Remove temporary files
        os.close(temp_file)
        os.remove(temp_file_path)
def parse_source(source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, extension):
    """Write to delimited text file(s) and zip file(s) using the source data"""
    d_map = {
        "d1": "Contracts",
        "d2": "Assistance",
        "treasury_account": "TAS",
        "federal_account": "FA",
    }

    if download_job and download_job.monthly_download:
        # For monthly archives, use the existing detailed zip filename for the data files
        # e.g. FY(All)-012_Contracts_Delta_20191108.zip -> FY(All)-012_Contracts_Delta_20191108_%.csv
        source_name = strip_file_extension(download_job.file_name)
    elif source.is_for_idv or source.is_for_contract:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
        source_name = file_name_pattern.format(piid=slugify_text_for_file_names(piid, "UNKNOWN", 50))
    elif source.is_for_assistance:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
        source_name = file_name_pattern.format(
            assistance_id=slugify_text_for_file_names(assistance_id, "UNKNOWN", 50)
        )
    else:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]

        if source.agency_code == "all":
            agency = "All"
        else:
            agency = str(source.agency_code)

        request = json.loads(download_job.json_request)
        filters = request["filters"]
        if request.get("limit"):
            agency = ""
        elif source.file_type not in ("treasury_account", "federal_account"):
            agency = f"{agency}_"

        timestamp = datetime.strftime(datetime.now(timezone.utc), "%Y-%m-%d_H%HM%MS%S")
        source_name = file_name_pattern.format(
            agency=agency,
            data_quarters=construct_data_date_range(filters),
            level=d_map[source.file_type],
            timestamp=timestamp,
            type=d_map[source.file_type],
        )

    source_query = source.row_emitter(columns)
    source.file_name = f"{source_name}.{extension}"
    source_path = os.path.join(working_dir, source.file_name)

    write_to_log(message=f"Preparing to download data as {source.file_name}", download_job=download_job)

    # Generate the query file; values, limits, dates fixed
    temp_file, temp_file_path = generate_temp_query_file(source_query, limit, source, download_job, columns, extension)

    start_time = time.perf_counter()
    try:
        # Create a separate process to run the PSQL command; wait
        psql_process = multiprocessing.Process(target=execute_psql, args=(temp_file_path, source_path, download_job))
        psql_process.start()
        wait_for_process(psql_process, start_time, download_job)

        delim = FILE_FORMATS[extension]["delimiter"]

        # Log how many rows we have
        write_to_log(message="Counting rows in delimited text file", download_job=download_job)
        try:
            download_job.number_of_rows += count_rows_in_delimited_file(
                filename=source_path, has_header=True, delimiter=delim
            )
        except Exception:
            write_to_log(
                message="Unable to obtain delimited text file line count", is_error=True, download_job=download_job
            )
        download_job.save()

        # Create a separate process to split the large data files into smaller file and write to zip; wait
        zip_process = multiprocessing.Process(
            target=split_and_zip_data_files,
            args=(zip_file_path, source_path, source_name, extension, download_job),
        )
        zip_process.start()
        wait_for_process(zip_process, start_time, download_job)
        download_job.save()
    except Exception as e:
        raise e
    finally:
        # Remove temporary files
        os.close(temp_file)
        os.remove(temp_file_path)
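# Both parse_source variants and download_to_csv follow the same pattern: run psql (or the zip
# step) in a separate multiprocessing.Process and block on wait_for_process. A minimal sketch of
# what such a helper might do, assuming it joins the process, checks the exit code, and reports
# elapsed time; the real wait_for_process may also enforce timeouts and update job state, and the
# name wait_for_process_sketch is hypothetical.
import time


def wait_for_process_sketch(process, start_time):
    process.join()
    elapsed = time.perf_counter() - start_time
    if process.exitcode != 0:
        raise RuntimeError(f"Subprocess exited with code {process.exitcode} after {elapsed:.2f}s")
    return elapsed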