def download_csv(count_sql, copy_sql, filename, job_id, skip_counts, verbose):
    # Execute Copy SQL to download records to CSV
    # It is preferable to not use shell=True, but this command works. Limited user-input so risk is low
    subprocess.Popen("psql {} -c {}".format(get_database_dsn_string(), copy_sql), shell=True).wait()
    download_count = count_rows_in_delimited_file(filename, has_header=True, safe=False)
    printf({"msg": "Wrote {} to this file: {}".format(download_count, filename), "job": job_id, "f": "Download"})

    # If --skip_counts is disabled, execute count_sql and compare this count to the download_count
    if not skip_counts:
        sql_count = execute_sql_statement(count_sql, True, verbose)[0]["count"]
        if sql_count != download_count:
            msg = "Mismatch between CSV and DB rows! Expected: {} | Actual {} in: {}"
            printf({"msg": msg.format(sql_count, download_count, filename), "job": job_id, "f": "Download"})
            raise SystemExit(1)
    else:
        printf({"msg": "Skipping count comparison checks (sql vs download)", "job": job_id, "f": "Download"})

    return download_count
def download_csv(count_sql, copy_sql, filename, job_id, skip_counts, verbose):
    # Execute Copy SQL to download records to CSV
    # It is preferable to not use shell=True, but this command works. Limited user-input so risk is low
    subprocess.Popen(f"psql {get_database_dsn_string()} -c {copy_sql}", shell=True).wait()
    download_count = count_rows_in_delimited_file(filename, has_header=True, safe=False)
    logger.info(format_log(f"Wrote {download_count:,} to this file: {filename}", job=job_id, process="Download"))

    # If --skip_counts is disabled, execute count_sql and compare this count to the download_count
    if not skip_counts:
        sql_count = execute_sql_statement(count_sql, True, verbose)[0]["count"]
        if sql_count != download_count:
            msg = f'Mismatch between CSV "{filename}" and DB!!! Expected: {sql_count:,} | Actual: {download_count:,}'
            logger.error(format_log(msg, job=job_id, process="Download"))
            raise SystemExit(1)
    else:
        logger.info(format_log("Skipping count comparison checks (sql vs download)", job=job_id, process="Download"))

    return download_count
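# The comments above note that shell=True is undesirable. A minimal sketch (not part of the
# original module) of a shell-free equivalent, assuming copy_sql is a single psql meta-command
# such as a \copy statement and that get_database_dsn_string from above is importable. Passing
# the arguments as a list avoids shell interpolation entirely; the function name is hypothetical.
import subprocess


def run_copy_without_shell_sketch(copy_sql):
    # psql receives the DSN and the \copy command as discrete arguments, so no shell quoting is needed
    return subprocess.run(["psql", get_database_dsn_string(), "-c", copy_sql], check=True)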
def create_local_file(self, award_type, source, agency_code, generate_since):
    """ Generate complete file from SQL query and S3 bucket deletion files, then zip it locally """
    logger.info("Generating CSV file with creations and modifications")

    # Create file paths and working directory
    timestamp = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S%f")
    working_dir = f"{settings.CSV_LOCAL_PATH}_{agency_code}_delta_gen_{timestamp}/"
    if not os.path.exists(working_dir):
        os.mkdir(working_dir)
    agency_str = "All" if agency_code == "all" else agency_code
    source_name = f"FY(All)_{agency_str}_{award_type}_Delta_{datetime.strftime(date.today(), '%Y%m%d')}"
    source_path = os.path.join(working_dir, "{}.csv".format(source_name))

    # Create a unique temporary file with the raw query
    raw_quoted_query = generate_raw_quoted_query(source.row_emitter(None))  # None requests all headers

    # The raw query is a union of two other queries, each in parentheses. To do replacement we need to split out
    # each query, apply annotations to each of those, then recombine in a UNION
    csv_query_annotated = (
        "("
        + apply_annotations_to_sql(_top_level_split(raw_quoted_query, "UNION")[0].strip()[1:-1], source.human_names)
        + ") UNION ("
        + apply_annotations_to_sql(_top_level_split(raw_quoted_query, "UNION")[1].strip()[1:-1], source.human_names)
        + ")"
    )

    (temp_sql_file, temp_sql_file_path) = tempfile.mkstemp(prefix="bd_sql_", dir="/tmp")

    with open(temp_sql_file_path, "w") as file:
        file.write("\\copy ({}) To STDOUT with CSV HEADER".format(csv_query_annotated))

    logger.info("Generated temp SQL file {}".format(temp_sql_file_path))

    # Generate the csv with \copy
    cat_command = subprocess.Popen(["cat", temp_sql_file_path], stdout=subprocess.PIPE)
    try:
        subprocess.check_output(
            ["psql", "-o", source_path, os.environ["DOWNLOAD_DATABASE_URL"], "-v", "ON_ERROR_STOP=1"],
            stdin=cat_command.stdout,
            stderr=subprocess.STDOUT,
        )
    except subprocess.CalledProcessError as e:
        logger.exception(e.output)
        raise e

    # Append deleted rows to the end of the file
    if not self.debugging_skip_deleted:
        self.add_deletion_records(source_path, working_dir, award_type, agency_code, source, generate_since)

    if count_rows_in_delimited_file(source_path, has_header=True, safe=True) > 0:
        # Split the CSV into multiple files and zip it up
        zipfile_path = "{}{}.zip".format(settings.CSV_LOCAL_PATH, source_name)
        logger.info("Creating compressed file: {}".format(os.path.basename(zipfile_path)))
        split_and_zip_data_files(zipfile_path, source_path, source_name, "csv")
    else:
        zipfile_path = None

    os.close(temp_sql_file)
    os.remove(temp_sql_file_path)
    shutil.rmtree(working_dir)

    return zipfile_path
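# _top_level_split is used above but not defined in this section. A minimal sketch of such a
# helper, assuming it splits a SQL string into two pieces on the first occurrence of the keyword
# that sits outside any parentheses (so a UNION inside a subquery is ignored). The name
# _top_level_split_sketch is hypothetical; this illustrates the idea, not the project's implementation.
def _top_level_split_sketch(sql, splitter):
    depth = 0
    for i, char in enumerate(sql):
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
        elif depth == 0 and sql[i : i + len(splitter)] == splitter:
            # Return the text before and after the keyword; the caller strips the outer parentheses
            return [sql[:i], sql[i + len(splitter) :]]
    raise ValueError(f"'{splitter}' not found at the top level of the statement")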
def download_csv(count_sql, copy_sql, filename, job_id, skip_counts, verbose):
    if skip_counts:
        count = None
        printf({"msg": "Skipping count checks. Writing file: {}".format(filename), "job": job_id, "f": "Download"})
    else:
        count = execute_sql_statement(count_sql, True, verbose)[0]["count"]
        printf({"msg": "Writing {} to this file: {}".format(count, filename), "job": job_id, "f": "Download"})

    # It is preferable to not use shell=True, but this command works. Limited user-input so risk is low
    subprocess.Popen('psql "${{DATABASE_URL}}" -c {}'.format(copy_sql), shell=True).wait()

    if not skip_counts:
        download_count = count_rows_in_delimited_file(filename, has_header=True, safe=False)
        if count != download_count:
            msg = "Mismatch between CSV and DB rows! Expected: {} | Actual {} in: {}"
            printf({"msg": msg.format(count, download_count, filename), "job": job_id, "f": "Download"})
            raise SystemExit(1)

    return count
def download_to_csv(self, sql_filepath, destination_path, intermediate_data_filename):
    start_time = time.perf_counter()
    logger.info(f"Downloading data to {destination_path}")
    options = FILE_FORMATS[self.file_format]["options"]
    export_query = r"\COPY ({}) TO STDOUT {}".format(read_sql_file(sql_filepath), options)
    count = None  # initialized so the return below is safe even if the row count cannot be obtained
    try:
        temp_file, temp_file_path = generate_export_query_temp_file(export_query, None, self.working_dir_path)

        # Create a separate process to run the PSQL command; wait
        psql_process = multiprocessing.Process(
            target=execute_psql, args=(temp_file_path, intermediate_data_filename, None)
        )
        psql_process.start()
        wait_for_process(psql_process, start_time, None)

        delim = FILE_FORMATS[self.file_format]["delimiter"]

        # Log how many rows we have
        logger.info(f"Counting rows in delimited text file {intermediate_data_filename}")
        try:
            count = count_rows_in_delimited_file(filename=intermediate_data_filename, has_header=True, delimiter=delim)
            logger.info(f"{destination_path} contains {count:,} rows of data")
            self.total_download_count += count
        except Exception:
            logger.exception("Unable to obtain delimited text file line count")

        start_time = time.perf_counter()
        zip_process = multiprocessing.Process(
            target=split_and_zip_data_files,
            args=(str(self.zip_file_path), intermediate_data_filename, str(destination_path), self.file_format, None),
        )
        zip_process.start()
        wait_for_process(zip_process, start_time, None)
    except Exception as e:
        raise e
    finally:
        Path(temp_file_path).unlink()

    return destination_path, count
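# FILE_FORMATS is referenced above but not defined in this section. A hedged sketch of its shape,
# inferred from how the "extension", "delimiter", and "options" keys are used in download_to_csv
# and parse_source; the exact entries and the tsv option string are assumptions for illustration.
FILE_FORMATS_SKETCH = {
    "csv": {"extension": "csv", "delimiter": ",", "options": "WITH CSV HEADER"},
    "tsv": {"extension": "tsv", "delimiter": "\t", "options": r"WITH CSV DELIMITER E'\t' HEADER"},
}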
def parse_source(source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, file_format):
    """Write to delimited text file(s) and zip file(s) using the source data"""
    data_file_name = build_data_file_name(source, download_job, piid, assistance_id)

    source_query = source.row_emitter(columns)
    extension = FILE_FORMATS[file_format]["extension"]
    source.file_name = f"{data_file_name}.{extension}"
    source_path = os.path.join(working_dir, source.file_name)

    write_to_log(message=f"Preparing to download data as {source.file_name}", download_job=download_job)

    # Generate the query file; values, limits, dates fixed
    export_query = generate_export_query(source_query, limit, source, columns, file_format)
    temp_file, temp_file_path = generate_export_query_temp_file(export_query, download_job)

    start_time = time.perf_counter()
    try:
        # Create a separate process to run the PSQL command; wait
        psql_process = multiprocessing.Process(target=execute_psql, args=(temp_file_path, source_path, download_job))
        psql_process.start()
        wait_for_process(psql_process, start_time, download_job)

        delim = FILE_FORMATS[file_format]["delimiter"]

        # Log how many rows we have
        write_to_log(message="Counting rows in delimited text file", download_job=download_job)
        try:
            download_job.number_of_rows += count_rows_in_delimited_file(
                filename=source_path, has_header=True, delimiter=delim
            )
        except Exception:
            write_to_log(
                message="Unable to obtain delimited text file line count", is_error=True, download_job=download_job
            )
        download_job.save()

        # Create a separate process to split the large data files into smaller file and write to zip; wait
        zip_process = multiprocessing.Process(
            target=split_and_zip_data_files,
            args=(zip_file_path, source_path, data_file_name, file_format, download_job),
        )
        zip_process.start()
        wait_for_process(zip_process, start_time, download_job)
        download_job.save()
    except Exception as e:
        raise e
    finally:
        # Remove temporary files
        os.close(temp_file)
        os.remove(temp_file_path)
def parse_source(source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, extension):
    """Write to delimited text file(s) and zip file(s) using the source data"""
    d_map = {
        "d1": "Contracts",
        "d2": "Assistance",
        "treasury_account": "TAS",
        "federal_account": "FA",
    }

    if download_job and download_job.monthly_download:
        # For monthly archives, use the existing detailed zip filename for the data files
        # e.g. FY(All)-012_Contracts_Delta_20191108.zip -> FY(All)-012_Contracts_Delta_20191108_%.csv
        source_name = strip_file_extension(download_job.file_name)
    elif source.is_for_idv or source.is_for_contract:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
        source_name = file_name_pattern.format(piid=slugify_text_for_file_names(piid, "UNKNOWN", 50))
    elif source.is_for_assistance:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
        source_name = file_name_pattern.format(
            assistance_id=slugify_text_for_file_names(assistance_id, "UNKNOWN", 50)
        )
    else:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]

        if source.agency_code == "all":
            agency = "All"
        else:
            agency = str(source.agency_code)

        request = json.loads(download_job.json_request)
        filters = request["filters"]
        if request.get("limit"):
            agency = ""
        elif source.file_type not in ("treasury_account", "federal_account"):
            agency = f"{agency}_"

        timestamp = datetime.strftime(datetime.now(timezone.utc), "%Y-%m-%d_H%HM%MS%S")
        source_name = file_name_pattern.format(
            agency=agency,
            data_quarters=construct_data_date_range(filters),
            level=d_map[source.file_type],
            timestamp=timestamp,
            type=d_map[source.file_type],
        )

    source_query = source.row_emitter(columns)
    source.file_name = f"{source_name}.{extension}"
    source_path = os.path.join(working_dir, source.file_name)

    write_to_log(message=f"Preparing to download data as {source.file_name}", download_job=download_job)

    # Generate the query file; values, limits, dates fixed
    temp_file, temp_file_path = generate_temp_query_file(source_query, limit, source, download_job, columns, extension)

    start_time = time.perf_counter()
    try:
        # Create a separate process to run the PSQL command; wait
        psql_process = multiprocessing.Process(target=execute_psql, args=(temp_file_path, source_path, download_job))
        psql_process.start()
        wait_for_process(psql_process, start_time, download_job)

        delim = FILE_FORMATS[extension]["delimiter"]

        # Log how many rows we have
        write_to_log(message="Counting rows in delimited text file", download_job=download_job)
        try:
            download_job.number_of_rows += count_rows_in_delimited_file(
                filename=source_path, has_header=True, delimiter=delim
            )
        except Exception:
            write_to_log(
                message="Unable to obtain delimited text file line count", is_error=True, download_job=download_job
            )
        download_job.save()

        # Create a separate process to split the large data files into smaller file and write to zip; wait
        zip_process = multiprocessing.Process(
            target=split_and_zip_data_files,
            args=(zip_file_path, source_path, source_name, extension, download_job),
        )
        zip_process.start()
        wait_for_process(zip_process, start_time, download_job)
        download_job.save()
    except Exception as e:
        raise e
    finally:
        # Remove temporary files
        os.close(temp_file)
        os.remove(temp_file_path)
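# Both parse_source variants and download_to_csv follow the same pattern: run psql (or the zip
# step) in a separate multiprocessing.Process and block on wait_for_process. A minimal sketch of
# what such a helper might do, assuming it joins the process, checks the exit code, and reports
# elapsed time; the real wait_for_process may also enforce timeouts and update job state, and the
# name wait_for_process_sketch is hypothetical.
import time


def wait_for_process_sketch(process, start_time):
    process.join()
    elapsed = time.perf_counter() - start_time
    if process.exitcode != 0:
        raise RuntimeError(f"Subprocess exited with code {process.exitcode} after {elapsed:.2f}s")
    return elapsed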