Example #1
    def prepare_for_etl(self) -> None:
        if self.config["process_deletes"]:
            self.run_deletes()
        logger.info(format_log("Assessing data to process"))
        self.record_count, self.min_id, self.max_id = count_of_records_to_process(
            self.config)

        if self.record_count == 0:
            self.processes = []
            return

        self.config["partitions"] = self.determine_partitions()
        self.config["processes"] = min(self.config["processes"],
                                       self.config["partitions"])
        self.tasks = self.construct_tasks()

        logger.info(
            format_log(
                f"Created {len(self.tasks):,} task partitions"
                f" to process {self.record_count:,} total {self.config['data_type']} records"
                f" from ID {self.min_id} to {self.max_id}"
                f" with {self.config['processes']:,} parallel processes"))

        if self.config["create_new_index"]:
            # Ensure the index template is present and is the latest version
            call_command("es_configure", "--template-only",
                         f"--load-type={self.config['data_type']}")
            create_index(self.config["index_name"],
                         instantiate_elasticsearch_client())
Example #2
def parse_cli_args(options: dict, es_client) -> dict:
    passthrough_values = (
        "create_new_index",
        "drop_db_view",
        "index_name",
        "load_type",
        "partition_size",
        "process_deletes",
        "processes",
        "skip_counts",
        "skip_delete_index",
    )
    config = set_config(passthrough_values, options)

    if config["create_new_index"] and not config["index_name"]:
        raise SystemExit(
            "Fatal error: '--create-new-index' requires '--index-name'.")
    elif config["create_new_index"]:
        config["index_name"] = config["index_name"].lower()
        config["starting_date"] = config["initial_datetime"]
        check_new_index_name_is_ok(config["index_name"],
                                   config["required_index_name"])
    elif options["start_datetime"]:
        config["starting_date"] = options["start_datetime"]
    else:
        # Due to the queries used for fetching Postgres data, `starting_date`
        # must be present and set to a date earlier than both:
        #   - the earliest records in S3, and
        #   - when all transaction records in the USAspending SQL database were updated.
        # It must also remain timezone-aware for S3.
        config["starting_date"] = get_last_load_date(
            config["stored_date_key"], default=config["initial_datetime"])

    config["is_incremental_load"] = not bool(config["create_new_index"]) and (
        config["starting_date"] != config["initial_datetime"])

    if config["is_incremental_load"]:
        if config["index_name"]:
            logger.info(
                format_log(
                    f"Ignoring provided index name, using alias '{config['write_alias']}' for safety"
                ))
        config["index_name"] = config["write_alias"]
        if not es_client.cat.aliases(name=config["write_alias"]):
            logger.error(f"Write alias '{config['write_alias']}' is missing")
            raise SystemExit(1)
    else:
        if es_client.indices.exists(config["index_name"]):
            logger.error(
                "Cannot load into an existing index. Change the index name or run an incremental load"
            )
            raise SystemExit(1)

    if config["starting_date"] < config["initial_datetime"]:
        logger.error(
            f"--start-datetime is too early. Set no earlier than {config['initial_datetime']}"
        )
        raise SystemExit(1)

    return config
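
set_config() is referenced but not defined in these snippets. A minimal sketch of a plausible implementation, assuming it simply copies the whitelisted CLI options into a fresh dict (the real helper presumably also seeds derived keys such as initial_datetime and write_alias):

def set_config_sketch(passthrough_values: tuple, options: dict) -> dict:
    """Hypothetical stand-in for set_config: keep only the whitelisted options."""
    return {key: options[key] for key in passthrough_values if key in options}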
Example #3
    def complete_process(self) -> None:
        client = instantiate_elasticsearch_client()
        if self.config["create_new_index"]:
            set_final_index_config(client, self.config["index_name"])
            if self.config["skip_delete_index"]:
                logger.info(format_log("Skipping deletion of old indices"))
            else:
                logger.info(format_log("Closing old indices and adding aliases"))
                swap_aliases(client, self.config)

        if self.config["is_incremental_load"]:
            toggle_refresh_on(client, self.config["index_name"])
            logger.info(
                format_log(f"Storing datetime {self.config['processing_start_datetime']} for next incremental load")
            )
            update_last_load_date(self.config["stored_date_key"], self.config["processing_start_datetime"])
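
toggle_refresh_on() is not shown. With the official elasticsearch-py client, re-enabling periodic refresh after a bulk load usually amounts to restoring the index's refresh_interval; a minimal sketch:

def toggle_refresh_on_sketch(client, index_name: str) -> None:
    """Hypothetical sketch: restore the default 1s refresh interval."""
    client.indices.put_settings(index=index_name, body={"refresh_interval": "1s"})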
Example #4
    def run_deletes(self) -> None:
        logger.info(format_log("Processing deletions"))
        client = instantiate_elasticsearch_client()
        if self.config["data_type"] == "award":
            deleted_awards(client, self.config)
        elif self.config["data_type"] == "transaction":
            deleted_transactions(client, self.config)
        else:
            raise RuntimeError(f"No delete function implemented for type {self.config['data_type']}")
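
A design note on the dispatch above: the if/elif chain is fine for two types, but a lookup table scales more cleanly as handlers are added. The same behavior as a hypothetical registry-based variant, reusing the module's helpers:

    def run_deletes(self) -> None:
        logger.info(format_log("Processing deletions"))
        client = instantiate_elasticsearch_client()
        # Hypothetical variant: map data types to their delete handlers.
        handlers = {"award": deleted_awards, "transaction": deleted_transactions}
        handler = handlers.get(self.config["data_type"])
        if handler is None:
            raise RuntimeError(f"No delete function implemented for type {self.config['data_type']}")
        handler(client, self.config)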
Example #5
    def dispatch_tasks(self) -> None:
        _abort = Event()  # when set, signals that an error occurred in a subprocess
        parallel_procs = self.config["processes"]
        with Pool(parallel_procs, maxtasksperchild=1, initializer=init_shared_abort, initargs=(_abort,)) as pool:
            pool.map(extract_transform_load, self.tasks)

        msg = f"Total documents indexed: {total_doc_success.value}, total document fails: {total_doc_fail.value}"
        logger.info(format_log(msg))

        if _abort.is_set():
            raise RuntimeError("One or more partitions failed!")
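
dispatch_tasks() depends on module-level shared state that is only implied by these snippets: an abort Event delivered to each worker through the Pool initializer, and lock-protected counters. A self-contained sketch of that wiring, assuming a fork start method so the counters are inherited by workers (the original definitions are not shown):

from multiprocessing import Event, Value

# Lock-protected counters shared across worker processes ("q" = signed long long).
total_doc_success = Value("q", 0, lock=True)
total_doc_fail = Value("q", 0, lock=True)

abort = None  # populated in each worker by the Pool initializer


def init_shared_abort(shared_abort) -> None:
    """Pool initializer: stash the parent's Event in this worker's globals."""
    global abort
    abort = shared_abort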
Example #6
def extract_transform_load(task: TaskSpec) -> None:
    if abort.is_set():
        logger.warning(format_log(f"Skipping partition #{task.partition_number} due to previous error", name=task.name))
        return

    start = perf_counter()
    msg = f"Started processing on partition #{task.partition_number}: {task.name}"
    logger.info(format_log(msg, name=task.name))

    client = instantiate_elasticsearch_client()
    try:
        records = task.transform_func(task, extract_records(task))
        if abort.is_set():
            f"Prematurely ending partition #{task.partition_number} due to error in another process"
            logger.warning(format_log(msg, name=task.name))
            return
        success, fail = load_data(task, records, client)
        with total_doc_success.get_lock():
            total_doc_success.value += success
        with total_doc_fail.get_lock():
            total_doc_fail.value += fail
    except Exception:
        if abort.is_set():
            msg = f"Partition #{task.partition_number} failed after an error was previously encountered"
            logger.warning(format_log(msg, name=task.name))
        else:
            logger.error(format_log(f"{task.name} failed!", name=task.name))
            abort.set()
    else:
        msg = f"Partition #{task.partition_number} was successfully processed in {perf_counter() - start:.2f}s"
        logger.info(format_log(msg, name=task.name))
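
TaskSpec appears only as a type annotation. Its real definition is not shown; from the attribute accesses above (name, partition_number, transform_func), a minimal compatible shape would be:

from typing import Any, Callable, NamedTuple


class TaskSpec(NamedTuple):
    """Hypothetical minimal shape inferred from the accesses above."""
    name: str
    partition_number: int
    transform_func: Callable[["TaskSpec", Any], Any]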
Example #7
    def handle(self, *args, **options):
        elasticsearch_client = instantiate_elasticsearch_client()
        config = parse_cli_args(options, elasticsearch_client)

        start = perf_counter()
        logger.info(format_log(f"Starting script\n{'=' * 56}"))
        start_msg = "target index: {index_name} | Starting from: {starting_date}"
        logger.info(format_log(start_msg.format(**config)))

        ensure_view_exists(config["sql_view"], force=True)
        error_addition = ""
        loader = Controller(config)

        if config["is_incremental_load"]:
            toggle_refresh_off(elasticsearch_client,
                               config["index_name"])  # Turned back on at end.

        try:
            if config["process_deletes"]:
                loader.run_deletes()

            if not config["deletes_only"]:
                loader.prepare_for_etl()
                loader.dispatch_tasks()
        except Exception as e:
            logger.error(f"{str(e)}")
            error_addition = "before encountering a problem during execution.... "
            raise SystemExit(1)
        else:
            loader.complete_process()
            if config["drop_db_view"]:
                logger.info(
                    format_log(f"Dropping SQL view '{config['sql_view']}'"))
                drop_etl_view(config["sql_view"], True)
        finally:
            msg = f"Script duration was {perf_counter() - start:.2f}s {error_addition}|"
            headers = f"{'-' * (len(msg) - 2)} |"
            logger.info(format_log(headers))
            logger.info(format_log(msg))
            logger.info(format_log(headers))

        # Used to help pipeline determine when job passed but needs attention
        if config["raise_status_code_3"]:
            raise SystemExit(3)
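
Because handle() belongs to a Django management command, it can also be driven programmatically, and exit status 3 lets a pipeline distinguish "passed but needs attention" from hard failures (status 1). A usage sketch; the command name here is hypothetical:

from django.core.management import call_command

try:
    # "elasticsearch_indexer" is a placeholder; use the name this
    # command is registered under in your project.
    call_command("elasticsearch_indexer", "--load-type=transaction")
except SystemExit as exc:
    if exc.code == 3:
        print("Load succeeded but needs attention")
    elif exc.code:
        raise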