def handle(self, *args, **options):
    """Script execution of custom code starts in this method"""
    start = perf_counter()
    printf({"msg": "Starting script\n{}".format("=" * 56)})
    self.transform_cli_arguments(options)
    start_msg = "target index: {index_name} | FY(s): {fiscal_years} | Starting from: {starting_date}"
    printf({"msg": start_msg.format(**self.config)})
    self.controller()
    if self.config["is_incremental_load"]:
        printf({"msg": "Updating Last Load record with {}".format(self.config["processing_start_datetime"])})
        update_last_load_date("es_transactions", self.config["processing_start_datetime"])
    printf({"msg": "---------------------------------------------------------------"})
    printf({"msg": "Script completed in {} seconds".format(perf_counter() - start)})
    printf({"msg": "---------------------------------------------------------------"})
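# Every loader in this section brackets its work with get_last_load_date/update_last_load_date.
# Below is a minimal, runnable sketch of that bookkeeping contract, using an in-memory dict in
# place of the project's real database table; the storage choice and function names suffixed
# with "_sketch" are illustrative assumptions, not the actual implementation.

from datetime import datetime

_LAST_LOAD_DATES = {}  # key: load type (e.g. "fpds", "fabs"), value: datetime

def get_last_load_date_sketch(key: str, default: datetime = None) -> datetime:
    # Return the stored timestamp for this load type, or the caller's fallback.
    return _LAST_LOAD_DATES.get(key, default)

def update_last_load_date_sketch(key: str, load_datetime: datetime) -> None:
    # Record when this run *started*, so work committed mid-run is re-scanned next time.
    _LAST_LOAD_DATES[key] = load_datetime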
def complete_process(self) -> None:
    if self.config["create_new_index"]:
        set_final_index_config(self.elasticsearch_client, self.config["index_name"])
        if self.config["skip_delete_index"]:
            logger.info(format_log("Skipping deletion of old indices"))
        else:
            logger.info(format_log("Closing old indices and adding aliases"))
            swap_aliases(self.elasticsearch_client, self.config["index_name"], self.config["load_type"])

    if self.config["snapshot"]:
        logger.info(format_log("Taking snapshot"))
        take_snapshot(self.elasticsearch_client, self.config["index_name"], settings.ES_REPOSITORY)

    if self.config["is_incremental_load"]:
        toggle_refresh_on(self.elasticsearch_client, self.config["index_name"])
        logger.info(
            format_log(f"Storing datetime {self.config['processing_start_datetime']} for next incremental load")
        )
        update_last_load_date(f"es_{self.config['load_type']}", self.config["processing_start_datetime"])
def nightly_loader(self, start_date):
    logger.info("==== Starting FPDS nightly data load ====")

    if start_date:
        date = datetime.strptime(start_date, "%Y-%m-%d").date()
    else:
        default_last_load_date = datetime.now(timezone.utc) - timedelta(days=1)
        date = get_last_load_date("fpds", default=default_last_load_date).date()
    processing_start_datetime = datetime.now(timezone.utc)

    logger.info("Processing data for FPDS starting from %s" % date)

    with timer("retrieval of new/modified FPDS data ID list", logger.info):
        ids_to_insert = self.get_fpds_transaction_ids(date=date)

    with timer("retrieval of deleted FPDS IDs", logger.info):
        ids_to_delete = self.get_deleted_fpds_data_from_s3(date=date)

    self.perform_load(ids_to_delete, ids_to_insert)

    # Update the date for the last time the data load was run
    update_last_load_date("fpds", processing_start_datetime)

    logger.info("FPDS NIGHTLY UPDATE COMPLETE")
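# The `timer` context manager used throughout these loaders is not defined in this section.
# Here is a minimal, runnable sketch of one plausible implementation, assuming it only needs
# to log elapsed wall-clock time around a block; the message format is an assumption.

from contextlib import contextmanager
from time import perf_counter

@contextmanager
def timer_sketch(message, log_func):
    log_func(f"Beginning {message}...")
    start = perf_counter()
    try:
        yield
    finally:
        # Runs even if the block raises, so failed steps still report their duration.
        log_func(f"Finished {message} in {perf_counter() - start:.2f}s")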
def handle(self, *args, **options):
    script_start_time = datetime.now(timezone.utc)
    periods = retrieve_recent_periods()

    # Using `script_start_time` as a default, so no awards will be touched the first time this script
    # is run. The assumption is that awards are up to date at the time the script is deployed. After
    # this runs the first time, a date will be populated in the database.
    self.last_load_date = get_last_load_date("touch_last_period_awards", default=script_start_time)

    logger.info(f"Using {script_start_time} to determine if awards should be touched.")

    total_records_updated = 0
    total_records_updated += self.touch_period_awards_if_behind(periods["this_month"])
    total_records_updated += self.touch_period_awards_if_behind(periods["this_quarter"])

    update_last_load_date("touch_last_period_awards", script_start_time)

    logger.info(f"Found {total_records_updated:,} award records to update in Elasticsearch")

    # Return will be captured as stdout in Jenkins job
    return str(total_records_updated)
def handle(self, *args, **options):
    processing_start_datetime = datetime.now(timezone.utc)

    logger.info("Starting FABS data load script...")

    # "Reload all" supersedes all other processing options.
    reload_all = options["reload_all"]
    if reload_all:
        ids = None
        afa_ids = None
        start_datetime = None
        end_datetime = None
    else:
        ids = options["ids"]
        afa_ids = set(options["afa_ids"])
        if options["afa_id_file"]:
            afa_ids = tuple(afa_ids | read_afa_ids_from_file(options["afa_id_file"]))
        start_datetime = options["start_datetime"]
        end_datetime = options["end_datetime"]

    # If no other processing options were provided, then this is an incremental load.
    is_incremental_load = not any((reload_all, ids, afa_ids, start_datetime, end_datetime))

    if is_incremental_load:
        start_datetime = get_incremental_load_start_datetime()
        logger.info(f"Processing data for FABS starting from {start_datetime} (includes offset)")

        # We only perform deletes with incremental loads.
        with timer("obtaining delete records", logger.info):
            delete_records = retrieve_deleted_fabs_transactions(start_datetime, end_datetime)
            ids_to_delete = [item for sublist in delete_records.values() for item in sublist if item]
            ids_to_delete = get_delete_pks_for_afa_keys(ids_to_delete)
        logger.info(f"{len(ids_to_delete):,} delete ids found in total")

    with timer("retrieving IDs of FABS to process", logger.info):
        ids_to_upsert = get_fabs_transaction_ids(ids, afa_ids, start_datetime, end_datetime)

    update_award_ids = delete_fabs_transactions(ids_to_delete) if is_incremental_load else []
    upsert_fabs_transactions(ids_to_upsert, update_award_ids)

    if is_incremental_load:
        logger.info(f"Storing {processing_start_datetime} for the next incremental run")
        update_last_load_date("fabs", processing_start_datetime)

    logger.info("FABS UPDATE FINISHED!")
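# `get_incremental_load_start_datetime` is referenced above but not defined in this section.
# Judging by the "(includes offset)" log line, it plausibly rewinds the stored date by a safety
# window so records committed while the previous run was in flight are not missed (upserts make
# the overlap harmless). A hedged sketch; the 30-minute window and parameterization are
# assumptions for illustration only.

from datetime import datetime, timedelta

def get_incremental_load_start_datetime_sketch(
    last_load_date: datetime, offset: timedelta = timedelta(minutes=30)
) -> datetime:
    # Start slightly before the recorded last load to cover in-flight commits.
    return last_load_date - offset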
def load_executive_compensation(db_cursor, date, start_date):
    logger.info("Getting DUNS/Exec Comp data from broker based on the last pull date of %s..." % str(date))

    # Get first page
    db_cursor.execute(EXEC_COMP_QUERY, [date])
    exec_comp_query_dict = dictfetchall(db_cursor)
    total_rows = len(exec_comp_query_dict)

    logger.info("Updating Executive Compensation Data, {} rows coming from the Broker...".format(total_rows))

    start_time = datetime.now(timezone.utc)
    for index, row in enumerate(exec_comp_query_dict, 1):
        if not (index % 100):
            # Use an aware datetime here; subtracting a naive datetime.now() from the
            # timezone-aware start_time would raise a TypeError.
            logger.info("Loading row {} of {} ({})".format(index, total_rows, datetime.now(timezone.utc) - start_time))

        leo_update_dict = {
            "officer_1_name": row["high_comp_officer1_full_na"],
            "officer_1_amount": row["high_comp_officer1_amount"],
            "officer_2_name": row["high_comp_officer2_full_na"],
            "officer_2_amount": row["high_comp_officer2_amount"],
            "officer_3_name": row["high_comp_officer3_full_na"],
            "officer_3_amount": row["high_comp_officer3_amount"],
            "officer_4_name": row["high_comp_officer4_full_na"],
            "officer_4_amount": row["high_comp_officer4_amount"],
            "officer_5_name": row["high_comp_officer5_full_na"],
            "officer_5_amount": row["high_comp_officer5_amount"],
        }

        # Skip rows that carry no officer data at all
        if not any(leo_update_dict.values()):
            continue

        duns_number = row["awardee_or_recipient_uniqu"]

        # Deal with multiples that we have in our LE table
        legal_entities = LegalEntity.objects.filter(recipient_unique_id=duns_number)
        if not legal_entities.exists():
            logger.info("No record in data store for DUNS {}. Skipping...".format(duns_number))

        for le in legal_entities:
            leo, _ = LegalEntityOfficers.objects.get_or_create(legal_entity=le)
            for attr, value in leo_update_dict.items():
                if value == "":
                    value = None
                setattr(leo, attr, value)
            leo.save()

    # Update the date for the last time the data load was run
    update_last_load_date("exec_comp", start_date)
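# `dictfetchall` mirrors the well-known helper from the Django documentation: it converts raw
# cursor rows into dicts keyed by column name, which is why the loop above can index rows
# like row["high_comp_officer1_full_na"].

def dictfetchall(cursor):
    # Return all rows from a DB-API cursor as a list of dicts keyed by column name.
    columns = [col[0] for col in cursor.description]
    return [dict(zip(columns, row)) for row in cursor.fetchall()]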
def handle(self, *args, **options):
    logger.info("==== Starting FPDS nightly data load ====")

    if options.get("date"):
        date = options.get("date")[0]
        date = datetime.strptime(date, "%Y-%m-%d").date()
    else:
        default_last_load_date = datetime.now(timezone.utc) - timedelta(days=1)
        date = get_last_load_date("fpds", default=default_last_load_date).date()
    processing_start_datetime = datetime.now(timezone.utc)

    logger.info("Processing data for FPDS starting from %s" % date)

    with timer("retrieval of deleted FPDS IDs", logger.info):
        ids_to_delete = self.get_deleted_fpds_data_from_s3(date=date)

    if len(ids_to_delete) > 0:
        with timer("deletion of all stale FPDS data", logger.info):
            self.delete_stale_fpds(ids_to_delete=ids_to_delete)
    else:
        logger.info("No FPDS records to delete at this juncture")

    with timer("retrieval of new/modified FPDS data ID list", logger.info):
        total_insert = self.get_fpds_transaction_ids(date=date)

    if len(total_insert) > 0:
        # Add FPDS records
        with timer("insertion of new FPDS data in batches", logger.info):
            self.insert_all_new_fpds(total_insert)

        # Update Awards based on changed FPDS records
        with timer("updating awards to reflect their latest associated transaction info", logger.info):
            update_awards(tuple(AWARD_UPDATE_ID_LIST))

        # Update FPDS-specific Awards based on the info in child transactions
        with timer("updating contract-specific awards to reflect their latest transaction info", logger.info):
            update_contract_awards(tuple(AWARD_UPDATE_ID_LIST))

        # Update AwardCategories based on changed FPDS records
        with timer("updating award category variables", logger.info):
            update_award_categories(tuple(AWARD_UPDATE_ID_LIST))

        # Check the linkages from file C to FPDS records and update any that are missing
        with timer("updating C->D linkages", logger.info):
            update_c_to_d_linkages("contract")
    else:
        logger.info("No FPDS records to insert or modify at this juncture")

    # Update the date for the last time the data load was run
    update_last_load_date("fpds", processing_start_datetime)

    logger.info("FPDS NIGHTLY UPDATE COMPLETE")
def complete_process(self) -> None:
    if self.config["create_new_index"]:
        printf({"msg": "Closing old indices and adding aliases"})
        set_final_index_config(self.elasticsearch_client, self.config["index_name"])
        swap_aliases(self.elasticsearch_client, self.config["index_name"], self.config["load_type"])

    if self.config["snapshot"]:
        printf({"msg": "Taking snapshot"})
        take_snapshot(self.elasticsearch_client, self.config["index_name"], settings.ES_REPOSITORY)

    if self.config["is_incremental_load"]:
        msg = "Storing datetime {} for next incremental load"
        printf({"msg": msg.format(self.config["processing_start_datetime"])})
        update_last_load_date("es_transactions", self.config["processing_start_datetime"])
def complete_process(self) -> None:
    client = instantiate_elasticsearch_client()
    if self.config["create_new_index"]:
        set_final_index_config(client, self.config["index_name"])
        if self.config["skip_delete_index"]:
            logger.info(format_log("Skipping deletion of old indices"))
        else:
            logger.info(format_log("Closing old indices and adding aliases"))
            swap_aliases(client, self.config)

    if self.config["is_incremental_load"]:
        toggle_refresh_on(client, self.config["index_name"])
        logger.info(
            format_log(f"Storing datetime {self.config['processing_start_datetime']} for next incremental load")
        )
        update_last_load_date(self.config["stored_date_key"], self.config["processing_start_datetime"])
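# `format_log` wraps every message in the logger-based variants above. A minimal sketch,
# assuming it merely prefixes messages with a job label so mixed ETL output is easy to grep;
# the prefix format and default label are assumptions.

def format_log_sketch(message: str, name: str = "ES Indexer") -> str:
    return f"[{name}] {message}"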
def cleanup(self) -> None:
    """Finalize the execution and cleanup for the next script run"""
    logger.info(f"Processed {self.upsert_records:,} transaction records (insert/update)")
    if self.successful_run and (self.is_incremental or self.options["reload_all"]):
        logger.info("Updated last run time for next incremental load")
        update_last_load_date(self.last_load_record, self.start_time)

    if hasattr(self, "file_path") and self.file_path.exists():
        # If the script fails before the file is created, skip; if the file still exists, remove it.
        self.file_path.unlink()

    if self.successful_run:
        logger.info(f"Loading {self.destination_table_name} completed successfully")
    else:
        logger.info("Failed state on exit")
        raise SystemExit(1)
def handle(self, *args, **options):
    processing_start_datetime = datetime.now(timezone.utc)

    logger.info("Starting FABS data load script...")

    # "Reload all" supersedes all other processing options.
    reload_all = options["reload_all"]
    if reload_all:
        afa_ids = None
        start_datetime = None
        end_datetime = None
    else:
        afa_ids = read_afa_ids_from_file(options["afa_id_file"]) if options["afa_id_file"] else None
        start_datetime = options["start_datetime"]
        end_datetime = options["end_datetime"]

    # If no other processing options were provided, then this is an incremental load.
    is_incremental_load = not any((reload_all, afa_ids, start_datetime, end_datetime))

    if is_incremental_load:
        start_datetime = get_last_load_date()
        logger.info("Processing data for FABS starting from %s" % start_datetime)

    with timer("obtaining delete records", logger.info):
        delete_records = retrieve_deleted_fabs_transactions(start_datetime, end_datetime)
        ids_to_delete = [item for sublist in delete_records.values() for item in sublist if item]

    with timer("retrieving/diff-ing FABS Data", logger.info):
        ids_to_upsert = get_fabs_transaction_ids(afa_ids, start_datetime, end_datetime)

    update_award_ids = delete_fabs_transactions(ids_to_delete)
    upsert_fabs_transactions(ids_to_upsert, update_award_ids)

    if is_incremental_load:
        update_last_load_date("fabs", processing_start_datetime)

    logger.info("FABS UPDATE FINISHED!")
def handle(self, *args, **options):
    processing_start_datetime = datetime.now(timezone.utc)

    logger.info("Starting FABS data load script...")

    do_not_log_deletions = options["do_not_log_deletions"]

    # "Reload all" supersedes all other processing options.
    reload_all = options["reload_all"]
    if reload_all:
        submission_ids = None
        afa_ids = None
        start_datetime = None
        end_datetime = None
    else:
        submission_ids = tuple(options["submission_ids"]) if options["submission_ids"] else None
        afa_ids = read_afa_ids_from_file(options["afa_id_file"]) if options["afa_id_file"] else None
        start_datetime = options["start_datetime"]
        end_datetime = options["end_datetime"]

    # If no other processing options were provided, then this is an incremental load.
    is_incremental_load = not any((reload_all, submission_ids, afa_ids, start_datetime, end_datetime))

    if is_incremental_load:
        last_load_date = get_last_load_date()
        submission_ids = get_new_submission_ids(last_load_date)
        logger.info("Processing data for FABS starting from %s" % last_load_date)

    if is_incremental_load and not submission_ids:
        logger.info("No new submissions. Exiting.")
    else:
        with timer("obtaining delete records", logger.info):
            ids_to_delete = get_fabs_records_to_delete(submission_ids, afa_ids, start_datetime, end_datetime)

        with timer("retrieving/diff-ing FABS Data", logger.info):
            ids_to_upsert = get_fabs_transaction_ids(submission_ids, afa_ids, start_datetime, end_datetime)

        update_award_ids = delete_fabs_transactions(ids_to_delete, do_not_log_deletions)
        upsert_fabs_transactions(ids_to_upsert, update_award_ids)

        if is_incremental_load:
            update_last_load_date("fabs", processing_start_datetime)

    logger.info("FABS UPDATE FINISHED!")
def handle(self, *args, **options):
    # Record script execution start time to update the FPDS last updated date in DB as appropriate
    update_time = datetime.now(timezone.utc)

    if options["reload_all"]:
        self.load_fpds_incrementally(None)
    elif options["date"]:
        self.load_fpds_incrementally(options["date"])
    elif options["ids"]:
        self.modified_award_ids.extend(load_fpds_transactions(options["ids"]))
    elif options["file"]:
        self.load_fpds_from_file(options["file"])
    elif options["since_last_load"]:
        last_load = get_last_load_date("fpds")
        if not last_load:
            raise ValueError("No last load date for FPDS stored in the database")
        self.load_fpds_incrementally(last_load)

    self.update_award_records(awards=self.modified_award_ids, skip_cd_linkage=False)

    logger.info(f"Script took {datetime.now(timezone.utc) - update_time}")

    # failed_ids is populated at module level by the loader functions (not shown here)
    if failed_ids:
        failed_id_str = ", ".join([str(id) for id in failed_ids])
        logger.error(f"The following detached_award_procurement_ids failed to load: [{failed_id_str}]")
        raise SystemExit(1)

    if options["reload_all"] or options["since_last_load"]:
        # We wait until after the load finishes to update the load date, because if this crashes we'll need to load again
        update_last_load_date("fpds", update_time)

    logger.info("Successfully Completed")
def complete_process(self) -> None:
    if self.config["create_new_index"]:
        set_final_index_config(self.elasticsearch_client, self.config["index_name"])
        if self.config["skip_delete_index"]:
            printf({"msg": "Skipping deletion of old indices"})
        else:
            printf({"msg": "Closing old indices and adding aliases"})
            swap_aliases(self.elasticsearch_client, self.config["index_name"], self.config["load_type"])

    if self.config["snapshot"]:
        printf({"msg": "Taking snapshot"})
        take_snapshot(self.elasticsearch_client, self.config["index_name"], settings.ES_REPOSITORY)

    if self.config["is_incremental_load"]:
        printf({"msg": f"Storing datetime {self.config['processing_start_datetime']} for next incremental load"})
        update_last_load_date(f"es_{self.config['load_type']}", self.config["processing_start_datetime"])
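# The older variants above log through `printf({"msg": ...})` rather than a logger. A hedged,
# runnable sketch of what such a helper might do, assuming it simply timestamps and prints the
# dict's message; the timestamp format is an assumption.

from datetime import datetime, timezone

def printf_sketch(items: dict) -> None:
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S.%f")
    print(f"{ts} {items['msg']}")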