def run_load_steps(self) -> None:
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

    job_number = 0
    for fiscal_year in self.config["fiscal_years"]:
        job_number += 1
        index = self.config["index_name"]
        filename = str(
            self.config["directory"] / "{fy}_{type}.csv".format(fy=fiscal_year, type=self.config["load_type"])
        )
        new_job = DataJob(job_number, index, fiscal_year, filename)

        if Path(filename).exists():
            Path(filename).unlink()
        download_queue.put(new_job)

    printf({"msg": "There are {} jobs to process".format(job_number)})

    process_list = [
        Process(
            name="Download Process",
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config),
        ),
        Process(
            name="ES Index Process",
            target=es_data_loader,
            args=(self.elasticsearch_client, download_queue, es_ingest_queue, self.config),
        ),
    ]

    process_list[0].start()  # Start the download process

    if self.config["process_deletes"]:
        process_list.append(
            Process(
                name="S3 Deleted Records Scraper Process",
                target=deleted_transactions if self.config["load_type"] == "transactions" else deleted_awards,
                args=(self.elasticsearch_client, self.config),
            )
        )
        process_list[-1].start()  # Start the S3 csv fetch process
        while process_list[-1].is_alive():
            printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
            sleep(7)

    process_list[1].start()  # Start the ES ingest process

    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit("Fatal error: review logs to determine why process died.")
        elif all([not x.is_alive() for x in process_list]):
            printf({"msg": "All ETL processes completed execution with no error codes"})
            break
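
# The polling loop in run_load_steps relies on process_guarddog() to detect a
# failed worker, but that helper is not part of this snippet. The sketch below
# is only an assumption of the minimal contract the loop needs: return True
# when any process has stopped with a non-zero exit code, otherwise False.
# The name guarddog_sketch and the log message format are hypothetical.
def guarddog_sketch(process_list):
    for proc in process_list:
        if not proc.is_alive() and proc.exitcode not in (None, 0):
            printf({"msg": "Process '{}' died with exit code {}".format(proc.name, proc.exitcode)})
            return True
    return False
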
def run_load_steps(self) -> None:
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

    updated_record_count = get_updated_record_count(self.config)
    printf({"msg": f"Found {updated_record_count:,} {self.config['load_type']} records to index"})

    if updated_record_count == 0:
        jobs = 0
    else:
        download_queue, jobs = self.create_download_jobs()

    printf({"msg": f"There are {jobs} jobs to process"})

    process_list = [
        Process(
            name="Download Process",
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config),
        ),
        Process(
            name="ES Index Process",
            target=es_data_loader,
            args=(self.elasticsearch_client, download_queue, es_ingest_queue, self.config),
        ),
    ]

    if updated_record_count != 0:  # only run if there is data to process
        process_list[0].start()  # Start the download process

    if self.config["process_deletes"]:
        process_list.append(
            Process(
                name="S3 Deleted Records Scraper Process",
                target=deleted_transactions if self.config["load_type"] == "transactions" else deleted_awards,
                args=(self.elasticsearch_client, self.config),
            )
        )
        process_list[-1].start()  # Start the S3 csv fetch process
        while process_list[-1].is_alive():
            printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
            sleep(7)  # brief pause to make sure the deletes are processed in ES

    if updated_record_count != 0:
        process_list[1].start()  # Start the ES ingest process

    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit("Fatal error: review logs to determine why process died.")
        elif all([not x.is_alive() for x in process_list]):
            printf({"msg": "All ETL processes completed execution with no error codes"})
            break
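
# The second run_load_steps variant delegates job creation to
# self.create_download_jobs(), which is not included here. The sketch below is
# an assumption of its behavior, mirroring the per-fiscal-year loop from the
# first variant: build one DataJob per fiscal year, clear any leftover CSV,
# and return the populated queue plus the job count. The real method may
# differ in signature and filename layout.
def create_download_jobs(self):
    download_queue = Queue()
    job_number = 0
    for fiscal_year in self.config["fiscal_years"]:
        job_number += 1
        filename = str(
            self.config["directory"] / "{fy}_{type}.csv".format(fy=fiscal_year, type=self.config["load_type"])
        )
        new_job = DataJob(job_number, self.config["index_name"], fiscal_year, filename)
        if Path(filename).exists():
            Path(filename).unlink()  # remove a stale CSV so the download starts clean
        download_queue.put(new_job)
    return download_queue, job_number
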
def controller(self):
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

    job_number = 0
    for fy in self.config["fiscal_years"]:
        job_number += 1
        index = self.config["index_name"]
        filename = "{dir}{fy}_transactions.csv".format(dir=self.config["directory"], fy=fy)
        new_job = DataJob(job_number, index, fy, filename)

        if os.path.exists(filename):
            os.remove(filename)
        download_queue.put(new_job)

    printf({"msg": "There are {} jobs to process".format(job_number)})

    process_list = []
    process_list.append(
        Process(
            name="Download Process",
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config),
        )
    )
    process_list.append(
        Process(
            name="ES Index Process",
            target=es_data_loader,
            args=(ES, download_queue, es_ingest_queue, self.config),
        )
    )

    process_list[0].start()  # Start the download process

    if self.config["provide_deleted"]:
        process_list.append(
            Process(
                name="S3 Deleted Records Scraper Process",
                target=deleted_transactions,
                args=(ES, self.config),
            )
        )
        process_list[-1].start()  # Start the S3 csv fetch process
        while process_list[-1].is_alive():
            printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
            sleep(7)

    process_list[1].start()  # Start the ES ingest process

    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit(1)
        elif all([not x.is_alive() for x in process_list]):
            printf({"msg": "All ETL processes completed execution with no error codes"})
            break

    if self.config["reload_all"]:
        printf({"msg": "Closing old indices and adding aliases"})
        swap_aliases(ES, self.config["index_name"])

    if self.config["snapshot"]:
        printf({"msg": "Taking snapshot"})
        take_snapshot(ES, self.config["index_name"], settings.ES_REPOSITORY)
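
# DataJob is the unit of work passed through both queues, but its definition
# is not part of these snippets. The sketch below assumes a plain container
# matching the four-argument constructor used above (job number, index,
# fiscal year, csv path) plus the fields the workers read later (name for log
# messages, count for row totals). The project's actual class may carry more
# state; DataJobSketch is a hypothetical name.
class DataJobSketch:
    def __init__(self, name, index, fy, csv):
        self.name = name    # numeric job id, used in log messages
        self.index = index  # target Elasticsearch index
        self.fy = fy        # fiscal year covered by the CSV
        self.csv = csv      # path of the CSV to download and ingest
        self.count = None   # row count, populated once the CSV exists
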
def controller(self):
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

    job_id = 0
    for fy in self.config['fiscal_years']:
        for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
            job_id += 1
            index = self.config['index_name']
            filename = '{dir}{fy}_transactions_{type}.csv'.format(
                dir=self.config['directory'],
                fy=fy,
                type=awd_cat_idx.replace(' ', ''))
            new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

            if os.path.exists(filename):
                # This is mostly for testing. If previous CSVs still exist, skip the download for that file
                if self.config['stale']:
                    new_job.count = count_rows_in_csv_file(filename, has_header=True, safe=False)
                    printf({
                        'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                        'job': new_job.name,
                        'f': 'Download'})
                    # Add job directly to the Elasticsearch ingest queue since the CSV exists
                    es_ingest_queue.put(new_job)
                    continue
                else:
                    os.remove(filename)
            download_queue.put(new_job)

    printf({'msg': 'There are {} jobs to process'.format(job_id)})

    process_list = []
    process_list.append(
        Process(
            name='Download Process',
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config)))
    process_list.append(
        Process(
            name='ES Index Process',
            target=es_data_loader,
            args=(ES, download_queue, es_ingest_queue, self.config)))

    process_list[0].start()  # Start the download process

    if self.config['provide_deleted']:
        process_list.append(
            Process(
                name='S3 Deleted Records Scraper Process',
                target=deleted_transactions,
                args=(ES, self.config)))
        process_list[-1].start()  # Start the S3 csv fetch process
        while process_list[-1].is_alive():
            printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
            sleep(7)

    process_list[1].start()  # Start the ES ingest process

    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit(1)
        elif all([not x.is_alive() for x in process_list]):
            printf({'msg': 'All ETL processes completed execution with no error codes'})
            break

    if self.config['swap']:
        printf({'msg': 'Closing old indices and adding aliases'})
        swap_aliases(ES, self.config['index_name'])

    if self.config['snapshot']:
        printf({'msg': 'Taking snapshot'})
        take_snapshot(ES, self.config['index_name'], settings.ES_REPOSITORY)
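
# The controller variants above finish by calling swap_aliases() and
# take_snapshot(), neither of which is shown here. The sketch below is one
# plausible shape for the snapshot step using the elasticsearch-py client,
# assuming a date-stamped snapshot name and a repository that is already
# registered; the naming scheme and wait_for_completion choice are assumptions.
from datetime import datetime

def take_snapshot_sketch(client, index_name, repository):
    snapshot_name = '{}-{}'.format(index_name, datetime.now().strftime('%Y%m%d%H%M%S'))
    client.snapshot.create(
        repository=repository,
        snapshot=snapshot_name,
        body={'indices': index_name},
        wait_for_completion=False,
    )
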