def create_local_file(self, award_type, source, agency_code, generate_since):
    """ Generate complete file from SQL query and S3 bucket deletion files, then zip it locally """
    logger.info('Generating CSV file with creations and modifications')

    # Create file paths and working directory
    timestamp = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S%f')
    working_dir = '{}_{}_delta_gen_{}/'.format(settings.CSV_LOCAL_PATH, agency_code, timestamp)
    if not os.path.exists(working_dir):
        os.mkdir(working_dir)
    source_name = '{}_{}_delta'.format(award_type, VALUE_MAPPINGS['transactions']['download_name'])
    source_path = os.path.join(working_dir, '{}.csv'.format(source_name))

    # Create a unique temporary file with the raw query
    raw_quoted_query = generate_raw_quoted_query(source.row_emitter(None))  # None requests all headers
    csv_query_annotated = self.apply_annotations_to_sql(raw_quoted_query, source.human_names)
    (temp_sql_file, temp_sql_file_path) = tempfile.mkstemp(prefix='bd_sql_', dir='/tmp')
    with open(temp_sql_file_path, 'w') as file:
        file.write('\\copy ({}) To STDOUT with CSV HEADER'.format(csv_query_annotated))

    # Generate the csv with \copy
    cat_command = subprocess.Popen(['cat', temp_sql_file_path], stdout=subprocess.PIPE)
    subprocess.check_output(
        ['psql', '-o', source_path, os.environ['DOWNLOAD_DATABASE_URL'], '-v', 'ON_ERROR_STOP=1'],
        stdin=cat_command.stdout, stderr=subprocess.STDOUT)

    # Append deleted rows to the end of the file
    self.add_deletion_records(source_path, working_dir, award_type, agency_code, source, generate_since)
    if csv_row_count(source_path, has_header=True) > 0:
        # Split the CSV into multiple files and zip it up
        zipfile_path = '{}{}_{}_Delta_{}.zip'.format(
            settings.CSV_LOCAL_PATH, agency_code, award_type, datetime.strftime(date.today(), '%Y%m%d'))
        logger.info('Creating compressed file: {}'.format(os.path.basename(zipfile_path)))
        split_and_zip_csvs(zipfile_path, source_path, source_name)
    else:
        zipfile_path = None

    os.close(temp_sql_file)
    os.remove(temp_sql_file_path)
    shutil.rmtree(working_dir)

    return zipfile_path
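
# The \copy-through-psql export used above can be exercised in isolation. The sketch
# below is a minimal, hypothetical standalone version (the function name, example
# query, and output path are assumptions, not values from this module): it writes the
# meta-command to a temp file, feeds it to psql on stdin, and cleans up afterwards.
# Passing the file handle directly as stdin avoids the extra `cat` subprocess, and
# ON_ERROR_STOP=1 makes psql exit non-zero on failure so check_output() raises
# instead of silently leaving a partial CSV behind.
import os
import subprocess
import tempfile


def export_query_to_csv(database_url, query, output_path):
    """Run `\\copy (<query>) To STDOUT with CSV HEADER` through psql into output_path."""
    sql_fd, sql_path = tempfile.mkstemp(prefix='bd_sql_', dir='/tmp')
    try:
        with open(sql_path, 'w') as sql_file:
            sql_file.write('\\copy ({}) To STDOUT with CSV HEADER'.format(query))
        with open(sql_path, 'rb') as sql_file:
            subprocess.check_output(
                ['psql', '-o', output_path, database_url, '-v', 'ON_ERROR_STOP=1'],
                stdin=sql_file, stderr=subprocess.STDOUT)
    finally:
        os.close(sql_fd)
        os.remove(sql_path)


# Example usage (hypothetical values):
# export_query_to_csv(os.environ['DOWNLOAD_DATABASE_URL'],
#                     'SELECT 1 AS example_column', '/tmp/example.csv')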
def controller(self):
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(10)  # Queue for jobs which have a csv and are ready for ES ingest
    job_id = 0

    for fy in self.config['fiscal_years']:
        for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
            job_id += 1
            award_category = AWARD_DESC_CATEGORIES[awd_cat_idx]
            index = '{}-{}-{}'.format(settings.TRANSACTIONS_INDEX_ROOT, award_category, fy)
            filename = '{dir}{fy}_transactions_{type}.csv'.format(
                dir=self.config['directory'],
                fy=fy,
                type=awd_cat_idx.replace(' ', ''))

            new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

            if os.path.exists(filename):
                # This is mostly for testing. If previous CSVs still exist, skip the download for that file
                if self.config['stale']:
                    new_job.count = csv_row_count(filename)
                    printf({
                        'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                        'job': new_job.name,
                        'f': 'Download'})
                    # Add job directly to the Elasticsearch ingest queue since the CSV exists
                    es_ingest_queue.put(new_job)
                    continue
                else:
                    os.remove(filename)
            download_queue.put(new_job)

    printf({'msg': 'There are {} jobs to process'.format(job_id)})

    if self.config['provide_deleted']:
        s3_delete_process = Process(target=deleted_transactions, args=(ES, self.config))
    download_process = Process(target=download_db_records, args=(download_queue, es_ingest_queue, self.config))
    es_index_process = Process(target=es_data_loader, args=(ES, download_queue, es_ingest_queue, self.config))

    download_process.start()

    if self.config['provide_deleted']:
        s3_delete_process.start()
        while s3_delete_process.is_alive():
            printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
            sleep(7)

    es_index_process.start()

    if self.config['provide_deleted']:
        s3_delete_process.join()
    download_process.join()
    es_index_process.join()
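
# For context on the queue handoff above: download_db_records acts as a producer
# feeding es_ingest_queue, and es_data_loader consumes from it. Because the ingest
# queue is bounded (maxsize=10), the producer blocks when ingest falls behind, so
# downloads cannot run arbitrarily far ahead of Elasticsearch. The sketch below is a
# hypothetical, simplified pair of workers (the job payloads, worker names, and
# sentinel handling are assumptions, not the real DataJob flow).
from multiprocessing import Process, Queue


def fake_downloader(download_queue, ingest_queue):
    """Pull job ids, pretend to download a CSV, then hand the job to the ingester."""
    while True:
        job = download_queue.get()
        if job is None:              # Sentinel: no more work
            ingest_queue.put(None)
            break
        ingest_queue.put(job)        # Blocks when the ingest queue is full (backpressure)


def fake_ingester(ingest_queue):
    """Consume jobs until the sentinel arrives."""
    while True:
        job = ingest_queue.get()
        if job is None:
            break
        print('ingesting job', job)


if __name__ == '__main__':
    downloads, ingests = Queue(), Queue(10)
    for job_id in range(1, 6):
        downloads.put(job_id)
    downloads.put(None)
    workers = [Process(target=fake_downloader, args=(downloads, ingests)),
               Process(target=fake_ingester, args=(ingests,))]
    for w in workers:
        w.start()
    for w in workers:
        w.join()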
def controller(self):
    download_queue = Queue()  # Queue for jobs which need a csv downloaded
    es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest
    job_id = 0

    for fy in self.config['fiscal_years']:
        for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
            job_id += 1
            index = self.config['index_name']
            filename = '{dir}{fy}_transactions_{type}.csv'.format(
                dir=self.config['directory'],
                fy=fy,
                type=awd_cat_idx.replace(' ', ''))

            new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

            if os.path.exists(filename):
                # This is mostly for testing. If previous CSVs still exist, skip the download for that file
                if self.config['stale']:
                    new_job.count = csv_row_count(filename)
                    printf({
                        'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                        'job': new_job.name,
                        'f': 'Download'})
                    # Add job directly to the Elasticsearch ingest queue since the CSV exists
                    es_ingest_queue.put(new_job)
                    continue
                else:
                    os.remove(filename)
            download_queue.put(new_job)

    printf({'msg': 'There are {} jobs to process'.format(job_id)})

    process_list = []
    process_list.append(Process(
        name='Download Process',
        target=download_db_records,
        args=(download_queue, es_ingest_queue, self.config)))
    process_list.append(Process(
        name='ES Index Process',
        target=es_data_loader,
        args=(ES, download_queue, es_ingest_queue, self.config)))

    process_list[0].start()  # Start the download process

    if self.config['provide_deleted']:
        process_list.append(Process(
            name='S3 Deleted Records Scraper Process',
            target=deleted_transactions,
            args=(ES, self.config)))
        process_list[-1].start()  # Start the S3 CSV fetch process
        while process_list[-1].is_alive():
            printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
            sleep(7)

    process_list[1].start()  # Start the ES ingest process

    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit
        elif all([not x.is_alive() for x in process_list]):
            printf({'msg': 'All ETL processes completed execution with no error codes'})
            break

    if self.config['swap']:
        printf({'msg': 'Closing old indices and adding aliases'})
        swap_aliases(ES, self.config['index_name'])

    if self.config['snapshot']:
        printf({'msg': 'Taking snapshot'})
        take_snapshot(ES, self.config['index_name'], settings.ES_REPOSITORY)
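
# process_guarddog above is expected to detect a worker that died with a non-zero
# exit code so the controller can abort instead of waiting forever. The sketch below
# is a hypothetical stand-in illustrating that contract (the real helper's logging
# and termination behavior may differ): it returns True as soon as any finished
# process reports a failing exit code.
def example_process_watchdog(process_list):
    """Return True if any process in process_list exited with a non-zero exit code."""
    for proc in process_list:
        if proc.is_alive():
            continue                     # Still running; nothing to report yet
        if proc.exitcode not in (0, None):
            print('{} failed with exit code {}'.format(proc.name, proc.exitcode))
            return True                  # Caller should stop the ETL run
    return False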