Example #1
    def create_local_file(self, award_type, source, agency_code,
                          generate_since):
        """ Generate complete file from SQL query and S3 bucket deletion files, then zip it locally """
        logger.info('Generating CSV file with creations and modifications')

        # Create file paths and working directory
        timestamp = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S%f')
        working_dir = '{}_{}_delta_gen_{}/'.format(settings.CSV_LOCAL_PATH,
                                                   agency_code, timestamp)
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)
        source_name = '{}_{}_delta'.format(
            award_type, VALUE_MAPPINGS['transactions']['download_name'])
        source_path = os.path.join(working_dir, '{}.csv'.format(source_name))

        # Create a unique temporary file with the raw query
        raw_quoted_query = generate_raw_quoted_query(
            source.row_emitter(None))  # None requests all headers
        csv_query_annotated = self.apply_annotations_to_sql(
            raw_quoted_query, source.human_names)
        (temp_sql_file,
         temp_sql_file_path) = tempfile.mkstemp(prefix='bd_sql_', dir='/tmp')
        with open(temp_sql_file_path, 'w') as file:
            file.write('\\copy ({}) To STDOUT with CSV HEADER'.format(
                csv_query_annotated))

        # Generate the csv with \copy
        cat_command = subprocess.Popen(['cat', temp_sql_file_path],
                                       stdout=subprocess.PIPE)
        subprocess.check_output(
            ['psql', '-o', source_path, os.environ['DOWNLOAD_DATABASE_URL'],
             '-v', 'ON_ERROR_STOP=1'],
            stdin=cat_command.stdout,
            stderr=subprocess.STDOUT)

        # Append deleted rows to the end of the file
        self.add_deletion_records(source_path, working_dir, award_type,
                                  agency_code, source, generate_since)
        if csv_row_count(source_path, has_header=True) > 0:
            # Split the CSV into multiple files and zip it up
            zipfile_path = '{}{}_{}_Delta_{}.zip'.format(
                settings.CSV_LOCAL_PATH, agency_code, award_type,
                datetime.strftime(date.today(), '%Y%m%d'))
            logger.info('Creating compressed file: {}'.format(
                os.path.basename(zipfile_path)))
            split_and_zip_csvs(zipfile_path, source_path, source_name)
        else:
            zipfile_path = None

        os.close(temp_sql_file)
        os.remove(temp_sql_file_path)
        shutil.rmtree(working_dir)

        return zipfile_path
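
The export above leans on psql's \copy meta-command: the annotated query is written to a temporary SQL file and streamed into psql, which writes its output to the target CSV. Below is a minimal, self-contained sketch of that same pattern; the query, output path, and connection string are placeholders, and the SQL file is fed to psql directly instead of being piped through cat.

# Minimal sketch of the \copy-to-CSV export pattern used above.
# All names and values here are illustrative, not the project's configuration.
import os
import subprocess
import tempfile

def export_query_to_csv(query, output_path, database_url):
    """Stream a SELECT's results into a local CSV file via psql's \\copy."""
    # Write the meta-command to a temporary file so psql can read it on stdin.
    fd, sql_path = tempfile.mkstemp(prefix='bd_sql_', suffix='.sql')
    try:
        with open(sql_path, 'w') as sql_file:
            sql_file.write('\\copy ({}) To STDOUT with CSV HEADER'.format(query))
        with open(sql_path) as sql_file:
            # ON_ERROR_STOP makes psql exit non-zero on any SQL error, so
            # check_output raises CalledProcessError instead of failing silently.
            subprocess.check_output(
                ['psql', '-o', output_path, database_url, '-v', 'ON_ERROR_STOP=1'],
                stdin=sql_file, stderr=subprocess.STDOUT)
    finally:
        os.close(fd)
        os.remove(sql_path)

# Example call (hypothetical values):
# export_query_to_csv('SELECT * FROM transactions LIMIT 10',
#                     '/tmp/transactions.csv', os.environ['DOWNLOAD_DATABASE_URL'])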
Example #2
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        # Queue for jobs which have a csv and are ready for ES ingest
        es_ingest_queue = Queue(10)

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                award_category = AWARD_DESC_CATEGORIES[awd_cat_idx]
                index = '{}-{}-{}'.format(settings.TRANSACTIONS_INDEX_ROOT,
                                          award_category, fy)
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If a previous CSV still exists, skip the download for that file
                    if self.config['stale']:
                        new_job.count = csv_row_count(filename)
                        printf({
                            'msg': 'Using existing file: {} | count {}'.format(
                                filename, new_job.count),
                            'job': new_job.name,
                            'f': 'Download'
                        })
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        if self.config['provide_deleted']:
            s3_delete_process = Process(target=deleted_transactions,
                                        args=(ES, self.config))
        download_process = Process(target=download_db_records,
                                   args=(download_queue, es_ingest_queue,
                                         self.config))
        es_index_process = Process(target=es_data_loader,
                                   args=(ES, download_queue, es_ingest_queue,
                                         self.config))

        download_process.start()

        if self.config['provide_deleted']:
            s3_delete_process.start()
            while s3_delete_process.is_alive():
                printf({
                    'msg':
                    'Waiting to start ES ingest until S3 deletes are complete'
                })
                sleep(7)

        es_index_process.start()

        if self.config['provide_deleted']:
            s3_delete_process.join()
        download_process.join()
        es_index_process.join()
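
The controller above is a producer/consumer pipeline built on multiprocessing: a download process fills one queue with finished CSV jobs while an Elasticsearch loader drains it, and a bounded queue keeps the downloader from running too far ahead. The stripped-down sketch below shows just that hand-off; the worker bodies are placeholders standing in for the project's download_db_records and es_data_loader, and the jobs are plain integers rather than DataJob objects.

# Stripped-down sketch of the queue hand-off used by controller().
from multiprocessing import Process, Queue

def downloader(download_queue, es_ingest_queue):
    while True:
        job = download_queue.get()
        if job is None:               # sentinel: no more jobs to download
            break
        # ... download the CSV for `job` here ...
        es_ingest_queue.put(job)      # hand the finished CSV to the ingester
    es_ingest_queue.put(None)         # tell the ingester it can stop

def ingester(es_ingest_queue):
    while True:
        job = es_ingest_queue.get()   # blocks until a CSV is ready
        if job is None:
            break
        print('ingesting job', job)

if __name__ == '__main__':
    download_queue = Queue()
    es_ingest_queue = Queue(10)       # bounded, like es_ingest_queue above
    for job_id in range(1, 6):
        download_queue.put(job_id)
    download_queue.put(None)
    procs = [Process(target=downloader, args=(download_queue, es_ingest_queue)),
             Process(target=ingester, args=(es_ingest_queue,))]
    for p in procs:
        p.start()
    for p in procs:
        p.join()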
Example #3
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        # Queue for jobs which have a csv and are ready for ES ingest
        es_ingest_queue = Queue(20)

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                index = self.config['index_name']
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If a previous CSV still exists, skip the download for that file
                    if self.config['stale']:
                        new_job.count = csv_row_count(filename)
                        printf({
                            'msg': 'Using existing file: {} | count {}'.format(
                                filename, new_job.count),
                            'job': new_job.name,
                            'f': 'Download'
                        })
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        process_list = []
        process_list.append(
            Process(name='Download Process',
                    target=download_db_records,
                    args=(download_queue, es_ingest_queue, self.config)))
        process_list.append(
            Process(name='ES Index Process',
                    target=es_data_loader,
                    args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config['provide_deleted']:
            process_list.append(
                Process(name='S3 Deleted Records Scraper Process',
                        target=deleted_transactions,
                        args=(ES, self.config)))
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({
                    'msg':
                    'Waiting to start ES ingest until S3 deletes are complete'
                })
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit
            elif all([not x.is_alive() for x in process_list]):
                printf({
                    'msg':
                    'All ETL processes completed execution with no error codes'
                })
                break

        if self.config['swap']:
            printf({'msg': 'Closing old indices and adding aliases'})
            swap_aliases(ES, self.config['index_name'])

        if self.config['snapshot']:
            printf({'msg': 'Taking snapshot'})
            take_snapshot(ES, self.config['index_name'],
                          settings.ES_REPOSITORY)
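
process_guarddog is not defined in these examples; judging from how the monitoring loop above uses it, it reports whether any of the child processes has failed. The stand-in below is an assumption about that behavior, not the project's implementation: it checks each multiprocessing.Process's exitcode.

# Hypothetical stand-in for process_guarddog: return True if any child
# process has exited with a non-zero status (or was killed by a signal,
# which multiprocessing reports as a negative exitcode).
def process_guarddog_sketch(process_list):
    for proc in process_list:
        # exitcode is None while the process is still running and 0 on success
        if proc.exitcode not in (None, 0):
            print('{} failed with exitcode {}'.format(proc.name, proc.exitcode))
            return True
    return False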