Example #1
    def update_stat_log(self, source_file_name):

        # OLD: get references to stage database and catalog schema
        # data_stage_database = udp.udp_stage_database
        # data_catalog_schema = udp.udp_catalog_schema

        db = database.MSSQL(self.config(self.project.database))

        conn = db.conn
        # cursor = conn.cursor()

        db_conn = database.Database('mssql', conn)
        db_conn.use_database('udp_stage')

        # TODO: Will json_pickle restore datetime values without explicit conversion?
        # TODO: Wrapper for stat insert that takes the intersection of the JSON and the target record's schema
        #       and explicitly inserts the column/value pairs they have in common; this requires that
        #       our JSON payload includes column names, not just rows of column values.

        # extract job.log/last_job.log from capture zip and merge these into stat_log table
        job_log_data = read_archived_file(source_file_name,
                                          'job.log',
                                          default=None)
        if job_log_data:
            job_log_json = json.loads(job_log_data)
            for row in job_log_json:
                row['start_time'] = iso_to_datetime(row['start_time']).datetime
                row['end_time'] = iso_to_datetime(row['end_time']).datetime

                # skip capture stats which only have intermediate end_time and run_time values
                # the next capture file will include an accurate version of this stat in the last_job.log file
                if row['stat_name'] != 'capture':
                    db_conn.insert_into_table('udp_catalog', 'stat_log', **row)

        # if 'last_job.log' in archive.namelist():
        job_log_data = read_archived_file(source_file_name,
                                          'last_job.log',
                                          default=None)
        if job_log_data:
            last_job_log_json = json.loads(job_log_data)
            for row in last_job_log_json:
                row['start_time'] = iso_to_datetime(row['start_time']).datetime
                row['end_time'] = iso_to_datetime(row['end_time']).datetime
                if row['stat_name'] in ('capture', 'compress', 'upload'):
                    db_conn.insert_into_table('udp_catalog', 'stat_log', **row)
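read_archived_file() is referenced above but not defined in this example. A minimal sketch of what it might look like, assuming it simply returns the decoded text of one member of the capture zip and falls back to a default when the member is missing (the signature is inferred from the call sites):

import zipfile


def read_archived_file(archive_file_name, member_name, default=None):
    # hypothetical helper: return the decoded text of one zip member,
    # or the supplied default when the member is not in the archive
    with zipfile.ZipFile(archive_file_name) as archive:
        if member_name not in archive.namelist():
            return default
        return archive.read(member_name).decode('utf-8')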
Example #2
    def main(self):
        logger.info(
            f"{now():%Y-%m-%d %H:%M:%S}: Polling for queued files to process ..."
        )

        # connect to target database
        database_target = self.project.database_target
        db_resource = self.config(database_target)
        db = database.MSSQL(db_resource)
        # db.debug_flag = True
        self.target_db_conn = database.Database("mssql", db.conn)
        self.target_db_conn.use_database("udp_stage")

        # process queued files, then exit and let daemon scheduler loop handle polling
        have_archive_file_to_process = True
        while have_archive_file_to_process:
            # keep processing archived files until there are no more to process
            have_archive_file_to_process = self.process_next_file_to_stage()
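The comment above defers polling to a daemon scheduler loop that is not shown. A hypothetical sketch of that outer loop, assuming the scheduler simply re-runs main() on a fixed interval (run_daemon and poll_seconds are illustrative names, not part of the source):

import time


def run_daemon(stage_app, poll_seconds=60):
    # hypothetical outer loop: drain all queued files via main(),
    # then sleep before polling again
    while True:
        stage_app.main()
        time.sleep(poll_seconds)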
Example #3
def setup(config):
    # Future:
    # - Drive this setup from a project_setup.ini file
    # - Add activity_log (vs job_log/stat_log) references
    # - Debug should also enable/disable SQL output and timing/record count stats

    db_resource = config("database:amc_dsg_udp_stage")
    db = database.MSSQL(db_resource)
    db_conn = database.Database("mssql", db.conn)

    # create data stage database if not present; then use
    db_conn.create_database(udp_stage_database)
    db_conn.use_database(udp_stage_database)

    # create udp sys schema if not present
    db_conn.create_schema(udp_sys_schema)

    # create udp sys tables if not present
    db_conn.create_named_table(udp_sys_schema, "nst_lookup")
    db_conn.create_named_table(udp_sys_schema, "job_log")
    db_conn.create_named_table(udp_sys_schema, "stat_log")
    db_conn.create_named_table(udp_sys_schema, "table_log")
    db_conn.create_named_table(udp_sys_schema, "stage_arrival_queue")
    db_conn.create_named_table(udp_sys_schema, "stage_pending_queue")
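create_database(), create_schema(), and create_named_table() are project wrapper methods whose implementations are not shown. As an illustration of the "create if not present" behavior on SQL Server, a schema can be created idempotently roughly like this (a sketch only; it assumes schema names come from trusted config, not user input):

def create_schema_if_missing(cursor, schema_name):
    # create the schema only when it does not already exist; CREATE SCHEMA
    # must be the only statement in its batch, hence the EXEC wrapper
    cursor.execute(
        f"IF NOT EXISTS (SELECT 1 FROM sys.schemas WHERE name = '{schema_name}') "
        f"EXEC('CREATE SCHEMA {schema_name}')"
    )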
Example #4
def main():

    # bootstrap configuration settings
    bootstrap = config.Bootstrap()
    bootstrap.debug_flag = False
    bootstrap.load('conf/init.ini')
    bootstrap.load('conf/bootstrap.ini')

    # project
    project_name = 'udp_aws_stage_01_etl'
    project_config = config.Config(f'conf/{project_name}.project',
                                   config.ProjectSection, bootstrap)
    project_config.debug_flag = False
    project_config.dump()
    project_object = project_config.sections['stage_project']

    # make sure the core database environment is in place
    udp.setup()

    # get references to stage database and catalog schema
    # udp_stage_database = udp.udp_stage_database
    # udp_catalog_schema = udp.udp_catalog_schema

    # connections
    database_connect_name = f'{project_object.database}'
    cloud_connect_name = f'{project_object.cloud}'

    # SQL Server
    connect_config = config.Config('conf/_connect.ini',
                                   config.ConnectionSection, bootstrap)
    sql_server_connect = connect_config.sections[database_connect_name]

    db = database.MSSQL(sql_server_connect)
    db.debug_flag = True
    conn = db.conn
    # cursor = conn.cursor()

    # use the udp_stage database (created by udp.setup() above if missing)
    db_conn = database.Database('mssql', conn)
    db_conn.use_database('udp_stage')

    # TODO: These names should come from the project file
    # TODO: queue_name should be split into input_queue_name and output_queue_name
    # archive_objectstore_name = 'udp-s3-archive-sandbox'
    # queue_name = 'udp-sqs-archive-sandbox'
    archive_objectstore_name = f'{project_object.archive_objectstore}'
    archive_queue_name = f'{project_object.archive_queue}'
    stage_queue_name = f'{project_object.stage_queue}'

    # get connection info
    connect_config = config.Config('conf/_connect.ini',
                                   config.ConnectionSection, bootstrap)
    cloud_connection = connect_config.sections[cloud_connect_name]

    archive_object_store = cloud.ObjectStore(archive_objectstore_name,
                                             cloud_connection)
    if project_object.archive_queue:
        archive_queue = cloud.Queue(archive_queue_name, cloud_connection)
    else:
        archive_queue = None

    if project_object.stage_queue:
        stage_queue = cloud.Queue(stage_queue_name, cloud_connection)
    else:
        stage_queue = None

    # main poll loop
    while True:
        # logger.info(f'{datetime.datetime.today():%Y-%m-%d %H:%M:%S}: Polling for archive updates ...')
        archive_file_found = process_next_file_to_stage(
            db_conn, archive_object_store, stage_queue)

        # clear archive queue messages
        # TODO: Drop archive queue except as a diagnostic monitoring tool?
        if archive_queue:
            response = archive_queue.get()
            notification = cloud.ObjectStoreNotification(response)
            if notification.message_id:
                if not notification.objectstore_name:
                    # unexpected notification - log it, then delete it
                    logger.debug(f'Ignoring message: {notification.message}')
                    archive_queue.delete(notification.message_id)
                else:
                    # archive_file_name = stage_archive_file(archive_object_store, notification)
                    archive_queue.delete(notification.message_id)

        # poll if we didn't find an archived file, otherwise keep processing
        if not archive_file_found:
            # poll
            time.sleep(int(project_object.poll_frequency))
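cloud.ObjectStoreNotification is part of the project's cloud wrapper and is not shown. A hypothetical sketch of the parsing it implies, assuming the archive queue is an SQS queue delivering standard S3 event notification JSON (the attribute names mirror the code above; the message layout is an assumption):

import json


class ObjectStoreNotification:
    # hypothetical parser for an SQS response carrying an S3 event notification
    def __init__(self, response):
        messages = (response or {}).get('Messages', [])
        message = messages[0] if messages else {}
        self.message_id = message.get('MessageId')
        self.message = message.get('Body')
        self.objectstore_name = None
        self.object_key = None
        if self.message:
            body = json.loads(self.message)
            for record in body.get('Records', []):
                self.objectstore_name = record['s3']['bucket']['name']
                self.object_key = record['s3']['object']['key']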
Example #5
    def main(self):
        db = None
        try:
            # get job id and table history
            job_history_file_name = f'{self.state_folder_name}/capture.job'
            job_history = JobHistory(job_history_file_name)
            job_history.load()
            job_id = job_history.job_id
            self.job_id = job_id
            logger.info(f'\nCapture job {job_id} for {self.namespace} ...')

            # track job (and table) stats
            self.stats = Stats(f'{self.work_folder_name}/job.log',
                               namespace=self.namespace,
                               job_id=job_id)
            self.stats.start('capture', 'job')

            # track overall job row count and file size
            self.job_row_count = 0
            self.job_file_size = 0

            # create/clear job folders
            create_folder(self.state_folder_name)
            clear_folder(self.work_folder_name)
            clear_folder(self.publish_folder_name)

            # connect to source database
            db = None
            db_engine = None
            if self.database.platform == 'postgresql':
                db = database.PostgreSQL(self.database)
                db_engine = database.Database('postgresql', db.conn)

            elif self.database.platform == 'mssql':
                db = database.MSSQL(self.database)
                db_engine = database.Database('mssql', db.conn)

            # cursor = db.conn.cursor()

            # determine the current timestamp for this job's run:
            # get current_timestamp() from the source database with step-back and fast-forward logic
            current_timestamp = self.current_timestamp(db_engine)

            # process all tables
            self.stats.start('extract', 'step')
            for table_name, table_object in self.table_config.sections.items():
                table_history = job_history.get_table_history(table_name)
                self.process_table(db, db_engine, self.database.schema,
                                   table_name, table_object, table_history,
                                   current_timestamp)
            self.stats.stop('extract', self.job_row_count, self.job_file_size)

            # save interim job stats to work_folder before compressing this folder
            self.stats.stop('capture', self.job_row_count, self.job_file_size)
            self.stats.save()

            # compress work_folder files to publish_folder zip file
            self.compress_work_folder()

            # upload publish_folder zip file
            self.upload_to_objectstore()

            # save final stats for complete job run
            self.stats.stop('capture', self.job_row_count, self.job_file_size)
            self.stats.save(f'{self.state_folder_name}/last_job.log')
            self.stats.save()

            # update job_id and table histories
            job_history.save()

            # compress capture_state and save to capture objectstore for recovery
            self.save_recovery_state_file()

            # update schedule's poll message
            last_job_info = f'last job {self.job_id} on {datetime.datetime.now():%Y-%m-%d %H:%M}'
            schedule_info = f'schedule: {self.schedule}'
            self.schedule.poll_message = f'{script_name()}({self.namespace}), {last_job_info}, {schedule_info}'

        # force unhandled exceptions to be exposed
        except Exception:
            logger.exception('Unexpected exception')
            raise

        finally:
            # explicitly close database connection when finished with job
            with contextlib.suppress(Exception):
                db.conn.close()
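compress_work_folder() is described only by its comment. A minimal sketch of that step, assuming it writes every file in the work folder into a single zip destined for the publish folder (the function and argument names here are illustrative):

import pathlib
import zipfile


def compress_folder(source_folder, zip_file_name):
    # hypothetical helper: add every file under source_folder to one
    # deflate-compressed zip, preserving paths relative to that folder
    with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        for path in sorted(pathlib.Path(source_folder).rglob('*')):
            if path.is_file():
                archive.write(path, arcname=path.relative_to(source_folder))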
Example #6
    def main(self):
        db = None
        try:
            # track dataset name for naming generated files and folders
            self.dataset_name = self.namespace.dataset

            # get job id and table history
            job_history_file_name = f'{self.state_folder}/capture.job'
            job_history = JobHistory(job_history_file_name)
            job_history.load()
            job_id = job_history.job_id
            self.job_id = job_id
            logger.info(f'\nCapture job {job_id} for {self.dataset_name} ...')
            self.progress_message(f'starting job {job_id} ...')

            # track job (and table) metrics
            dataset_id = self.namespace.dataset
            self.events = Events(f'{self.work_folder}/job.log',
                                 dataset_id=dataset_id,
                                 job_id=job_id)
            self.events.start('capture', 'job')

            # track overall job row count and file size
            self.job_row_count = 0
            self.job_data_size = 0

            # create/clear job folders
            create_folder(self.state_folder)
            clear_folder(self.work_folder)
            clear_folder(self.publish_folder)

            # connect to source database
            self.database = self.config(self.project.database_source)
            if self.database.platform == 'postgresql':
                db = database.PostgreSQL(self.database)
                db_engine = database.Database('postgresql', db.conn)
            elif self.database.platform == 'mssql':
                db = database.MSSQL(self.database)
                db_engine = database.Database('mssql', db.conn)
            else:
                raise NotImplementedError(
                    f'Unknown database platform ({self.database.platform})')

            # determine the current timestamp for this job's run:
            # get current_timestamp() from the source database with step-back and fast-forward logic
            current_timestamp = self.current_timestamp(db_engine)

            # process all tables
            self.events.start('extract', 'step')

            # build dict of table objects indexed by table name
            self.tables = dict()
            for section_name, section_object in self.config.sections.items():
                if section_name.startswith('table:'):
                    table_name = section_name.partition(':')[2]
                    self.tables[table_name] = section_object

            # extract data from each table
            for table_name, table_object in self.tables.items():
                table_history = job_history.get_table_history(table_name)

                # get current_sequence from source database
                if table_object.cdc == 'sequence':
                    current_sequence = db_engine.current_sequence(table_name)
                else:
                    current_sequence = 0

                self.process_table(db, db_engine, self.database.schema,
                                   table_name, table_object, table_history,
                                   current_timestamp, current_sequence)
            self.events.stop('extract', self.job_row_count, self.job_data_size)

            # save interim job metrics to work_folder before compressing this folder
            self.events.stop('capture', self.job_row_count, self.job_data_size)
            self.events.save()

            # compress work_folder files to publish_folder zip file
            self.compress_work_folder()

            # upload publish_folder zip file
            self.upload_to_blobstore()

            # save final metrics for complete job run
            self.events.stop('capture', self.job_row_count, self.job_data_size)
            self.events.save(f'{self.state_folder}/last_job.log')
            self.events.save()

            # update job_id and table histories
            if not self.option('notransfer'):
                # only save job history if we're transferring data to landing
                job_history.save()

            # compress capture_state and save to capture blobstore for recovery
            self.save_recovery_state_file()

            # update schedule's poll message
            last_job_info = f'last job {self.job_id} on {datetime.datetime.now():%Y-%m-%d %H:%M}'
            schedule_info = f'schedule: {self.schedule}'
            self.schedule.poll_message = f'{script_name()}({self.dataset_name}), {last_job_info}, {schedule_info}'

        # force unhandled exceptions to be exposed
        except Exception:
            logger.exception('Unexpected exception')
            raise

        finally:
            # explicitly close database connection when finished with job
            with contextlib.suppress(Exception):
                db.conn.close()
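create_folder() and clear_folder() are small helpers that are not shown. A sketch of the behavior their use above implies (create a folder if missing; empty the work folders before a new run), assuming pathlib/shutil semantics:

import pathlib
import shutil


def create_folder(folder_name):
    # hypothetical helper: create the folder (and any parents) if not present
    pathlib.Path(folder_name).mkdir(parents=True, exist_ok=True)


def clear_folder(folder_name):
    # hypothetical helper: remove the folder's contents, then recreate it empty
    shutil.rmtree(folder_name, ignore_errors=True)
    create_folder(folder_name)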
Example #7
    def main(self):
        if len(sys.argv) == 1:
            dataset_table_parameter = ''
        else:
            dataset_table_parameter = sys.argv[1].lower()

        if not dataset_table_parameter:
            print('\nError: Must specify <dataset>.<table> to drop.\n')
        else:
            # TODO: Rename dataset_name > dataset_id
            # parse out dataset and table name
            dataset_name, _, table_name = dataset_table_parameter.partition(
                '.')

            # load dataset's capture.job
            job_history_file_name = f'{self.session_folder}/{dataset_name}/state/capture.job'
            job_history = JobHistory(job_history_file_name)
            print(f'job_history_file_name = {job_history_file_name}')
            job_history.load()
            job_history.dump()
            if table_name not in job_history.tables:
                print(
                    f'Error: Table does not exist ({dataset_table_parameter})')
            else:
                # delete table entry
                del job_history.tables[table_name]
                job_history.save(is_maintenance=True)

            # TODO: Read a consistent udp_stage database key
            # TODO: App/utility validation should be a method that validates parameters, connections, and dataset/table existence
            # TODO: Printing and error reporting should be methods
            # TODO: Log start/stop, status (success, fail), duration
            # TODO: Database config should drive database.<engine> vs hard coding these statements
            # TODO: Test mode - verifies dataset.table exists; returns error code 0 if yes, 1 if no
            # TODO: --help mode with descriptive text in a Linux output format
            db_resource = self.config('database:amc_dsg_udp_stage_test')
            db = database.MSSQL(db_resource)
            self.target_db_conn = database.Database('mssql', db.conn)
            self.target_db_conn.use_database('udp_stage')

            # drop the target table
            if not self.target_db_conn.does_table_exist(
                    dataset_name, table_name):
                print(
                    f'Error: Table does not exist ({dataset_table_parameter})')
            else:
                print(f'Dropping {dataset_table_parameter} ...')
                self.target_db_conn.drop_table(dataset_name, table_name)
                if self.target_db_conn.does_table_exist(
                        dataset_name, table_name):
                    print(
                        f'Failure: Unable to drop table ({dataset_table_parameter})'
                    )
                else:
                    print(f'Successfully dropped {dataset_table_parameter}')

                    # optionally remove the table from stat_log history
                    if self.option('clear-history'):
                        print(
                            f'Clearing {dataset_table_parameter} history from job log ...'
                        )
                        # TODO: delete from stat_log where dataset ..., table ..., step ...

            # cleanup
            db.conn.close()
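does_table_exist() and drop_table() come from the project's database wrapper. As an illustration of the existence check against SQL Server, a hypothetical standalone version using the standard INFORMATION_SCHEMA view (it assumes a pyodbc-style cursor with qmark parameters):

def does_table_exist(cursor, schema_name, table_name):
    # hypothetical check: query the catalog for the schema-qualified table
    cursor.execute(
        'SELECT 1 FROM INFORMATION_SCHEMA.TABLES '
        'WHERE TABLE_SCHEMA = ? AND TABLE_NAME = ?',
        (schema_name, table_name))
    return cursor.fetchone() is not None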