def _clean_data_worker(rows, temp_table, providers_config):
    log.info('Starting data cleaning worker')
    global_field_to_func = providers_config['*']['fields']
    worker_conn = database_connect()
    log.info('Data cleaning worker connected to database')
    write_cur = worker_conn.cursor(cursor_factory=DictCursor)
    log.info('Cleaning {} rows'.format(len(rows)))
    tls_cache = {}
    start_time = time.time()
    for row in rows:
        # Map fields that need updating to their cleaning functions
        provider = row['provider']
        _id = row['id']
        if provider in providers_config:
            provider_field_to_func = providers_config[provider]['fields']
            # Merge provider-local and global function field mappings
            fields_to_update = \
                {**global_field_to_func, **provider_field_to_func}
        else:
            fields_to_update = global_field_to_func
        # Map fields to their cleaned data
        cleaned_data = {}
        for update_field in fields_to_update:
            dirty_value = row[update_field]
            if not dirty_value:
                continue
            cleaning_func = fields_to_update[update_field]
            if cleaning_func == CleanupFunctions.cleanup_url:
                clean = cleaning_func(url=dirty_value, tls_support=tls_cache)
            else:
                clean = cleaning_func(dirty_value)
            if clean:
                cleaned_data[update_field] = clean
        # Generate SQL update for all the fields we just cleaned
        update_field_expressions = []
        for field in cleaned_data:
            update_field_expressions.append(
                '{field} = {cleaned}'.format(
                    field=field,
                    cleaned=cleaned_data[field]
                )
            )
        if len(update_field_expressions) > 0:
            update_query = '''
                UPDATE {temp_table}
                SET {field_expressions}
                WHERE id = {_id}
            '''.format(
                temp_table=temp_table,
                field_expressions=', '.join(update_field_expressions),
                _id=_id
            )
            write_cur.execute(update_query)
    log.info('TLS cache: {}'.format(tls_cache))
    log.info('Worker committing changes...')
    worker_conn.commit()
    write_cur.close()
    worker_conn.close()
    end_time = time.time()
    total_time = end_time - start_time
    log.info('Worker finished batch in {}'.format(total_time))
    return True

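# The worker above interpolates each cleaned value straight into the SET
# clause, so every cleanup function must return a string that is already
# SQL-quoted, or a falsy value to leave the field alone. The sketch below
# is illustrative only: it assumes cleanup_url upgrades URLs to HTTPS and
# uses the worker's tls_cache to probe each domain at most once; the real
# CleanupFunctions implementation may differ.
import urllib.parse

import requests
from psycopg2.extensions import adapt


class CleanupFunctions:
    @staticmethod
    def cleanup_url(url, tls_support):
        domain = urllib.parse.urlparse(url).netloc
        if domain not in tls_support:
            try:
                # Hypothetical probe; cache the result per domain.
                requests.head('https://{}'.format(domain), timeout=2)
                tls_support[domain] = True
            except requests.RequestException:
                tls_support[domain] = False
        fixed = url.replace('http://', 'https://', 1) \
            if tls_support[domain] else url
        # Pre-quote for direct interpolation into the UPDATE statement;
        # None means "no change needed".
        return str(adapt(fixed)) if fixed != url else None
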
def promote_api_table(
    table: str,
    progress: multiprocessing.Value = None,
):
    """
    This runs after ``refresh_api_table``. The process involves the
    following steps.

    6. Recreate indices from the original table: ``_generate_indices``
    7. Recreate constraints from the original table: ``_generate_constraints``
    8. Promote the temp table and delete the original: ``get_go_live_query``

    :param table: the upstream table to copy
    :param progress: multiprocessing.Value float for sharing task progress
    """
    log.info(f"`{table}`: Starting table promotion | _Next: recreate-indices_")
    downstream_db = database_connect()

    with downstream_db, downstream_db.cursor() as downstream_cur:
        # Step 6: Recreate indices from the original table
        log.info("Recreating database indices...")
        create_indices, index_mapping = _generate_indices(downstream_db, table)
        _update_progress(progress, 50.0)
        if create_indices:
            for create_index in create_indices:
                log.info(f"Running: {create_index}")
                downstream_cur.execute(create_index)
        log.info("Done creating indices! Remapping constraints...")
        _update_progress(progress, 70.0)

        # Step 7: Recreate constraints from the original table
        remap_constraints = _generate_constraints(downstream_db, table)
        if remap_constraints:
            for remap_constraint in remap_constraints:
                log.info(f"Running: {remap_constraint.as_string(downstream_cur)}")
                downstream_cur.execute(remap_constraint)
        log.info("Done remapping constraints! Going live with new table...")
        _update_progress(progress, 99.0)
        slack.verbose(
            f"`{table}`: Indices & constraints applied, finished refreshing table | "
            f"_Next: Elasticsearch reindex_"
        )

        # Step 8: Promote the temporary table and delete the original
        go_live = get_go_live_query(table, index_mapping)
        log.info(f"Running go-live: \n{go_live.as_string(downstream_cur)}")
        downstream_cur.execute(go_live)

    slack.verbose(
        f"`{table}`: Finished table promotion | "
        f"_Next: Elasticsearch promotion_"
    )
    downstream_db.close()
    log.info(f"Finished promoting table {table}")
    _update_progress(progress, 100.0)

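# _update_progress is called throughout this module but its body is small
# enough to infer from usage: it writes to the shared multiprocessing.Value
# if one was supplied. A minimal sketch, assuming only None-tolerance is
# required:
def _update_progress(progress, new_value: float):
    """Record task progress on the shared value, if the caller passed one."""
    if progress is not None:
        progress.value = new_value
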
def _clean_data_worker(rows, temp_table, sources_config):
    log.info("Starting data cleaning worker")
    global_field_to_func = sources_config["*"]["fields"]
    worker_conn = database_connect()
    log.info("Data cleaning worker connected to database")
    write_cur = worker_conn.cursor(cursor_factory=DictCursor)
    log.info(f"Cleaning {len(rows)} rows")
    tls_cache = {}
    start_time = time.time()
    for row in rows:
        # Map fields that need updating to their cleaning functions
        source = row["source"]
        _id = row["id"]
        if source in sources_config:
            source_field_to_func = sources_config[source]["fields"]
            # Merge source-local and global function field mappings
            fields_to_update = {**global_field_to_func, **source_field_to_func}
        else:
            fields_to_update = global_field_to_func
        # Map fields to their cleaned data
        cleaned_data = {}
        for update_field in fields_to_update:
            dirty_value = row[update_field]
            if not dirty_value:
                continue
            cleaning_func = fields_to_update[update_field]
            if cleaning_func == CleanupFunctions.cleanup_url:
                clean = cleaning_func(url=dirty_value, tls_support=tls_cache)
            else:
                clean = cleaning_func(dirty_value)
            if clean:
                cleaned_data[update_field] = clean
        # Generate SQL update for all the fields we just cleaned
        update_field_expressions = []
        for field in cleaned_data:
            update_field_expressions.append(f"{field} = {cleaned_data[field]}")
        if len(update_field_expressions) > 0:
            update_query = f"""UPDATE {temp_table} SET
            {', '.join(update_field_expressions)} WHERE id = {_id}
            """
            write_cur.execute(update_query)
    log.info(f"TLS cache: {tls_cache}")
    log.info("Worker committing changes...")
    worker_conn.commit()
    write_cur.close()
    worker_conn.close()
    end_time = time.time()
    total_time = end_time - start_time
    log.info(f"Worker finished batch in {total_time}")
    return True

def clean_image_data(table):
    """
    Data from upstream can be unsuitable for production for a number of
    reasons. Clean it up before we go live with the new data.

    :param table: The staging table for the new data
    :return: None
    """
    # Map each table to the fields that need to be cleaned up. Then, map each
    # field to its cleanup function.
    log.info('Cleaning up data...')
    start_time = time.time()
    table_config = _cleanup_config['tables'][table]

    # Pull data from selected sources only.
    sources = list(table_config['sources'])

    # Determine which fields will need updating
    fields_to_clean = set()
    for p in sources:
        for f in table_config['sources'][p]['fields']:
            fields_to_clean.add(f)

    cleanup_selection = "SELECT id, source, {fields} from {table}".format(
        fields=', '.join(fields_to_clean),
        table='temp_import_{}'.format(table),
    )
    log.info('Running cleanup on selection "{}"'.format(cleanup_selection))
    conn = database_connect(autocommit=True)
    cursor_name = '{}-{}'.format(table, str(uuid.uuid4()))
    with conn.cursor(
        name=cursor_name, cursor_factory=DictCursor, withhold=True
    ) as iter_cur:
        iter_cur.itersize = CLEANUP_BUFFER_SIZE
        iter_cur.execute(cleanup_selection)

        # Clean each field as specified in _cleanup_config.
        source_config = table_config['sources']

        log.info('Fetching first batch')
        batch = iter_cur.fetchmany(size=CLEANUP_BUFFER_SIZE)
        jobs = []
        num_workers = multiprocessing.cpu_count()
        num_cleaned = 0
        while batch:
            # Divide updates into jobs for parallel execution.
            batch_start_time = time.time()
            temp_table = 'temp_import_{}'.format(table)
            job_size = int(len(batch) / num_workers)
            log.info('Dividing work')
            for n in range(1, num_workers + 1):
                log.info('Scheduling job {}'.format(n))
                # Slices are contiguous, and the last job picks up any
                # remainder rows so none are silently skipped.
                start = (n - 1) * job_size
                end = len(batch) if n == num_workers else n * job_size
                # Arguments for parallel _clean_data_worker calls
                jobs.append((batch[start:end], temp_table, source_config))
            pool = multiprocessing.Pool(processes=num_workers)
            log.info('Starting {} cleaning jobs'.format(len(jobs)))
            conn.commit()
            pool.starmap(_clean_data_worker, jobs)
            pool.close()
            num_cleaned += len(batch)
            batch_end_time = time.time()
            rate = len(batch) / (batch_end_time - batch_start_time)
            log.info('Batch finished, records/s: cleanup_rate={}'.format(rate))
            log.info(
                'Fetching next batch. Num records cleaned so far: {}'.format(
                    num_cleaned))
            jobs = []
            batch = iter_cur.fetchmany(size=CLEANUP_BUFFER_SIZE)
    conn.commit()
    conn.close()
    end_time = time.time()
    cleanup_time = end_time - start_time
    log.info('Cleaned all records in {} seconds'.format(cleanup_time))

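# For reference, a hypothetical _cleanup_config illustrating the shape the
# code above reads: each source (plus the '*' fallback applied to every
# source) maps dirty fields to their cleanup functions. The actual sources,
# fields, and functions will differ.
_cleanup_config = {
    'tables': {
        'image': {
            'sources': {
                # Applied to every source.
                '*': {
                    'fields': {
                        'url': CleanupFunctions.cleanup_url,
                        'creator_url': CleanupFunctions.cleanup_url,
                    }
                },
                # Source-specific entries are merged on top of '*' by
                # _clean_data_worker.
                'flickr': {
                    'fields': {
                        'tags': CleanupFunctions.cleanup_tags,  # hypothetical
                    }
                },
            }
        }
    }
}
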
def refresh_api_table(
    table: str,
    progress: multiprocessing.Value = None,
    approach: ApproachType = "advanced",
):
    """
    Import updates from the upstream catalog database into the API. The
    process involves the following steps.

    1. Get the list of overlapping columns: ``_get_shared_cols``
    2. Create the FDW extension if it does not exist
    3. Create FDW for the data transfer: ``get_fdw_query``
    4. Import data into a temporary table: ``get_copy_data_query``
    5. Clean the data: ``clean_image_data``

    This is the main function of this module.

    :param table: The upstream table to copy.
    :param progress: multiprocessing.Value float for sharing task progress
    :param approach: whether to use advanced logic specific to media ingestion
    """
    # Step 1: Get the list of overlapping columns
    slack.info(f"`{table}`: Starting data refresh | _Next: copying data from upstream_")
    downstream_db = database_connect()
    upstream_db = psycopg2.connect(
        dbname=UPSTREAM_DB_NAME,
        user=UPSTREAM_DB_USER,
        port=UPSTREAM_DB_PORT,
        password=UPSTREAM_DB_PASSWORD,
        host=UPSTREAM_DB_HOST,
        connect_timeout=5,
    )
    shared_cols = _get_shared_cols(downstream_db, upstream_db, table)
    upstream_db.close()

    with downstream_db, downstream_db.cursor() as downstream_cur:
        # Step 2: Create the FDW extension if it does not exist
        log.info("(Re)initializing foreign data wrapper")
        try:
            create_ext = get_create_ext_query()
            downstream_cur.execute(create_ext)
        except psycopg2.errors.UniqueViolation:
            log.warning("Extension already exists, possible race condition.")

    with downstream_db, downstream_db.cursor() as downstream_cur:
        # Step 3: Create FDW for the data transfer
        init_fdw = get_fdw_query(
            RELATIVE_UPSTREAM_DB_HOST,
            RELATIVE_UPSTREAM_DB_PORT,
            UPSTREAM_DB_NAME,
            UPSTREAM_DB_USER,
            UPSTREAM_DB_PASSWORD,
            f"{table}_view",
        )
        downstream_cur.execute(init_fdw)

        # Step 4: Import data into a temporary table
        log.info("Copying upstream data...")
        environment = config("ENVIRONMENT", default="local").lower()
        limit_default = 100_000
        if environment in {"prod", "production"}:
            # In production, turn off the limit unless explicitly provided
            limit_default = 0
        limit = config("DATA_REFRESH_LIMIT", cast=int, default=limit_default)
        copy_data = get_copy_data_query(
            table, shared_cols, approach=approach, limit=limit
        )
        log.info(f"Running copy-data query: \n{copy_data.as_string(downstream_cur)}")
        downstream_cur.execute(copy_data)

    next_step = (
        "_Next: starting data cleaning_"
        if table == "image"
        else "Finished refreshing table"
    )
    slack.verbose(f"`{table}`: Data copy complete | {next_step}")

    if table == "image":
        # Step 5: Clean the data
        log.info("Cleaning data...")
        clean_image_data(table)
        log.info("Cleaning completed!")
        slack.verbose(
            f"`{table}`: Data cleaning complete | "
            f"Finished refreshing table"
        )

    downstream_db.close()
    log.info(f"Finished refreshing table '{table}'.")
    _update_progress(progress, 100.0)

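# A sketch of how a caller might drive refresh_api_table from another
# process while sharing the progress value; the orchestration shown here
# is hypothetical, not part of this module's API.
if __name__ == "__main__":
    import multiprocessing

    progress = multiprocessing.Value("d", 0.0)
    refresh_proc = multiprocessing.Process(
        target=refresh_api_table, args=("image", progress)
    )
    refresh_proc.start()
    refresh_proc.join()
    assert progress.value == 100.0
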
def reload_upstream(table, progress=None, finish_time=None):
    """
    Import updates from the upstream CC Catalog database into the API.

    :param table: The upstream table to copy.
    :param progress: multiprocessing.Value float for sharing task progress
    :param finish_time: multiprocessing.Value int for sharing finish timestamp
    :return: None
    """
    downstream_db = database_connect()
    upstream_db = psycopg2.connect(
        dbname='openledger',
        user='******',
        port=UPSTREAM_DB_PORT,
        password=UPSTREAM_DB_PASSWORD,
        host=UPSTREAM_DB_HOST,
        connect_timeout=5
    )
    query_cols = ','.join(_get_shared_cols(downstream_db, upstream_db, table))
    upstream_db.close()

    # Connect to upstream database and create references to foreign tables.
    log.info('(Re)initializing foreign data wrapper')
    init_fdw = '''
        CREATE EXTENSION IF NOT EXISTS postgres_fdw;
        DROP SERVER IF EXISTS upstream CASCADE;
        CREATE SERVER upstream FOREIGN DATA WRAPPER postgres_fdw
            OPTIONS (host '{host}', dbname 'openledger', port '{port}');
        CREATE USER MAPPING IF NOT EXISTS FOR deploy SERVER upstream
            OPTIONS (user 'deploy', password '{passwd}');
        DROP SCHEMA IF EXISTS upstream_schema CASCADE;
        CREATE SCHEMA upstream_schema AUTHORIZATION deploy;
        IMPORT FOREIGN SCHEMA public LIMIT TO ({table})
            FROM SERVER upstream INTO upstream_schema;
    '''.format(host=UPSTREAM_DB_HOST, passwd=UPSTREAM_DB_PASSWORD,
               table=table, port=UPSTREAM_DB_PORT)

    # 1. Import data into a temporary table
    # 2. Recreate indices from the original table
    # 3. Recreate constraints from the original table
    # 4. Delete orphaned foreign key references
    # 5. Clean the data
    # 6. Promote the temporary table and delete the original
    copy_data = '''
        DROP TABLE IF EXISTS temp_import_{table};
        CREATE TABLE temp_import_{table}
            (LIKE {table} INCLUDING CONSTRAINTS);
        INSERT INTO temp_import_{table} ({cols})
            SELECT {cols} FROM upstream_schema.{table};
        ALTER TABLE temp_import_{table} ADD PRIMARY KEY (id);
        DROP SERVER upstream CASCADE;
    '''.format(table=table, cols=query_cols)
    create_indices = ';\n'.join(_generate_indices(downstream_db, table))
    remap_constraints = ';\n'.join(_generate_constraints(downstream_db, table))
    go_live = '''
        DROP TABLE {table};
        ALTER TABLE temp_import_{table} RENAME TO {table};
    '''.format(table=table)

    with downstream_db.cursor() as downstream_cur:
        log.info('Copying upstream data...')
        downstream_cur.execute(init_fdw)
        downstream_cur.execute(copy_data)
    downstream_db.commit()
    downstream_db.close()

    clean_data(table)
    log.info('Cleaning step finished.')

    downstream_db = database_connect()
    with downstream_db.cursor() as downstream_cur:
        log.info('Copying finished! Recreating database indices...')
        _update_progress(progress, 50.0)
        if create_indices != '':
            downstream_cur.execute(create_indices)
        _update_progress(progress, 70.0)
        log.info('Done creating indices! Remapping constraints...')
        if remap_constraints != '':
            downstream_cur.execute(remap_constraints)
        _update_progress(progress, 99.0)
        log.info('Done remapping constraints! Going live with new table...')
        downstream_cur.execute(go_live)
    downstream_db.commit()
    downstream_db.close()
    log.info('Finished refreshing table \'{}\'.'.format(table))
    _update_progress(progress, 100.0)
    if finish_time:
        finish_time.value = datetime.datetime.utcnow().timestamp()

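# _get_shared_cols is used above to restrict the copy to columns present in
# BOTH databases, so schema drift upstream cannot break the INSERT. A
# minimal sketch, assuming information_schema is queryable on each side;
# the real implementation may differ in filtering and ordering.
def _get_shared_cols(downstream_db, upstream_db, table):
    query = (
        'SELECT column_name FROM information_schema.columns '
        'WHERE table_name = %s'
    )
    with downstream_db.cursor() as cur:
        cur.execute(query, (table,))
        downstream_cols = {row[0] for row in cur.fetchall()}
    with upstream_db.cursor() as cur:
        cur.execute(query, (table,))
        upstream_cols = {row[0] for row in cur.fetchall()}
    return downstream_cols & upstream_cols
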
def clean_data(table):
    """
    Data from upstream can be unsuitable for production for a number of
    reasons. Clean it up before we go live with the new data.

    :param table: The staging table for the new data
    :return: None
    """
    # Map each table to the fields that need to be cleaned up. Then, map each
    # field to its cleanup function.
    log.info('Cleaning up data...')
    start_time = time.time()
    table_config = _cleanup_config['tables'][table]

    # Pull data from selected providers only.
    providers = list(table_config['providers'])
    provider_equals = "provider = '{}'"
    all_providers_equal = [provider_equals.format(p) for p in providers]
    provider_condition = ' OR '.join(all_providers_equal)
    # Determine whether we should select every provider.
    if '*' in table_config['providers']:
        where_clause = ''
    else:
        where_clause = 'WHERE ' + provider_condition

    # Determine which fields will need updating
    fields_to_clean = set()
    for p in providers:
        for f in table_config['providers'][p]['fields']:
            fields_to_clean.add(f)

    cleanup_selection = "SELECT id, provider, {fields} from {table}" \
                        " {where_clause}".format(
                            fields=', '.join(fields_to_clean),
                            table='temp_import_{}'.format(table),
                            where_clause=where_clause
                        )
    log.info('Running cleanup on selection "{}"'.format(cleanup_selection))
    conn = database_connect()
    iter_cur = conn.cursor(cursor_factory=DictCursor)
    iter_cur.execute(cleanup_selection)

    # Clean each field as specified in _cleanup_config.
    provider_config = table_config['providers']
    batch = iter_cur.fetchmany(size=DB_BUFFER_SIZE)
    jobs = []
    num_workers = multiprocessing.cpu_count() * 2
    while batch:
        # Divide updates into jobs for parallel execution.
        temp_table = 'temp_import_{}'.format(table)
        job_size = int(len(batch) / num_workers)
        for n in range(1, num_workers + 1):
            # Slices are contiguous, and the last job picks up any
            # remainder rows so none are silently skipped.
            start = (n - 1) * job_size
            end = len(batch) if n == num_workers else n * job_size
            # Arguments for parallel _clean_data_worker calls
            jobs.append(
                (batch[start:end], temp_table, provider_config)
            )
        batch = iter_cur.fetchmany(size=DB_BUFFER_SIZE)
    pool = multiprocessing.Pool(processes=num_workers)
    log.info('Starting {} cleaning jobs'.format(len(jobs)))
    pool.starmap(_clean_data_worker, jobs)
    pool.close()
    iter_cur.close()
    end_time = time.time()
    cleanup_time = end_time - start_time
    log.info('Cleaned all records in {} seconds'.format(cleanup_time))

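# To make the generated selection concrete: with a hypothetical provider
# list ['flickr', 'behance'] and fields {'url', 'creator_url'} on the
# 'image' table (no '*' wildcard), clean_data builds a query like:
#
#   SELECT id, provider, url, creator_url from temp_import_image
#    WHERE provider = 'flickr' OR provider = 'behance'
#
# Field order varies from run to run because fields_to_clean is a set.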