Example #1
def _clean_data_worker(rows, temp_table, providers_config):
    log.info('Starting data cleaning worker')
    global_field_to_func = providers_config['*']['fields']
    worker_conn = database_connect()
    log.info('Data cleaning worker connected to database')
    write_cur = worker_conn.cursor(cursor_factory=DictCursor)
    log.info('Cleaning {} rows'.format(len(rows)))
    tls_cache = {}
    start_time = time.time()
    for row in rows:
        # Map fields that need updating to their cleaning functions
        provider = row['provider']
        _id = row['id']
        if provider in providers_config:
            provider_field_to_func = providers_config[provider]['fields']
            # Merge provider-local and global function field mappings
            fields_to_update = \
                {**global_field_to_func, **provider_field_to_func}
        else:
            fields_to_update = global_field_to_func
        # Map fields to their cleaned data
        cleaned_data = {}
        for update_field in fields_to_update:
            dirty_value = row[update_field]
            if not dirty_value:
                continue
            cleaning_func = fields_to_update[update_field]
            if cleaning_func == CleanupFunctions.cleanup_url:
                clean = cleaning_func(url=dirty_value, tls_support=tls_cache)
            else:
                clean = cleaning_func(dirty_value)
            if clean:
                cleaned_data[update_field] = clean
        # Generate SQL update for all the fields we just cleaned
        update_field_expressions = []
        for field in cleaned_data:
            update_field_expressions.append(
                '{field} = {cleaned}'.format(
                    field=field,
                    cleaned=cleaned_data[field]
                )
            )
        if len(update_field_expressions) > 0:
            update_query = '''
                UPDATE {temp_table} SET {field_expressions} WHERE id = {_id}
            '''.format(
                temp_table=temp_table,
                field_expressions=', '.join(update_field_expressions),
                _id=_id
            )
            write_cur.execute(update_query)
    log.info('TLS cache: {}'.format(tls_cache))
    log.info('Worker committing changes...')
    worker_conn.commit()
    write_cur.close()
    worker_conn.close()
    end_time = time.time()
    total_time = end_time - start_time
    log.info('Worker finished batch in {}'.format(total_time))
    return True
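
The worker above splices each cleaned value directly into the SET clause, which only works if every cleanup function returns a value that is already quoted for SQL (or a falsy value when no update is needed). A minimal sketch of that assumed contract, with an illustrative cleanup_url body that is not the project's actual implementation:

import urllib.parse


class CleanupFunctions:
    @staticmethod
    def cleanup_url(url, tls_support):
        # Upgrade the URL to https:// when the host is known to support TLS.
        # `tls_support` is the shared per-domain cache passed in as tls_cache.
        parsed = urllib.parse.urlparse(url if '//' in url else '//' + url)
        domain = parsed.netloc
        if domain not in tls_support:
            tls_support[domain] = True  # assumption: the real TLS probe is omitted here
        scheme = 'https' if tls_support[domain] else 'http'
        upgraded = parsed._replace(scheme=scheme).geturl()
        if upgraded == url:
            return None  # falsy: the worker skips this field
        # Return the value pre-quoted so it can be embedded in the SET clause.
        return "'{}'".format(upgraded)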
Example #2
def promote_api_table(
    table: str,
    progress: multiprocessing.Value = None,
):
    """
    This runs after ``refresh_api_table``. The process involves the following steps.

    6. Recreate indices from the original table: ``_generate_indices``
    7. Recreate constraints from the original table: ``_generate_constraints``
    8. Promote the temp table and delete the original: ``get_go_live_query``

    :param table: the upstream table to copy
    :param progress: multiprocessing.Value float for sharing task progress
    """

    log.info(f"`{table}`: Starting table promotion | _Next: recreate-indices_")
    downstream_db = database_connect()

    with downstream_db, downstream_db.cursor() as downstream_cur:
        # Step 6: Recreate indices from the original table
        log.info("Recreating database indices...")
        create_indices, index_mapping = _generate_indices(downstream_db, table)
        _update_progress(progress, 50.0)
        if create_indices:
            for create_index in create_indices:
                log.info(f"Running: {create_index}")
                downstream_cur.execute(create_index)
        log.info("Done creating indices! Remapping constraints...")
        _update_progress(progress, 70.0)

        # Step 7: Recreate constraints from the original table
        remap_constraints = _generate_constraints(downstream_db, table)
        if remap_constraints:
            for remap_constraint in remap_constraints:
                log.info(f"Running: {remap_constraint.as_string(downstream_cur)}")
                downstream_cur.execute(remap_constraint)
        log.info("Done remapping constraints! Going live with new table...")
        _update_progress(progress, 99.0)
        slack.verbose(
            f"`{table}`: Indices & constraints applied, finished refreshing table | "
            f"_Next: Elasticsearch reindex_"
        )

        # Step 8: Promote the temporary table and delete the original
        go_live = get_go_live_query(table, index_mapping)
        log.info(f"Running go-live: \n{go_live.as_string(downstream_cur)}")
        downstream_cur.execute(go_live)
        slack.verbose(
            f"`{table}`: Finished table promotion | _Next: Elasticsearch promotion_"
        )

    downstream_db.close()
    log.info(f"Finished promoting table {table}")
    _update_progress(progress, 100.0)
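
Both promote_api_table and refresh_api_table report progress through _update_progress. A plausible minimal version of that helper, assuming progress is a multiprocessing.Value holding a double (illustrative, not necessarily the project's implementation):

def _update_progress(progress, percent):
    # Callers may pass progress=None, e.g. when progress sharing is not needed.
    if progress is not None:
        progress.value = percent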
Example #3
def _clean_data_worker(rows, temp_table, sources_config):
    log.info("Starting data cleaning worker")
    global_field_to_func = sources_config["*"]["fields"]
    worker_conn = database_connect()
    log.info("Data cleaning worker connected to database")
    write_cur = worker_conn.cursor(cursor_factory=DictCursor)
    log.info(f"Cleaning {len(rows)} rows")
    tls_cache = {}
    start_time = time.time()
    for row in rows:
        # Map fields that need updating to their cleaning functions
        source = row["source"]
        _id = row["id"]
        if source in sources_config:
            source_field_to_func = sources_config[source]["fields"]
            # Merge source-local and global function field mappings
            fields_to_update = {**global_field_to_func, **source_field_to_func}
        else:
            fields_to_update = global_field_to_func
        # Map fields to their cleaned data
        cleaned_data = {}
        for update_field in fields_to_update:
            dirty_value = row[update_field]
            if not dirty_value:
                continue
            cleaning_func = fields_to_update[update_field]
            if cleaning_func == CleanupFunctions.cleanup_url:
                clean = cleaning_func(url=dirty_value, tls_support=tls_cache)
            else:
                clean = cleaning_func(dirty_value)
            if clean:
                cleaned_data[update_field] = clean
        # Generate SQL update for all the fields we just cleaned
        update_field_expressions = []
        for field in cleaned_data:
            update_field_expressions.append(f"{field} = {cleaned_data[field]}")
        if len(update_field_expressions) > 0:
            update_query = f"""UPDATE {temp_table} SET
            {', '.join(update_field_expressions)} WHERE id = {_id}
            """
            write_cur.execute(update_query)
    log.info(f"TLS cache: {tls_cache}")
    log.info("Worker committing changes...")
    worker_conn.commit()
    write_cur.close()
    worker_conn.close()
    end_time = time.time()
    total_time = end_time - start_time
    log.info(f"Worker finished batch in {total_time}")
    return True
Example #4
def clean_image_data(table):
    """
    Data from upstream can be unsuitable for production for a number of reasons.
    Clean it up before we go live with the new data.

    :param table: The staging table for the new data
    :return: None
    """
    # Map each table to the fields that need to be cleaned up. Then, map each
    # field to its cleanup function.
    log.info('Cleaning up data...')
    start_time = time.time()
    table_config = _cleanup_config['tables'][table]

    # Pull data from selected sources only.
    sources = list(_cleanup_config['tables'][table]['sources'])

    # Determine which fields will need updating
    fields_to_clean = set()
    for p in sources:
        _fields = list(table_config['sources'][p]['fields'])
        for f in _fields:
            fields_to_clean.add(f)

    cleanup_selection = "SELECT id, source, {fields} from {table}".format(
        fields=', '.join(fields_to_clean),
        table='temp_import_{}'.format(table),
    )
    log.info('Running cleanup on selection "{}"'.format(cleanup_selection))
    conn = database_connect(autocommit=True)
    cursor_name = '{}-{}'.format(table, str(uuid.uuid4()))
    with conn.cursor(name=cursor_name,
                     cursor_factory=DictCursor,
                     withhold=True) as iter_cur:
        iter_cur.itersize = CLEANUP_BUFFER_SIZE
        iter_cur.execute(cleanup_selection)

        # Clean each field as specified in _cleanup_config.
        source_config = table_config['sources']

        log.info('Fetching first batch')
        batch = iter_cur.fetchmany(size=CLEANUP_BUFFER_SIZE)
        jobs = []
        num_workers = multiprocessing.cpu_count()
        num_cleaned = 0
        while batch:
            # Divide updates into jobs for parallel execution.
            batch_start_time = time.time()
            temp_table = 'temp_import_{}'.format(table)
            # Ceiling division so the final job also picks up remainder rows.
            job_size = (len(batch) + num_workers - 1) // num_workers
            last_end = 0
            log.info('Dividing work')
            for n in range(1, num_workers + 1):
                log.info('Scheduling job {}'.format(n))
                start = last_end
                end = min(job_size * n, len(batch))
                last_end = end
                # Arguments for parallel _clean_data_worker calls
                jobs.append((batch[start:end], temp_table, source_config))
            pool = multiprocessing.Pool(processes=num_workers)
            log.info('Starting {} cleaning jobs'.format(len(jobs)))
            conn.commit()
            pool.starmap(_clean_data_worker, jobs)
            pool.close()
            num_cleaned += len(batch)
            batch_end_time = time.time()
            rate = len(batch) / (batch_end_time - batch_start_time)
            log.info('Batch finished, records/s: cleanup_rate={}'.format(rate))
            log.info(
                'Fetching next batch. Num records cleaned so far: {}'.format(
                    num_cleaned))
            jobs = []
            batch = iter_cur.fetchmany(size=CLEANUP_BUFFER_SIZE)
    conn.commit()
    iter_cur.close()
    conn.close()
    end_time = time.time()
    cleanup_time = end_time - start_time
    log.info('Cleaned all records in {} seconds'.format(cleanup_time))
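
A quick sanity check of the job-splitting arithmetic used in the batch loop above (illustrative only, not part of the module): with 10 rows and 4 workers, ceiling division gives job_size = 3 and slices of 3, 3, 3, and 1 rows, so every row lands in exactly one job.

batch = list(range(10))
num_workers = 4
job_size = (len(batch) + num_workers - 1) // num_workers  # ceiling division
slices = [batch[n * job_size:(n + 1) * job_size] for n in range(num_workers)]
assert sum(len(s) for s in slices) == len(batch)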
Example #5
def refresh_api_table(
    table: str,
    progress: multiprocessing.Value = None,
    approach: ApproachType = "advanced",
):
    """
    Import updates from the upstream catalog database into the API. The
    process involves the following steps.

    1. Get the list of overlapping columns: ``_get_shared_cols``
    2. Create the FDW extension if it does not exist
    3. Create FDW for the data transfer: ``get_fdw_query``
    4. Import data into a temporary table: ``get_copy_data_query``
    5. Clean the data: ``clean_image_data``

    This is the main function of this module.

    :param table: The upstream table to copy.
    :param progress: multiprocessing.Value float for sharing task progress
    :param approach: whether to use advanced logic specific to media ingestion
    """

    # Step 1: Get the list of overlapping columns
    slack.info(f"`{table}`: Starting data refresh | _Next: copying data from upstream_")
    downstream_db = database_connect()
    upstream_db = psycopg2.connect(
        dbname=UPSTREAM_DB_NAME,
        user=UPSTREAM_DB_USER,
        port=UPSTREAM_DB_PORT,
        password=UPSTREAM_DB_PASSWORD,
        host=UPSTREAM_DB_HOST,
        connect_timeout=5,
    )
    shared_cols = _get_shared_cols(downstream_db, upstream_db, table)
    upstream_db.close()

    with downstream_db, downstream_db.cursor() as downstream_cur:
        # Step 2: Create the FDW extension if it does not exist
        log.info("(Re)initializing foreign data wrapper")
        try:
            create_ext = get_create_ext_query()
            downstream_cur.execute(create_ext)
        except psycopg2.errors.UniqueViolation:
            log.warning("Extension already exists, possible race condition.")

    with downstream_db, downstream_db.cursor() as downstream_cur:
        # Step 3: Create FDW for the data transfer
        init_fdw = get_fdw_query(
            RELATIVE_UPSTREAM_DB_HOST,
            RELATIVE_UPSTREAM_DB_PORT,
            UPSTREAM_DB_NAME,
            UPSTREAM_DB_USER,
            UPSTREAM_DB_PASSWORD,
            f"{table}_view",
        )
        downstream_cur.execute(init_fdw)

        # Step 4: Import data into a temporary table
        log.info("Copying upstream data...")
        environment = config("ENVIRONMENT", default="local").lower()
        limit_default = 100_000
        if environment in {"prod", "production"}:
            # If we're in production, turn off limits unless it's explicitly provided
            limit_default = 0
        limit = config("DATA_REFRESH_LIMIT", cast=int, default=limit_default)
        copy_data = get_copy_data_query(
            table, shared_cols, approach=approach, limit=limit
        )
        log.info(f"Running copy-data query: \n{copy_data.as_string(downstream_cur)}")
        downstream_cur.execute(copy_data)

    next_step = (
        "_Next: {starting data cleaning}_"
        if table == "image"
        else "Finished refreshing table"
    )
    slack.verbose(f"`{table}`: Data copy complete | {next_step}")

    if table == "image":
        # Step 5: Clean the data
        log.info("Cleaning data...")
        clean_image_data(table)
        log.info("Cleaning completed!")
        slack.verbose(
            f"`{table}`: Data cleaning complete | Finished refreshing table"
        )

    downstream_db.close()
    log.info(f"Finished refreshing table '{table}'.")
    _update_progress(progress, 100.0)
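
refresh_api_table only copies the columns that exist on both sides of the transfer, which is what _get_shared_cols computes. A minimal sketch of what that helper is assumed to do (illustrative; the project's real implementation may differ):

def _get_shared_cols(downstream_db, upstream_db, table):
    def column_names(conn):
        with conn.cursor() as cur:
            cur.execute(
                "SELECT column_name FROM information_schema.columns"
                " WHERE table_name = %s",
                (table,),
            )
            return {row[0] for row in cur.fetchall()}

    # Intersection of upstream and downstream columns, in a stable order.
    return sorted(column_names(downstream_db) & column_names(upstream_db))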
Example #6
def reload_upstream(table, progress=None, finish_time=None):
    """
    Import updates from the upstream CC Catalog database into the API.

    :param table: The upstream table to copy.
    :param progress: multiprocessing.Value float for sharing task progress
    :param finish_time: multiprocessing.Value int for sharing finish timestamp
    :return:
    """
    downstream_db = database_connect()
    upstream_db = psycopg2.connect(
        dbname='openledger',
        user='******',
        port=UPSTREAM_DB_PORT,
        password=UPSTREAM_DB_PASSWORD,
        host=UPSTREAM_DB_HOST,
        connect_timeout=5
    )
    query_cols = ','.join(_get_shared_cols(downstream_db, upstream_db, table))
    upstream_db.close()
    # Connect to upstream database and create references to foreign tables.
    log.info('(Re)initializing foreign data wrapper')
    init_fdw = '''
        CREATE EXTENSION IF NOT EXISTS postgres_fdw;
        DROP SERVER IF EXISTS upstream CASCADE;
        CREATE SERVER upstream FOREIGN DATA WRAPPER postgres_fdw
        OPTIONS (host '{host}', dbname 'openledger', port '{port}');

        CREATE USER MAPPING IF NOT EXISTS FOR deploy SERVER upstream
        OPTIONS (user 'deploy', password '{passwd}');
        DROP SCHEMA IF EXISTS upstream_schema CASCADE;
        CREATE SCHEMA upstream_schema AUTHORIZATION deploy;

        IMPORT FOREIGN SCHEMA public
        LIMIT TO ({table}) FROM SERVER upstream INTO upstream_schema;
    '''.format(host=UPSTREAM_DB_HOST, passwd=UPSTREAM_DB_PASSWORD, table=table,
               port=UPSTREAM_DB_PORT)
    # 1. Import data into a temporary table
    # 2. Recreate indices from the original table
    # 3. Recreate constraints from the original table.
    # 4. Delete orphaned foreign key references.
    # 5. Clean the data.
    # 6. Promote the temporary table and delete the original.
    copy_data = '''
        DROP TABLE IF EXISTS temp_import_{table};
        CREATE TABLE temp_import_{table} (LIKE {table} INCLUDING CONSTRAINTS);
        INSERT INTO temp_import_{table} ({cols})
        SELECT {cols} from upstream_schema.{table};
        ALTER TABLE temp_import_{table} ADD PRIMARY KEY (id);
        DROP SERVER upstream CASCADE;
    '''.format(table=table, cols=query_cols)
    create_indices = ';\n'.join(_generate_indices(downstream_db, table))
    remap_constraints = ';\n'.join(_generate_constraints(downstream_db, table))
    go_live = '''
        DROP TABLE {table};
        ALTER TABLE temp_import_{table} RENAME TO {table};
    '''.format(table=table)

    with downstream_db.cursor() as downstream_cur:
        log.info('Copying upstream data...')
        downstream_cur.execute(init_fdw)
        downstream_cur.execute(copy_data)
    downstream_db.commit()
    downstream_db.close()
    clean_data(table)
    log.info('Cleaning step finished.')
    downstream_db = database_connect()
    with downstream_db.cursor() as downstream_cur:
        log.info('Copying finished! Recreating database indices...')
        _update_progress(progress, 50.0)
        if create_indices != '':
            downstream_cur.execute(create_indices)
        _update_progress(progress, 70.0)
        log.info('Done creating indices! Remapping constraints...')
        if remap_constraints != '':
            downstream_cur.execute(remap_constraints)
        _update_progress(progress, 99.0)
        log.info('Done remapping constraints! Going live with new table...')
        downstream_cur.execute(go_live)
    downstream_db.commit()
    downstream_db.close()
    log.info('Finished refreshing table \'{}\'.'.format(table))
    _update_progress(progress, 100.0)
    if finish_time:
        finish_time.value = datetime.datetime.utcnow().timestamp()
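
The newer promote_api_table delegates the final swap to get_go_live_query. Judging from the inline go_live statement above, a sketch of what that query presumably looks like when composed with psycopg2.sql (the index_mapping argument is ignored here; the real function may also rename indices):

from psycopg2 import sql


def get_go_live_query(table, index_mapping=None):
    # Drop the live table and rename the freshly built temp table into place.
    return sql.SQL(
        "DROP TABLE {table};"
        " ALTER TABLE {temp_table} RENAME TO {table};"
    ).format(
        table=sql.Identifier(table),
        temp_table=sql.Identifier("temp_import_" + table),
    )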
Example #7
def clean_data(table):
    """
    Data from upstream can be unsuitable for production for a number of reasons.
    Clean it up before we go live with the new data.

    :param table: The staging table for the new data
    :return: None
    """
    # Map each table to the fields that need to be cleaned up. Then, map each
    # field to its cleanup function.
    log.info('Cleaning up data...')
    start_time = time.time()
    table_config = _cleanup_config['tables'][table]

    # Pull data from selected providers only.
    providers = list(_cleanup_config['tables'][table]['providers'])
    provider_equals = "provider = '{}'"
    all_providers_equal = [provider_equals.format(p) for p in providers]
    provider_condition = ' OR '.join(all_providers_equal)
    # Determine whether we should select every provider.
    if '*' in table_config['providers']:
        where_clause = ''
    else:
        where_clause = 'WHERE ' + provider_condition

    # Determine which fields will need updating
    fields_to_clean = set()
    for p in providers:
        _fields = list(table_config['providers'][p]['fields'])
        for f in _fields:
            fields_to_clean.add(f)

    cleanup_selection = "SELECT id, provider, {fields} from {table}" \
                        " {where_clause}".format(
                            fields=', '.join(fields_to_clean),
                            table='temp_import_{}'.format(table),
                            where_clause=where_clause
                        )
    log.info('Running cleanup on selection "{}"'.format(cleanup_selection))
    conn = database_connect()
    iter_cur = conn.cursor(cursor_factory=DictCursor)
    iter_cur.execute(cleanup_selection)

    # Clean each field as specified in _cleanup_config.
    provider_config = table_config['providers']

    batch = iter_cur.fetchmany(size=DB_BUFFER_SIZE)
    jobs = []
    num_workers = multiprocessing.cpu_count() * 2
    while batch:
        # Divide updates into jobs for parallel execution.
        temp_table = 'temp_import_{}'.format(table)
        # Ceiling division so the final job also picks up remainder rows.
        job_size = (len(batch) + num_workers - 1) // num_workers
        last_end = 0
        for n in range(1, num_workers + 1):
            start = last_end
            end = min(job_size * n, len(batch))
            last_end = end
            # Arguments for parallel _clean_data_worker calls
            jobs.append(
                (batch[start:end], temp_table, provider_config)
            )
        batch = iter_cur.fetchmany(size=DB_BUFFER_SIZE)
    pool = multiprocessing.Pool(processes=num_workers)
    log.info('Starting {} cleaning jobs'.format(len(jobs)))
    pool.starmap(_clean_data_worker, jobs)
    pool.close()
    iter_cur.close()
    end_time = time.time()
    cleanup_time = end_time - start_time
    log.info('Cleaned all records in {} seconds'.format(cleanup_time))
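
The provider filter above is assembled with plain string formatting; the newer examples (#2 and #5) compose queries with psycopg2.sql instead. A hedged sketch of the same filter built that way (illustrative only, not the project's code):

from psycopg2 import sql


def provider_where_clause(table_config, providers):
    # No filter when the config selects every provider.
    if '*' in table_config['providers']:
        return sql.SQL('')
    conditions = [
        sql.SQL('provider = {}').format(sql.Literal(p)) for p in providers
    ]
    return sql.SQL('WHERE ') + sql.SQL(' OR ').join(conditions)

The composed clause can then be merged into the SELECT with sql.SQL(...).format(...) and passed to execute(), as in the go-live sketch after Example #6.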