Example #1
    def test_good_file(self, request_mock):
        with full_rig_with_s3() as (app, engine):
            upload_id = self.do_upload(app, request_mock)
            # okay, here's what we really want to test.
            # call the merge endpoint
            response = app.post(
                '/api/upload/merge_file?uploadId={}'.format(upload_id))
            response_data = json.loads(response.get_data().decode('utf-8'))
            assert response_data['status'] == 'success'
            # make sure that there is a new merged file on s3
            expected_s3_path = 's3://test-bucket/boone/hmis_service_stays/merged'
            with open_sesame(expected_s3_path, 'rb') as expected_s3_file:
                reader = csv.reader(expected_s3_file)
                assert len([row for row in reader]) == ROWS_IN_GOOD_HMIS_FILE

            # and make sure that the merge log has a record of this
            assert db_session.query(MergeLog).filter(
                MergeLog.upload_id == '123-456').one()

            # make sure that the master table has been bootstrapped
            master_table = generate_master_table_name('boone',
                                                      'hmis_service_stays')
            total_rows = db_session.query(
                'count(*) from {}'.format(master_table)).one()
            assert total_rows == (ROWS_IN_GOOD_HMIS_FILE - 1, )

            # make sure that we filled in some matched ids
            total_rows = db_session.query(
                'count(matched_id is not null) from {}'.format(
                    master_table)).one()
            assert total_rows == (ROWS_IN_GOOD_HMIS_FILE - 1, )
Example #2
    def test_write_matches_to_db(self):
        BOOTSTRAPPED_HMIS_FILE = 'sample_data/matched/bootstrapped_hmis_data_20180401.csv'
        MATCHES_HMIS_FILE = 'sample_data/matched/matchesonly_hmis_data.csv'

        with testing.postgresql.Postgresql() as postgresql:
            db_engine = create_engine(postgresql.url())
            create_and_populate_master_table(table_name='hmis_service_stays',
                                             db_engine=db_engine,
                                             file_path=BOOTSTRAPPED_HMIS_FILE)
            # generate expected matches by taking the first column of the matched id spreadsheet
            # open in text mode so csv.reader receives str rows under Python 3
            with open(MATCHES_HMIS_FILE, 'r') as f:
                reader = csv.reader(f, delimiter='|')
                next(reader)
                expected_matched_ids = set(
                    re.sub(r'\W+', '', row[0]) for row in reader)

            # write these matches to the DB
            with open(MATCHES_HMIS_FILE, 'r') as fh:
                write_matches_to_db(db_engine=db_engine,
                                    event_type='hmis_service_stays',
                                    jurisdiction='boone',
                                    matches_filehandle=fh)

            full_table_name = generate_master_table_name(
                'boone', 'hmis_service_stays')
            retrieved_matched_ids = set([
                row[0] for row in db_engine.execute(
                    'select distinct matched_id from {}'.format(
                        full_table_name))
            ])
            assert retrieved_matched_ids == expected_matched_ids
Example #3
    def test_update_overlapping(self):
        with testing.postgresql.Postgresql() as postgresql:
            engine = create_engine(postgresql.url())
            Base.metadata.create_all(engine)
            db_session = scoped_session(
                sessionmaker(autocommit=False, autoflush=False, bind=engine))
            self.populate_seed_data(db_session)
            # merge in a table with one new row and one row that has the same
            # primary key but a new name: we hopefully won't see much data like
            # this in production, but we do want to be sure that the new data
            # wins in these cases (for instance, data corrections)
            OLD = '456780'
            UPDATED = '456789'
            NEW = '456791'
            raw_table_name = '234-567'
            new_data = [
                [
                    u'123458', UPDATED, 'A345', 'Jack F. Ripper', '1896-04-10',
                    '345-45-6789'
                ],
                [
                    u'123459', NEW, 'A345', 'Jack R. Ripper', '1896-04-10',
                    '345-45-6780'
                ],
            ]
            create_and_populate_raw_table('234-567', new_data, db_session.bind)
            upsert_raw_table_to_master(raw_table_name, self.jurisdiction,
                                       self.event_type, '234-567', db_session)
            master_table_name = generate_master_table_name(
                self.jurisdiction, self.event_type)
            result = self.get_pks_and_names(master_table_name, db_session)
            assert len(result) == 4
            # the duplicated event id should only be present once
            assert result == [
                ('456780', 'A345', 'Jack L. Ripper'),
                ('456780', 'A346', 'Jack L. Ripper'),
                ('456789', 'A345', 'Jack F. Ripper'),
                ('456791', 'A345', 'Jack R. Ripper'),
            ]
            # let's check the timestamps
            timestamp_result = dict(
                (row[0], (row[1], row[2])) for row in db_session.execute('''
                select "internal_event_id", "inserted_ts", "updated_ts" from {}
                '''.format(master_table_name)))
            # created timestamp of an old row that was not updated should be the same as an old row that was updated
            assert timestamp_result[OLD][0] == timestamp_result[UPDATED][0]
            # updated timestamp of an old row that was not updated should be lower than an old row that was updated
            assert timestamp_result[OLD][1] < timestamp_result[UPDATED][1]
            # created and updated timestamps of totally new rows should be equivalent
            assert timestamp_result[NEW][0] == timestamp_result[NEW][1]

            merge_logs = db_session.query(MergeLog).all()
            assert len(merge_logs) == 2
            assert merge_logs[0].upload_id == '123-456'
            assert merge_logs[1].upload_id == '234-567'
            assert merge_logs[0].total_unique_rows == 3
            assert merge_logs[1].total_unique_rows == 2
            assert merge_logs[0].new_unique_rows == 3
            assert merge_logs[1].new_unique_rows == 1
Example #4
def sync_merged_file_to_storage(jurisdiction, event_type, db_engine):
    full_path = merged_file_path(jurisdiction, event_type)
    table_name = generate_master_table_name(jurisdiction, event_type)
    with open_sesame(full_path, 'wb') as outfile:
        cursor = db_engine.raw_connection().cursor()
        copy_stmt = 'copy "{}" to stdout with csv header delimiter as \'|\''.format(
            table_name)
        cursor.copy_expert(copy_stmt, outfile)
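
The same COPY-to-client pattern can be exercised directly against any psycopg2 connection. A minimal sketch, assuming a reachable database; the DSN, table name, and output path below are illustrative, not taken from the project:

import psycopg2

def dump_table_to_pipe_delimited_csv(dsn, table_name, out_path):
    # COPY ... TO STDOUT streams the rows through the client connection,
    # so no server-side file access is required.
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cursor, open(out_path, 'wb') as outfile:
            copy_stmt = 'copy "{}" to stdout with csv header delimiter as \'|\''.format(table_name)
            cursor.copy_expert(copy_stmt, outfile)
    finally:
        conn.close()

# e.g. dump_table_to_pipe_delimited_csv('dbname=test', 'some_master_table', '/tmp/merged.csv')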
Example #5
def source_data_to_filehandle(jurisdiction, event_type):
    master_table = generate_master_table_name(jurisdiction, event_type)
    out_filehandle = io.BytesIO()
    cursor = db.engine.raw_connection().cursor()
    copy_stmt = 'copy {} to stdout with csv header delimiter as \',\''.format(
        master_table)
    cursor.copy_expert(copy_stmt, out_filehandle)
    out_filehandle.seek(0)
    return out_filehandle
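
Because the returned handle is an in-memory CSV already rewound to position 0, it can be handed to anything that accepts a file-like object. A small usage sketch; the jurisdiction and event type values are assumed:

import pandas as pd

fh = source_data_to_filehandle('boone', 'hmis_service_stays')
df = pd.read_csv(fh)  # comma-delimited with a header row, as produced by the COPY statement
print(df.shape)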
Example #6
def create_merged_table(jurisdiction, event_type, db_session):
    master_table_name = generate_master_table_name(jurisdiction, event_type)
    goodtables_schema = load_schema_file(event_type)
    full_column_list = master_table_column_list(goodtables_schema)
    create = create_statement_from_column_list(full_column_list,
                                               master_table_name,
                                               goodtables_schema['primaryKey'])
    # create table if it does not exist
    logging.info('Assembled create-if-not-exists table statement: %s', create)
    db_session.execute(create)
Example #7
def create_and_populate_merged_table(table_name, data, db_engine):
    schema = load_schema_file('test')
    n_fields = len(schema['fields'])
    for row in data:
        assert len(row) == n_fields, "sample merged data must have same # of fields as test schema"
    placeholder_string = ', '.join(['%s'] * n_fields)
    master_table_name = generate_master_table_name('test', 'test')
    column_list = master_table_column_list(schema)
    create = create_statement_from_column_list(column_list)
    db_engine.execute(create)
    for row in data:
        db_engine.execute('insert into "{}" values ({}, now(), now())'.format(master_table_name, placeholder_string), *row)
Example #8
def upsert_raw_table_to_master(raw_table_name, jurisdiction, event_type,
                               upload_id, db_session):
    create_merged_table(jurisdiction, event_type, db_session)
    master_table_name = generate_master_table_name(jurisdiction, event_type)
    goodtables_schema = load_schema_file(event_type)
    base_column_list = column_list_from_goodtables_schema(goodtables_schema)
    # use new postgres 'on conflict' functionality to upsert
    update_statements = [
        ' "{column}" = EXCLUDED."{column}"'.format(column=column_def[0])
        for column_def in base_column_list
    ]
    start_ts = datetime.today()
    insert_sql = '''
        insert into {master}
        select raw.*, '{new_ts}' inserted_ts, '{new_ts}' updated_ts, row_number() over ()::text || '{event_type}' as matched_id
        from "{raw}" as raw
        on conflict ({primary_key})
        do update set {update_string}, updated_ts = '{new_ts}'
    '''.format(raw=raw_table_name,
               master=master_table_name,
               event_type=event_type,
               primary_key=', '.join([
                   "\"{}\"".format(col)
                   for col in goodtables_schema['primaryKey']
               ]),
               update_string=', '.join(update_statements),
               new_ts=start_ts.isoformat())
    logging.info('Executing insert: %s', insert_sql)
    db_session.execute(insert_sql)
    end_ts = datetime.today()
    merge_log = MergeLog(
        upload_id=upload_id,
        total_unique_rows=total_unique_rows(raw_table_name,
                                            goodtables_schema['primaryKey'],
                                            db_session),
        new_unique_rows=new_unique_rows(master_table_name, start_ts,
                                        db_session),
        merge_start_timestamp=start_ts,
        merge_complete_timestamp=end_ts,
    )
    db_session.add(merge_log)
    db_session.execute('drop table "{}"'.format(raw_table_name))
    db_session.commit()
    return merge_log.id
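
Stripped of the dynamic column handling, the generated statement is the standard PostgreSQL INSERT ... ON CONFLICT DO UPDATE upsert. A minimal hand-written sketch with hypothetical table and column names:

# Hypothetical tables: raw_upload(id, name) and
# master(id, name, inserted_ts, updated_ts, matched_id).
upsert_sql = '''
    insert into master
    select raw.*, now() as inserted_ts, now() as updated_ts,
           row_number() over ()::text || 'hmis_service_stays' as matched_id
    from raw_upload as raw
    on conflict (id)
    do update set name = EXCLUDED.name, updated_ts = now()
'''
db_session.execute(upsert_sql)
db_session.commit()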
Example #9
def create_and_populate_master_table(table_name, db_engine, file_path=None):
    full_table_name = generate_master_table_name('boone', table_name)
    create_table_query = f"""
        DROP TABLE IF EXISTS {full_table_name};
        CREATE TABLE {full_table_name} ({DATA_FIELDS[table_name]})"""
    db_engine.execute(create_table_query)
    if file_path:
        df = pd.read_csv(file_path)
        df['internal_event_id'] = df['internal_event_id'].apply(str)
        df['matched_id'] = df['matched_id'].apply(str)
        if table_name == "jail_bookings":
            df['booking_number'] = df['booking_number'].apply(lambda x: str(x) if x else None)
            df['jail_entry_date'] = pd.to_datetime(df['jail_entry_date'])
            df['jail_exit_date'] = pd.to_datetime(df['jail_exit_date'])
            df.to_sql(full_table_name, db_engine, if_exists='append', index=False)
            db_engine.execute("update {} set booking_number = null where booking_number = 'nan'".format(full_table_name))
        elif table_name == "hmis_service_stays":
            df['client_location_start_date'] = pd.to_datetime(df['client_location_start_date'])
            df['client_location_end_date'] = pd.to_datetime(df['client_location_end_date'])
            df.to_sql(full_table_name, db_engine, if_exists='append', index=False)
Example #10
 def test_update_nonoverlapping(self):
     with testing.postgresql.Postgresql() as postgresql:
         engine = create_engine(postgresql.url())
         Base.metadata.create_all(engine)
         db_session = scoped_session(
             sessionmaker(autocommit=False, autoflush=False, bind=engine))
         # do initial insert, representing the first time data is uploaded
         self.populate_seed_data(db_session)
         raw_table_name = '234-567'
         new_data = [
             [
                 u'123458', u'456790', 'A345', 'Jack F. Ripper',
                 '1896-04-10', '345-45-6789'
             ],
             [
                 u'123459', u'456791', 'A345', 'Jack R. Ripper',
                 '1896-04-10', '345-45-6780'
             ],
         ]
         create_and_populate_raw_table('234-567', new_data, db_session.bind)
         upsert_raw_table_to_master(raw_table_name, self.jurisdiction,
                                    self.event_type, '234-567', db_session)
         master_table_name = generate_master_table_name(
             self.jurisdiction, self.event_type)
         result = self.get_pks_and_names(master_table_name, db_session)
         assert len(result) == 5
         assert result == [
             ('456780', 'A345', 'Jack L. Ripper'),
             ('456780', 'A346', 'Jack L. Ripper'),
             ('456789', 'A345', 'Jack T. Ripper'),
             ('456790', 'A345', 'Jack F. Ripper'),
             ('456791', 'A345', 'Jack R. Ripper'),
         ]
         merge_logs = db_session.query(MergeLog).all()
         assert len(merge_logs) == 2
         assert merge_logs[0].upload_id == '123-456'
         assert merge_logs[1].upload_id == '234-567'
         assert merge_logs[0].total_unique_rows == 3
         assert merge_logs[1].total_unique_rows == 2
         assert merge_logs[0].new_unique_rows == 3
         assert merge_logs[1].new_unique_rows == 2
Example #11
def expunge(jurisdiction, event_type):
    """Expunges all data for a given jurisdiction and event type"""
    base_path = app_config['base_path'].format(jurisdiction=jurisdiction, event_type=event_type)
    match_cache_path = app_config['match_cache_path'].format(jurisdiction=jurisdiction)
    master_table = generate_master_table_name(jurisdiction, event_type)

    msg = 'Expunge all data for jurisdiction {} and event_type {}?'.format(jurisdiction, event_type)
    msg += '\n\nstorage paths (recursive!): {}, {}'.format(base_path, match_cache_path)
    msg += '\n\ndatabase tables: {}'.format(master_table)
    msg += '\n\nYou should not do this while the system is validating or matching.'
    msg += '\n\nRemove all this?'
    shall = input("%s (y/N) " % msg).lower() == 'y'
    click.echo(shall)
    if not shall:
        click.echo("Expunge aborted")
        return

    click.echo("Truncating master database table " + master_table)
    engine.execute('truncate {}'.format(master_table))
    click.echo("Removing base path " + base_path)
    remove_recursively(base_path)
    click.echo("Removing match cache path" + match_cache_path)
    remove_recursively(match_cache_path)
Example #12
 def test_new_table(self):
     with testing.postgresql.Postgresql() as postgresql:
         engine = create_engine(postgresql.url())
         Base.metadata.create_all(engine)
         db_session = scoped_session(
             sessionmaker(autocommit=False, autoflush=False, bind=engine))
         # do initial insert, representing the first time data is uploaded
         self.populate_seed_data(db_session)
         master_table_name = generate_master_table_name(
             self.jurisdiction, self.event_type)
         result = self.get_pks_and_names(master_table_name, db_session)
         assert len(result) == 3
         assert result == [
             ('456780', 'A345', 'Jack L. Ripper'),
             ('456780', 'A346', 'Jack L. Ripper'),
             ('456789', 'A345', 'Jack T. Ripper'),
         ]
         merge_logs = db_session.query(MergeLog).all()
         assert len(merge_logs) == 1
         assert merge_logs[0].upload_id == '123-456'
         assert merge_logs[0].total_unique_rows == 3
         assert merge_logs[0].new_unique_rows == 3
Example #13
def get_records_by_time(start_time, end_time, jurisdiction, limit, offset,
                        order_column, order, set_status):
    hmis_table = generate_master_table_name(jurisdiction, 'hmis_service_stays')
    bookings_table = generate_master_table_name(jurisdiction, 'jail_bookings')
    hmis_exists = table_exists(hmis_table, db.engine)
    bookings_exists = table_exists(bookings_table, db.engine)
    if not hmis_exists:
        raise ValueError(
            'HMIS table {} does not exist. Please try again later.'.format(
                hmis_table))
    if not bookings_exists:
        raise ValueError(
            'Bookings table {} does not exist. Please try again later.'.format(
                bookings_table))
    columns = [
        ("matched_id::text", 'matched_id'),
        ("coalesce(hmis_summary.first_name, jail_summary.first_name)",
         'first_name'),
        ("coalesce(hmis_summary.last_name, jail_summary.last_name)",
         'last_name'),
        ("hmis_summary.hmis_id", 'hmis_id'),
        ("hmis_summary.hmis_contact", 'hmis_contact'),
        ("hmis_summary.last_hmis_contact", 'last_hmis_contact'),
        ("hmis_summary.cumu_hmis_days", 'cumu_hmis_days'),
        ("jail_summary.jail_id", 'jail_id'),
        ("jail_summary.jail_contact", 'jail_contact'),
        ("jail_summary.last_jail_contact", 'last_jail_contact'),
        ("jail_summary.cumu_jail_days", 'cumu_jail_days'),
        ("jail_summary.percent_bookings_homeless_flag",
         "percent_bookings_homeless_flag"),
        ("coalesce(hmis_summary.hmis_contact, 0) + coalesce(jail_summary.jail_contact, 0)",
         'total_contact'),
    ]
    if not any(alias == order_column for expression, alias in columns):
        raise ValueError(
            'Given order column expression does not match any alias in query. Exiting to avoid SQL injection attacks'
        )

    base_query = """WITH booking_duration_lookup AS (
        select coalesce(nullif(booking_number, ''), internal_event_id) as booking_id,
            max(coalesce(location_date::text, jail_entry_date)) as most_recent_location_date,
            max(case when jail_exit_date is not null
            then date_part('day', jail_exit_date::timestamp - jail_entry_date::timestamp)::int \
            else date_part('day', updated_ts::timestamp - jail_entry_date::timestamp)::int
            end) as length_of_stay,
            bool_or(coalesce(homeless, 'N') = 'Y') as any_homeless
        FROM (
            SELECT
               *
            FROM {booking_table}
            WHERE
                not (jail_entry_date < %(start_date)s AND jail_exit_date < %(start_date)s) and
                not (jail_entry_date > %(end_date)s AND jail_exit_date > %(end_date)s)
        ) AS jail
        group by 1
    ), hmis_summary AS (
        SELECT
            matched_id::text,
            string_agg(distinct internal_person_id::text, ';') as hmis_id,
            sum(
                case when client_location_end_date is not null
                    then date_part('day', client_location_end_date::timestamp - client_location_start_date::timestamp) \
                    else date_part('day', updated_ts::timestamp - client_location_start_date::timestamp)
                end
            )::int as cumu_hmis_days,
            count(*) AS hmis_contact,
            to_char(max(client_location_start_date::timestamp), 'YYYY-MM-DD') as last_hmis_contact,
            max(first_name) as first_name,
            max(last_name) as last_name
        FROM (
            SELECT
               *
            FROM {hmis_table}
            WHERE
                not (client_location_start_date < %(start_date)s AND client_location_end_date < %(start_date)s) and
                not (client_location_start_date > %(end_date)s AND client_location_end_date > %(end_date)s)
        ) AS hmis
        GROUP BY matched_id
    ), jail_summary AS (
        SELECT
            matched_id::text,
            string_agg(distinct coalesce(nullif(internal_person_id, ''), inmate_number)::text, ';') as jail_id,
            sum(jail.length_of_stay) as cumu_jail_days,
            count(distinct(coalesce(nullif(booking_number, ''), internal_event_id))) AS jail_contact,
            to_char(max(jail_entry_date::timestamp), 'YYYY-MM-DD') as last_jail_contact,
            max(first_name) as first_name,
            max(last_name) as last_name,
            round(100 * count(case when any_homeless then 1 else null end) / count(*)::float)::text || '%%' as percent_bookings_homeless_flag
        FROM (
            SELECT
               *
            FROM {booking_table}
            join booking_duration_lookup bdl on (
                bdl.booking_id = coalesce(nullif(booking_number, ''), internal_event_id)
                and bdl.most_recent_location_date = coalesce(location_date::text, jail_entry_date)
            )
            WHERE
                not (jail_entry_date < %(start_date)s AND jail_exit_date < %(start_date)s) and
                not (jail_entry_date > %(end_date)s AND jail_exit_date > %(end_date)s)
        ) AS jail
        GROUP BY matched_id
    )
    SELECT
    {columns}
    FROM hmis_summary
    FULL OUTER JOIN jail_summary USING(matched_id)
    """.format(
        hmis_table=hmis_table,
        booking_table=bookings_table,
        columns=",\n".join("{} as {}".format(expression, alias)
                           for expression, alias in columns),
    )

    logging.info('Querying table records')
    if order not in {'asc', 'desc'}:
        raise ValueError(
            'Given order direction is not valid. Exiting to avoid SQL injection attacks'
        )
    if not isinstance(limit, int) and not limit.isdigit() and limit != 'ALL':
        raise ValueError(
            'Given limit is not valid. Exiting to avoid SQL injection attacks'
        )
    filter_by_status = {
        'Jail': 'jail_summary.matched_id is not null',
        'HMIS': 'hmis_summary.matched_id is not null',
        'Intersection': 'hmis_summary.matched_id = jail_summary.matched_id'
    }
    status_filter = filter_by_status.get(set_status, 'true')
    rows_to_show = [
        dict(row) for row in db.engine.execute(
            """
        {}
        where {}
        order by {} {} nulls last
        limit {} offset %(offset)s""".format(base_query, status_filter,
                                             order_column, order, limit),
            start_date=start_time,
            end_date=end_time,
            offset=offset,
        )
    ]

    query = """
    SELECT
    *,
    DATE_PART('day', {exit}::timestamp - {start}::timestamp) as days
    FROM {table_name}
    WHERE
        not ({start} < %(start_time)s AND {exit} < %(start_time)s) and
        not ({start} > %(end_time)s AND {exit} > %(end_time)s)
    """
    hmis_query = query.format(table_name=hmis_table,
                              start="client_location_start_date",
                              exit="client_location_end_date")
    bookings_query = query.format(table_name=bookings_table,
                                  start="jail_entry_date",
                                  exit="jail_exit_date")
    logging.info('Done querying table records')
    logging.info('Querying venn diagram stats')
    venn_diagram_stats = next(
        db.engine.execute('''select
        count(distinct(hmis.matched_id)) as hmis_size,
        count(distinct(bookings.matched_id)) as bookings_size,
        count(distinct(case when hmis.matched_id = bookings.matched_id then hmis.matched_id else null end)) as shared_size,
        count(distinct(matched_id))
        from ({}) hmis
        full outer join ({}) bookings using (matched_id)
    '''.format(hmis_query, bookings_query),
                          start_time=start_time,
                          end_time=end_time))
    counts_by_status = {
        'HMIS': venn_diagram_stats[0],
        'Jail': venn_diagram_stats[1],
        'Intersection': venn_diagram_stats[2]
    }

    logging.info('Done querying venn diagram stats')

    venn_diagram_data = [{
        "sets": ["Jail"],
        "size": venn_diagram_stats[1]
    }, {
        "sets": ["Homeless"],
        "size": venn_diagram_stats[0]
    }, {
        "sets": ["Jail", "Homeless"],
        "size": venn_diagram_stats[2]
    }]
    logging.info('Retrieving bar data from database')
    filtered_data = retrieve_bar_data(hmis_table, bookings_table, start_time,
                                      end_time)
    logging.info('Done retrieving bar data from database')
    filtered_data['tableData'] = rows_to_show
    return {
        "vennDiagramData": venn_diagram_data,
        "totalTableRows": counts_by_status.get(set_status,
                                               venn_diagram_stats[3]),
        "filteredData": filtered_data
    }
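
A hedged usage sketch, assuming the boone HMIS and jail master tables already exist and are populated; the argument values below are illustrative:

result = get_records_by_time(
    start_time='2017-01-01',
    end_time='2018-01-01',
    jurisdiction='boone',
    limit=10,
    offset=0,
    order_column='last_name',
    order='asc',
    set_status='Jail',
)
print(result['totalTableRows'])
print(result['vennDiagramData'])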
Example #14
def write_matches_to_db(db_engine, event_type, jurisdiction,
                        matches_filehandle):
    goodtables_schema = load_schema_file(event_type)
    table_name = generate_master_table_name(event_type=event_type,
                                            jurisdiction=jurisdiction)
    logging.info('Writing matches for %s / %s to table %s', event_type,
                 jurisdiction, table_name)
    reader = csv.reader(matches_filehandle, delimiter='|')
    ordered_column_names = next(reader)
    matches_filehandle.seek(0)

    # 1. create pseudo-temporary table for the raw matches file
    # use the CSV's column order but grab the definitions from the goodtables schema
    unordered_column_list = column_list_from_goodtables_schema(
        goodtables_schema)
    primary_key = goodtables_schema['primaryKey']

    all_columns = [
        ('matched_id', 'varchar')
    ] + [col for col in unordered_column_list if col[0] in primary_key]
    column_definitions = dict((col[0], col) for col in all_columns)
    ordered_column_list = [
        column_definitions[ordered_column_name]
        for ordered_column_name in ordered_column_names
    ]
    logging.info('Final column list for temporary matches-only table: %s',
                 ordered_column_list)
    create = create_statement_from_column_list(ordered_column_list, table_name,
                                               primary_key)
    temp_table_name = 'temp_matched_merge_tbl'
    create = create.replace(table_name, temp_table_name)
    logging.info(create)
    db_engine.execute(create)

    # 2. copy data from the filehandle into the temporary table
    conn = db_engine.raw_connection()
    cursor = conn.cursor()
    pk = ','.join(primary_key)
    copy_stmt = 'copy {} from stdin with csv header delimiter as \'|\' force not null {}'.format(
        temp_table_name, pk)
    try:
        logging.info(copy_stmt)
        cursor.copy_expert(copy_stmt, matches_filehandle)
        logging.info('Status message after COPY: %s', cursor.statusmessage)
        for notice in conn.notices:
            logging.info('Notice from database connection: %s', notice)
        conn.commit()
        cursor = conn.cursor()
        cursor.execute('select * from {} limit 5'.format(temp_table_name))
        logging.info('First five rows: %s', [row for row in cursor])
        big_query = """
update {matched_table} as m set matched_id = regexp_replace(tmp.matched_id::text, '[^\w]', '', 'g')
        from {temp_table_name} tmp where ({pk}) """.format(
            create=create,
            matched_table=table_name,
            temp_table_name=temp_table_name,
            pk=' and '.join([
                'tmp.{col} = m.{col}'.format(col=col) for col in primary_key
            ]))
        logging.info('Updating matches in %s with rows from %s', table_name,
                     temp_table_name)
        logging.info(big_query)
        cursor.execute(big_query)
        logging.info('Status message after UPDATE: %s', cursor.statusmessage)
        conn.commit()
    except Exception as e:
        logging.error(
            'Error encountered! Rolling back merge of matched ids. Original error: %s',
            str(e))
        conn.rollback()
    finally:
        db_engine.execute('drop table if exists {}'.format(temp_table_name))
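
The COPY FROM STDIN step can be exercised on its own with psycopg2. A minimal sketch with a made-up DSN, table, and inline sample data (the real code streams the matches file instead):

import io
import psycopg2

conn = psycopg2.connect('dbname=test')
cursor = conn.cursor()
cursor.execute('create table if not exists temp_matched_merge_tbl '
               '(matched_id varchar, internal_person_id varchar, internal_event_id varchar)')
sample = io.StringIO('matched_id|internal_person_id|internal_event_id\nabc123|p1|e1\n')
cursor.copy_expert(
    "copy temp_matched_merge_tbl from stdin with csv header delimiter as '|' "
    "force not null internal_person_id, internal_event_id",
    sample,
)
conn.commit()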