def test_write_matches_to_db(self):
    BOOTSTRAPPED_HMIS_FILE = 'sample_data/matched/bootstrapped_hmis_data_20180401.csv'
    MATCHES_HMIS_FILE = 'sample_data/matched/matchesonly_hmis_data.csv'
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        create_and_populate_master_table(
            table_name='hmis_service_stays',
            db_engine=db_engine,
            file_path=BOOTSTRAPPED_HMIS_FILE)
        # generate expected matches by taking the first column of the matched id spreadsheet
        # (csv.reader needs a text-mode filehandle under Python 3)
        with open(MATCHES_HMIS_FILE, 'r') as f:
            reader = csv.reader(f, delimiter='|')
            next(reader)
            expected_matched_ids = set(
                re.sub(r'\W+', '', row[0]) for row in reader)
        # write these matches to the DB
        with open(MATCHES_HMIS_FILE, 'r') as fh:
            write_matches_to_db(db_engine=db_engine,
                                event_type='hmis_service_stays',
                                jurisdiction='boone',
                                matches_filehandle=fh)
        full_table_name = generate_master_table_name('boone', 'hmis_service_stays')
        retrieved_matched_ids = set([
            row[0] for row in db_engine.execute(
                'select distinct matched_id from {}'.format(full_table_name))
        ])
        assert retrieved_matched_ids == expected_matched_ids
def create_and_populate_master_table(table_name, db_engine, file_path=None):
    full_table_name = generate_master_table_name('boone', table_name)
    create_table_query = f"""
        DROP TABLE IF EXISTS {full_table_name};
        CREATE TABLE {full_table_name} ({DATA_FIELDS[table_name]})"""
    db_engine.execute(create_table_query)
    if file_path:
        df = pd.read_csv(file_path)
        df['internal_event_id'] = df['internal_event_id'].apply(str)
        df['matched_id'] = df['matched_id'].apply(str)
        if table_name == "jail_bookings":
            df['booking_number'] = df['booking_number'].apply(
                lambda x: str(x) if x else None)
            df['jail_entry_date'] = pd.to_datetime(df['jail_entry_date'])
            df['jail_exit_date'] = pd.to_datetime(df['jail_exit_date'])
            df.to_sql(full_table_name, db_engine, if_exists='append', index=False)
            db_engine.execute(
                "update {} set booking_number = null where booking_number = 'nan'"
                .format(full_table_name))
        elif table_name == "hmis_service_stays":
            df['client_location_start_date'] = pd.to_datetime(
                df['client_location_start_date'])
            df['client_location_end_date'] = pd.to_datetime(
                df['client_location_end_date'])
            df.to_sql(full_table_name, db_engine, if_exists='append', index=False)
def test_good_file(self, request_mock):
    with full_rig_with_s3() as (app, engine):
        upload_id = self.do_upload(app, request_mock)
        # okay, here's what we really want to test.
        # call the merge endpoint
        response = app.post(
            '/api/upload/merge_file?uploadId={}'.format(upload_id))
        response_data = json.loads(response.get_data().decode('utf-8'))
        assert response_data['status'] == 'success'
        # make sure that there is a new merged file on s3
        expected_s3_path = 's3://test-bucket/boone/hmis_service_stays/merged'
        with open_sesame(expected_s3_path, 'rb') as expected_s3_file:
            reader = csv.reader(expected_s3_file)
            assert len([row for row in reader]) == ROWS_IN_GOOD_HMIS_FILE
        # and make sure that the merge log has a record of this
        assert db_session.query(MergeLog).filter(
            MergeLog.upload_id == '123-456').one()
        # make sure that the master table has been bootstrapped
        master_table = generate_master_table_name('boone', 'hmis_service_stays')
        total_rows = db_session.query(
            'count(*) from {}'.format(master_table)).one()
        assert total_rows == (ROWS_IN_GOOD_HMIS_FILE - 1, )
        # make sure that we filled in some matched ids
        total_rows = db_session.query(
            'count(matched_id is not null) from {}'.format(master_table)).one()
        assert total_rows == (ROWS_IN_GOOD_HMIS_FILE - 1, )
def expunge(jurisdiction, event_type):
    """Expunges all data for a given jurisdiction and event type"""
    s3 = s3fs.S3FileSystem()
    base_path = app_config['base_path'].format(jurisdiction=jurisdiction,
                                               event_type=event_type)
    match_cache_path = app_config['match_cache_path'].format(
        jurisdiction=jurisdiction)
    master_table = generate_master_table_name(jurisdiction, event_type)
    msg = 'Expunge all data for jurisdiction {} and event_type {}?'.format(
        jurisdiction, event_type)
    msg += '\n\ns3 paths (recursive!): {}, {}'.format(base_path, match_cache_path)
    msg += '\n\ndatabase tables: {}'.format(master_table)
    msg += '\n\nYou should not do this while the system is validating or matching.'
    msg += '\n\nRemove all this?'
    shall = input("%s (y/N) " % msg).lower() == 'y'
    click.echo(shall)
    if not shall:
        click.echo("Expunge aborted")
        return
    click.echo("Removing base path " + base_path)
    s3.rm(base_path, recursive=True)
    click.echo("Removing match cache path " + match_cache_path)
    s3.rm(match_cache_path, recursive=True)
    click.echo("Truncating master database table " + master_table)
    engine.execute('truncate {}'.format(master_table))
def test_update_overlapping(self):
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        Base.metadata.create_all(engine)
        db_session = scoped_session(
            sessionmaker(autocommit=False, autoflush=False, bind=engine))
        self.populate_seed_data(db_session)
        # merge in a table with one new row and one row that has the same
        # primary key but a new name: hopefully we won't be seeing much like
        # this in production but we do want to be sure that we use the new
        # data in these cases (for instance, data corrections)
        OLD = '456780'
        UPDATED = '456789'
        NEW = '456791'
        raw_table_name = '234-567'
        new_data = [
            [
                u'123458', UPDATED, 'A345', 'Jack F. Ripper', '1896-04-10',
                '345-45-6789'
            ],
            [
                u'123459', NEW, 'A345', 'Jack R. Ripper', '1896-04-10',
                '345-45-6780'
            ],
        ]
        create_and_populate_raw_table('234-567', new_data, db_session.bind)
        upsert_raw_table_to_master(raw_table_name, self.jurisdiction,
                                   self.event_type, '234-567', db_session)
        master_table_name = generate_master_table_name(
            self.jurisdiction, self.event_type)
        result = self.get_pks_and_names(master_table_name, db_session)
        assert len(result) == 4
        # the duplicated event id should only be present once
        assert result == [
            ('456780', 'A345', 'Jack L. Ripper'),
            ('456780', 'A346', 'Jack L. Ripper'),
            ('456789', 'A345', 'Jack F. Ripper'),
            ('456791', 'A345', 'Jack R. Ripper'),
        ]
        # let's check the timestamps
        timestamp_result = dict(
            (row[0], (row[1], row[2]))
            for row in db_session.execute('''
                select "internal_event_id", "inserted_ts", "updated_ts"
                from {}
            '''.format(master_table_name)))
        # created timestamp of an old row that was not updated should be
        # the same as an old row that was updated
        assert timestamp_result[OLD][0] == timestamp_result[UPDATED][0]
        # updated timestamp of an old row that was not updated should be
        # lower than an old row that was updated
        assert timestamp_result[OLD][1] < timestamp_result[UPDATED][1]
        # created and updated timestamps of totally new rows should be equivalent
        assert timestamp_result[NEW][0] == timestamp_result[NEW][1]
        merge_logs = db_session.query(MergeLog).all()
        assert len(merge_logs) == 2
        assert merge_logs[0].upload_id == '123-456'
        assert merge_logs[1].upload_id == '234-567'
        assert merge_logs[0].total_unique_rows == 3
        assert merge_logs[1].total_unique_rows == 2
        assert merge_logs[0].new_unique_rows == 3
        assert merge_logs[1].new_unique_rows == 1
def sync_merged_file_to_storage(jurisdiction, event_type, db_engine):
    full_path = merged_file_path(jurisdiction, event_type)
    table_name = generate_master_table_name(jurisdiction, event_type)
    with open_sesame(full_path, 'wb') as outfile:
        cursor = db_engine.raw_connection().cursor()
        copy_stmt = 'copy "{}" to stdout with csv header delimiter as \'|\''.format(
            table_name)
        cursor.copy_expert(copy_stmt, outfile)
def source_data_to_filehandle(jurisdiction, event_type):
    master_table = generate_master_table_name(jurisdiction, event_type)
    out_filehandle = io.BytesIO()
    cursor = db.engine.raw_connection().cursor()
    copy_stmt = 'copy {} to stdout with csv header delimiter as \',\''.format(
        master_table)
    cursor.copy_expert(copy_stmt, out_filehandle)
    out_filehandle.seek(0)
    return out_filehandle
def create_merged_table(jurisdiction, event_type, db_session):
    master_table_name = generate_master_table_name(jurisdiction, event_type)
    goodtables_schema = load_schema_file(event_type)
    full_column_list = master_table_column_list(goodtables_schema)
    create = create_statement_from_column_list(full_column_list,
                                               master_table_name,
                                               goodtables_schema['primaryKey'])
    # create table if it does not exist
    logging.info('Assembled create-if-not-exists table statement: %s', create)
    db_session.execute(create)
def create_and_populate_merged_table(table_name, data, db_engine):
    schema = load_schema_file('test')
    n_fields = len(schema['fields'])
    for row in data:
        assert len(row) == n_fields, \
            "sample merged data must have same # of fields as test schema"
    placeholder_string = ', '.join(['%s'] * n_fields)
    master_table_name = generate_master_table_name('test', 'test')
    column_list = master_table_column_list(schema)
    # pass the table name and primary key, matching the other call sites of
    # create_statement_from_column_list
    create = create_statement_from_column_list(column_list, master_table_name,
                                               schema['primaryKey'])
    db_engine.execute(create)
    for row in data:
        db_engine.execute(
            'insert into "{}" values ({}, now(), now())'.format(
                master_table_name, placeholder_string), *row)
def upsert_raw_table_to_master(raw_table_name, jurisdiction, event_type,
                               upload_id, db_session):
    create_merged_table(jurisdiction, event_type, db_session)
    master_table_name = generate_master_table_name(jurisdiction, event_type)
    goodtables_schema = load_schema_file(event_type)
    base_column_list = column_list_from_goodtables_schema(goodtables_schema)
    # use new postgres 'on conflict' functionality to upsert
    update_statements = [
        ' "{column}" = EXCLUDED."{column}"'.format(column=column_def[0])
        for column_def in base_column_list
    ]
    start_ts = datetime.today()
    insert_sql = '''
        insert into {master}
        select raw.*,
            '{new_ts}' inserted_ts,
            '{new_ts}' updated_ts,
            row_number() over ()::text || '{event_type}' as matched_id
        from "{raw}" as raw
        on conflict ({primary_key})
        do update set {update_string}, updated_ts = '{new_ts}'
    '''.format(raw=raw_table_name,
               master=master_table_name,
               event_type=event_type,
               primary_key=', '.join([
                   "\"{}\"".format(col)
                   for col in goodtables_schema['primaryKey']
               ]),
               update_string=', '.join(update_statements),
               new_ts=start_ts.isoformat())
    logging.info('Executing insert: %s', insert_sql)
    db_session.execute(insert_sql)
    end_ts = datetime.today()
    merge_log = MergeLog(
        upload_id=upload_id,
        total_unique_rows=total_unique_rows(raw_table_name,
                                            goodtables_schema['primaryKey'],
                                            db_session),
        new_unique_rows=new_unique_rows(master_table_name, start_ts, db_session),
        merge_start_timestamp=start_ts,
        merge_complete_timestamp=end_ts,
    )
    db_session.add(merge_log)
    db_session.execute('drop table "{}"'.format(raw_table_name))
    db_session.commit()
    return merge_log.id
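# Illustrative sketch (not part of the original module): the shape of the
# ON CONFLICT statement that upsert_raw_table_to_master renders. The raw table
# "234-567", master table "master_demo", timestamp, and a goodtables schema
# whose column list is just the two primary-key fields ("internal_event_id",
# "internal_person_id") are all assumptions chosen for illustration only.
def _example_upsert_statement():
    return '''
        insert into master_demo
        select raw.*,
            '2018-04-01T00:00:00' inserted_ts,
            '2018-04-01T00:00:00' updated_ts,
            row_number() over ()::text || 'demo_event_type' as matched_id
        from "234-567" as raw
        on conflict ("internal_event_id", "internal_person_id")
        do update set
            "internal_event_id" = EXCLUDED."internal_event_id",
            "internal_person_id" = EXCLUDED."internal_person_id",
            updated_ts = '2018-04-01T00:00:00'
    '''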
def test_update_nonoverlapping(self):
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        Base.metadata.create_all(engine)
        db_session = scoped_session(
            sessionmaker(autocommit=False, autoflush=False, bind=engine))
        # do initial insert, representing the first time data is uploaded
        self.populate_seed_data(db_session)
        raw_table_name = '234-567'
        new_data = [
            [
                u'123458', u'456790', 'A345', 'Jack F. Ripper', '1896-04-10',
                '345-45-6789'
            ],
            [
                u'123459', u'456791', 'A345', 'Jack R. Ripper', '1896-04-10',
                '345-45-6780'
            ],
        ]
        create_and_populate_raw_table('234-567', new_data, db_session.bind)
        upsert_raw_table_to_master(raw_table_name, self.jurisdiction,
                                   self.event_type, '234-567', db_session)
        master_table_name = generate_master_table_name(
            self.jurisdiction, self.event_type)
        result = self.get_pks_and_names(master_table_name, db_session)
        assert len(result) == 5
        assert result == [
            ('456780', 'A345', 'Jack L. Ripper'),
            ('456780', 'A346', 'Jack L. Ripper'),
            ('456789', 'A345', 'Jack T. Ripper'),
            ('456790', 'A345', 'Jack F. Ripper'),
            ('456791', 'A345', 'Jack R. Ripper'),
        ]
        merge_logs = db_session.query(MergeLog).all()
        assert len(merge_logs) == 2
        assert merge_logs[0].upload_id == '123-456'
        assert merge_logs[1].upload_id == '234-567'
        assert merge_logs[0].total_unique_rows == 3
        assert merge_logs[1].total_unique_rows == 2
        assert merge_logs[0].new_unique_rows == 3
        assert merge_logs[1].new_unique_rows == 2
def test_new_table(self):
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        Base.metadata.create_all(engine)
        db_session = scoped_session(
            sessionmaker(autocommit=False, autoflush=False, bind=engine))
        # do initial insert, representing the first time data is uploaded
        self.populate_seed_data(db_session)
        master_table_name = generate_master_table_name(
            self.jurisdiction, self.event_type)
        result = self.get_pks_and_names(master_table_name, db_session)
        assert len(result) == 3
        assert result == [
            ('456780', 'A345', 'Jack L. Ripper'),
            ('456780', 'A346', 'Jack L. Ripper'),
            ('456789', 'A345', 'Jack T. Ripper'),
        ]
        merge_logs = db_session.query(MergeLog).all()
        assert len(merge_logs) == 1
        assert merge_logs[0].upload_id == '123-456'
        assert merge_logs[0].total_unique_rows == 3
        assert merge_logs[0].new_unique_rows == 3
def write_matches_to_db(db_engine, event_type, jurisdiction, matches_filehandle):
    goodtables_schema = load_schema_file(event_type)
    table_name = generate_master_table_name(event_type=event_type,
                                            jurisdiction=jurisdiction)
    logging.info('Writing matches for %s / %s to table %s', event_type,
                 jurisdiction, table_name)
    reader = csv.reader(matches_filehandle, delimiter='|')
    ordered_column_names = next(reader)
    matches_filehandle.seek(0)
    # 1. create pseudo-temporary table for the raw matches file
    # use the CSV's column order but grab the definitions from the goodtables schema
    unordered_column_list = column_list_from_goodtables_schema(goodtables_schema)
    primary_key = goodtables_schema['primaryKey']
    all_columns = [
        ('matched_id', 'varchar')
    ] + [col for col in unordered_column_list if col[0] in primary_key]
    column_definitions = dict((col[0], col) for col in all_columns)
    ordered_column_list = [
        column_definitions[ordered_column_name]
        for ordered_column_name in ordered_column_names
    ]
    logging.info('Final column list for temporary matches-only table: %s',
                 ordered_column_list)
    create = create_statement_from_column_list(ordered_column_list, table_name,
                                               primary_key)
    temp_table_name = 'temp_matched_merge_tbl'
    create = create.replace(table_name, temp_table_name)
    logging.info(create)
    db_engine.execute(create)
    # 2. copy data from the filehandle to the temporary table
    conn = db_engine.raw_connection()
    cursor = conn.cursor()
    pk = ','.join([col for col in primary_key])
    copy_stmt = 'copy {} from stdin with csv header delimiter as \'|\' force not null {}'.format(
        temp_table_name, pk)
    try:
        logging.info(copy_stmt)
        cursor.copy_expert(copy_stmt, matches_filehandle)
        logging.info('Status message after COPY: %s', cursor.statusmessage)
        for notice in conn.notices:
            logging.info('Notice from database connection: %s', notice)
        conn.commit()
        cursor = conn.cursor()
        cursor.execute('select * from {} limit 5'.format(temp_table_name))
        logging.info('First five rows: %s', [row for row in cursor])
        # 3. update matched_id on the master table from the temporary table,
        # joining on the primary key
        big_query = r"""
            update {matched_table} as m
            set matched_id = regexp_replace(tmp.matched_id::text, '[^\w]', '', 'g')
            from {temp_table_name} tmp
            where ({pk})
        """.format(matched_table=table_name,
                   temp_table_name=temp_table_name,
                   pk=' and '.join([
                       'tmp.{col} = m.{col}'.format(col=col)
                       for col in primary_key
                   ]))
        logging.info('Updating matches in %s with rows from %s', table_name,
                     temp_table_name)
        logging.info(big_query)
        cursor.execute(big_query)
        logging.info('Status message after UPDATE: %s', cursor.statusmessage)
        conn.commit()
    except Exception as e:
        logging.error(
            'Error encountered! Rolling back merge of matched ids. Original error: %s',
            str(e))
        conn.rollback()
    finally:
        db_engine.execute('drop table if exists {}'.format(temp_table_name))
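# Illustrative sketch (not part of the original module): the UPDATE that
# write_matches_to_db issues after COPYing the matches file into the temporary
# table, rendered for a hypothetical master table "master_demo" and a primary
# key of ("internal_event_id", "internal_person_id"). These names are
# assumptions for illustration only; the real statement is built from the
# goodtables schema at runtime.
def _example_matched_id_update():
    return r'''
        update master_demo as m
        set matched_id = regexp_replace(tmp.matched_id::text, '[^\w]', '', 'g')
        from temp_matched_merge_tbl tmp
        where (tmp.internal_event_id = m.internal_event_id
               and tmp.internal_person_id = m.internal_person_id)
    '''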
def get_records_by_time(start_time, end_time, jurisdiction, limit, offset,
                        order_column, order, set_status):
    hmis_table = generate_master_table_name(jurisdiction, 'hmis_service_stays')
    bookings_table = generate_master_table_name(jurisdiction, 'jail_bookings')
    hmis_exists = table_exists(hmis_table, db.engine)
    bookings_exists = table_exists(bookings_table, db.engine)
    if not hmis_exists:
        raise ValueError(
            'HMIS table {} does not exist. Please try again later.'.format(
                hmis_table))
    if not bookings_exists:
        raise ValueError(
            'Bookings table {} does not exist. Please try again later.'.format(
                bookings_table))
    columns = [
        ("matched_id::text", 'matched_id'),
        ("coalesce(hmis_summary.first_name, jail_summary.first_name)", 'first_name'),
        ("coalesce(hmis_summary.last_name, jail_summary.last_name)", 'last_name'),
        ("hmis_summary.hmis_id", 'hmis_id'),
        ("hmis_summary.hmis_contact", 'hmis_contact'),
        ("hmis_summary.last_hmis_contact", 'last_hmis_contact'),
        ("hmis_summary.cumu_hmis_days", 'cumu_hmis_days'),
        ("jail_summary.jail_id", 'jail_id'),
        ("jail_summary.jail_contact", 'jail_contact'),
        ("jail_summary.last_jail_contact", 'last_jail_contact'),
        ("jail_summary.cumu_jail_days", 'cumu_jail_days'),
        ("jail_summary.percent_bookings_homeless_flag", "percent_bookings_homeless_flag"),
        ("coalesce(hmis_summary.hmis_contact, 0) + coalesce(jail_summary.jail_contact, 0)", 'total_contact'),
    ]
    if not any(alias == order_column for expression, alias in columns):
        raise ValueError(
            'Given order column expression does not match any alias in query. Exiting to avoid SQL injection attacks'
        )
    base_query = """WITH booking_duration_lookup AS (
        select
            coalesce(nullif(booking_number, ''), internal_event_id) as booking_id,
            max(coalesce(location_date::text, jail_entry_date)) as most_recent_location_date,
            max(case when jail_exit_date is not null
                then date_part('day', jail_exit_date::timestamp - jail_entry_date::timestamp)::int
                else date_part('day', updated_ts::timestamp - jail_entry_date::timestamp)::int
            end) as length_of_stay,
            bool_or(coalesce(homeless, 'N') = 'Y') as any_homeless
        FROM (
            SELECT *
            FROM {booking_table}
            WHERE not (jail_entry_date < %(start_date)s AND jail_exit_date < %(start_date)s)
            and not (jail_entry_date > %(end_date)s AND jail_exit_date > %(end_date)s)
        ) AS jail
        group by 1
    ), hmis_summary AS (
        SELECT
            matched_id::text,
            string_agg(distinct internal_person_id::text, ',') as hmis_id,
            sum(
                case when client_location_end_date is not null
                then date_part('day', client_location_end_date::timestamp - client_location_start_date::timestamp)
                else date_part('day', updated_ts::timestamp - client_location_start_date::timestamp)
                end
            )::int as cumu_hmis_days,
            count(*) AS hmis_contact,
            to_char(max(client_location_start_date::timestamp), 'YYYY-MM-DD') as last_hmis_contact,
            max(first_name) as first_name,
            max(last_name) as last_name
        FROM (
            SELECT *
            FROM {hmis_table}
            WHERE not (client_location_start_date < %(start_date)s AND client_location_end_date < %(start_date)s)
            and not (client_location_start_date > %(end_date)s AND client_location_end_date > %(end_date)s)
        ) AS hmis
        GROUP BY matched_id
    ), jail_summary AS (
        SELECT
            matched_id::text,
            string_agg(distinct coalesce(nullif(internal_person_id, ''), inmate_number)::text, ',') as jail_id,
            sum(jail.length_of_stay) as cumu_jail_days,
            count(distinct(coalesce(nullif(booking_number, ''), internal_event_id))) AS jail_contact,
            to_char(max(jail_entry_date::timestamp), 'YYYY-MM-DD') as last_jail_contact,
            max(first_name) as first_name,
            max(last_name) as last_name,
            round(100 * count(case when any_homeless then 1 else null end) / count(*)::float)::text || '%%' as percent_bookings_homeless_flag
        FROM (
            SELECT *
            FROM {booking_table}
            join booking_duration_lookup bdl on (
                bdl.booking_id = coalesce(nullif(booking_number, ''), internal_event_id)
                and bdl.most_recent_location_date = coalesce(location_date::text, jail_entry_date)
            )
            WHERE not (jail_entry_date < %(start_date)s AND jail_exit_date < %(start_date)s)
            and not (jail_entry_date > %(end_date)s AND jail_exit_date > %(end_date)s)
        ) AS jail
        GROUP BY matched_id
    )
    SELECT {columns}
    FROM hmis_summary
    FULL OUTER JOIN jail_summary USING(matched_id)
    """.format(
        hmis_table=hmis_table,
        booking_table=bookings_table,
        columns=",\n".join("{} as {}".format(expression, alias)
                           for expression, alias in columns),
    )
    logging.info('Querying table records')
    if order not in {'asc', 'desc'}:
        raise ValueError(
            'Given order direction is not valid. Exiting to avoid SQL injection attacks'
        )
    if not isinstance(limit, int) and not limit.isdigit() and limit != 'ALL':
        raise ValueError(
            'Given limit is not valid. Exiting to avoid SQL injection attacks'
        )
    filter_by_status = {
        'Jail': 'jail_summary.matched_id is not null',
        'HMIS': 'hmis_summary.matched_id is not null',
        'Intersection': 'hmis_summary.matched_id = jail_summary.matched_id'
    }
    status_filter = filter_by_status.get(set_status, 'true')
    rows_to_show = [
        dict(row) for row in db.engine.execute(
            """{}
            where {}
            order by {} {}
            limit {} offset %(offset)s""".format(base_query, status_filter,
                                                 order_column, order, limit),
            start_date=start_time,
            end_date=end_time,
            offset=offset,
        )
    ]
    query = """
        SELECT *, DATE_PART('day', {exit}::timestamp - {start}::timestamp) as days
        FROM {table_name}
        WHERE not ({start} < %(start_time)s AND {exit} < %(start_time)s)
        and not ({start} > %(end_time)s AND {exit} > %(end_time)s)
    """
    hmis_query = query.format(table_name=hmis_table,
                              start="client_location_start_date",
                              exit="client_location_end_date")
    bookings_query = query.format(table_name=bookings_table,
                                  start="jail_entry_date",
                                  exit="jail_exit_date")
    logging.info('Done querying table records')
    logging.info('Querying venn diagram stats')
    venn_diagram_stats = next(
        db.engine.execute('''select
                count(distinct(hmis.matched_id)) as hmis_size,
                count(distinct(bookings.matched_id)) as bookings_size,
                count(distinct(case when hmis.matched_id = bookings.matched_id then hmis.matched_id else null end)) as shared_size,
                count(distinct(matched_id))
            from ({}) hmis
            full outer join ({}) bookings using (matched_id)
        '''.format(hmis_query, bookings_query),
                          start_time=start_time,
                          end_time=end_time))
    counts_by_status = {
        'HMIS': venn_diagram_stats[0],
        'Jail': venn_diagram_stats[1],
        'Intersection': venn_diagram_stats[2]
    }
    logging.info('Done querying venn diagram stats')
    venn_diagram_data = [{
        "sets": ["Jail"],
        "size": venn_diagram_stats[1]
    }, {
        "sets": ["Homeless"],
        "size": venn_diagram_stats[0]
    }, {
        "sets": ["Jail", "Homeless"],
        "size": venn_diagram_stats[2]
    }]
    logging.info('Retrieving bar data from database')
    filtered_data = retrieve_bar_data(hmis_table, bookings_table, start_time, end_time)
    logging.info('Done retrieving bar data from database')
    filtered_data['tableData'] = rows_to_show
    return {
        "vennDiagramData": venn_diagram_data,
        "totalTableRows": counts_by_status.get(set_status, venn_diagram_stats[3]),
        "filteredData": filtered_data
    }