def test_match_finished(write_match_log_mock, write_matches_to_db_mock):
    s3 = s3fs.S3FileSystem()
    s3.touch('bucket')
    with open_sesame('s3://bucket/matcher/jur1/hmis_service_stays/matched', 'wb') as f:
        f.write(b'test')
    with open_sesame('s3://bucket/matcher/jur1/jail_bookings/matched', 'wb') as f:
        f.write(b'test')
    with open_sesame('s3://bucket/matcher/jur1/jail_booking_charges/matched', 'wb') as f:
        f.write(b'test')
    match_finished(
        matched_results_paths={
            'hmis_service_stays': 's3://bucket/matcher/jur1/hmis_service_stays/matched',
            'jail_bookings': 's3://bucket/matcher/jur1/jail_bookings/matched',
            'jail_booking_charges': 's3://bucket/matcher/jur1/jail_booking_charges/matched'
        },
        match_job_id='123abc',
        match_start_at=datetime.today(),
        match_complete_at=datetime.today(),
        match_status=True,
        match_runtime=timedelta(seconds=0),
        upload_id=None)
    assert write_match_log_mock.called
    assert write_matches_to_db_mock.call_count == 3
    for call_args in write_matches_to_db_mock.call_args_list:
        assert call_args[1]['jurisdiction'] == 'jur1'
        assert call_args[1]['event_type'] in {
            'jail_bookings',
            'jail_booking_charges',
            'hmis_service_stays'
        }
def match_finished(matched_results_paths,
                   match_job_id,
                   match_start_at,
                   match_complete_at,
                   match_status,
                   match_runtime,
                   upload_id=None):
    try:
        logger.info('Writing to match log')
        write_match_log(db_session=db_session,
                        match_job_id=match_job_id,
                        match_start_at=match_start_at,
                        match_complete_at=match_complete_at,
                        match_status=match_status,
                        match_runtime=match_runtime,
                        upload_id=upload_id)
        logger.info('Writing matches to db')
        for event_type, filename in matched_results_paths.items():
            jurisdiction = filename.split('/')[-3]
            logger.info(
                'Writing matches from event type %s and filename %s to db. '
                'Parsed jurisdiction %s out of filename',
                event_type, filename, jurisdiction)
            with open_sesame(filename, 'rb') as matches_filehandle:
                write_matches_to_db(db_engine=engine,
                                    event_type=event_type,
                                    jurisdiction=jurisdiction,
                                    matches_filehandle=matches_filehandle)
    except Exception as e:
        logger.error('Error encountered during match_finished: %s', str(e))
    finally:
        logger.info('All done!')
def test_good_file(self, request_mock):
    with full_rig_with_s3() as (app, engine):
        upload_id = self.do_upload(app, request_mock)
        # okay, here's what we really want to test:
        # call the merge endpoint
        response = app.post(
            '/api/upload/merge_file?uploadId={}'.format(upload_id))
        response_data = json.loads(response.get_data().decode('utf-8'))
        assert response_data['status'] == 'success'
        # make sure that there is a new merged file on s3
        expected_s3_path = 's3://test-bucket/boone/hmis_service_stays/merged'
        # open in text mode; csv.reader requires strings, not bytes
        with open_sesame(expected_s3_path, 'rt') as expected_s3_file:
            reader = csv.reader(expected_s3_file)
            assert len([row for row in reader]) == ROWS_IN_GOOD_HMIS_FILE
        # and make sure that the merge log has a record of this
        assert db_session.query(MergeLog).filter(
            MergeLog.upload_id == '123-456').one()
        # make sure that the master table has been bootstrapped
        master_table = generate_master_table_name('boone', 'hmis_service_stays')
        total_rows = db_session.query(
            'count(*) from {}'.format(master_table)).one()
        assert total_rows == (ROWS_IN_GOOD_HMIS_FILE - 1, )
        # make sure that we filled in some matched ids
        total_rows = db_session.query(
            'count(matched_id is not null) from {}'.format(
                master_table)).one()
        assert total_rows == (ROWS_IN_GOOD_HMIS_FILE - 1, )
def sync_merged_file_to_storage(jurisdiction, event_type, db_engine):
    full_path = merged_file_path(jurisdiction, event_type)
    table_name = generate_master_table_name(jurisdiction, event_type)
    with open_sesame(full_path, 'wb') as outfile:
        cursor = db_engine.raw_connection().cursor()
        copy_stmt = 'copy "{}" to stdout with csv header delimiter as \'|\''.format(
            table_name)
        cursor.copy_expert(copy_stmt, outfile)
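# `merged_file_path` and `generate_master_table_name` are not defined in this
# module. A plausible sketch, inferred from the paths and table names the
# tests here expect ('s3://test-bucket/{jurisdiction}/{event_type}/merged');
# the 'merged_uploads_path' config key is hypothetical, by analogy with the
# 'raw_uploads_path' key patched in test_upload_to_s3 below:
def merged_file_path(jurisdiction, event_type):
    # Hypothetical config key; the real implementation may build the
    # path differently.
    return app_config['merged_uploads_path'].format(
        jurisdiction=jurisdiction, event_type=event_type)


def generate_master_table_name(jurisdiction, event_type):
    # One deterministic naming convention per jurisdiction/event type;
    # the real scheme may differ.
    return '{}_{}_master'.format(jurisdiction, event_type)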
def test_good_file(self):
    with full_rig_with_s3() as (app, engine):
        response = app.post(
            '/api/upload/upload_file?jurisdiction=boone&eventType=hmis_service_stays',
            content_type='multipart/form-data',
            data={'file_field': (open(GOOD_HMIS_FILE, 'rb'), 'myfile.csv')})
        response_data = json.loads(response.get_data().decode('utf-8'))
        assert response_data['status'] == 'validating'
        assert 'jobKey' in response_data
        assert 'message' in response_data
        job_key = response_data['jobKey']

        # get validation result and upload to s3
        response = app.get('/api/upload/validated_result/' + job_key)
        response_data = json.loads(response.get_data().decode('utf-8'))
        assert 'validation' in response_data
        assert response_data['validation']['status'] == 'valid'
        assert response_data['validation']['jobKey'] == job_key
        assert 'upload_result' in response_data
        assert 'rowCount' in response_data['upload_result']
        assert 'exampleRows' in response_data['upload_result']
        assert 'uploadId' in response_data['upload_result']
        assert 'fieldOrder' in response_data['upload_result']

        current_date = date.today().isoformat()
        expected_s3_path = 's3://test-bucket/boone/hmis_service_stays/uploaded/{}/{}'.format(
            current_date, response_data['upload_result']['uploadId'])
        with open_sesame(expected_s3_path) as expected_s3_file:
            with open_sesame(GOOD_HMIS_FILE) as source_file:
                # we do not expect the file on s3 to be the same as the
                # uploaded source file - missing columns should be filled in
                s3_df = pd.read_csv(expected_s3_file)
                source_df = pd.read_csv(source_file, sep='|')
                assert source_df.equals(s3_df[source_df.columns.tolist()])
        assert db_session.query(Upload).filter(
            Upload.id == response_data['upload_result']['uploadId']).one()
def test_copy_raw_table_to_db_duplicate(self):
    # we create a file with duplicates;
    # upon copying to the raw table, we expect the duplicate error to be
    # tripped and presented in a user-friendly format
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        with moto.mock_s3():
            s3 = s3fs.S3FileSystem()
            s3.touch('test-bucket')
            full_s3_path = 's3://test-bucket/123-456'
            # open in text mode; csv.writer requires strings, not bytes
            with open_sesame(full_s3_path, 'wt') as writefile:
                writer = csv.writer(writefile)
                for row in [
                    ['internal_person_id', 'internal_event_id', 'location_id',
                     'full_name', 'birthdate', 'ssn'],
                    ['123456', '456789', 'A345', 'Jack T. Ripper',
                     '1896-04-10', '345-45-6789'],
                    ['123457', '456780', 'A345', 'Jack L. Ripper',
                     '1896-04-10', '345-45-6780'],
                    ['123457', '456780', 'A345', 'Jack L. Ripper',
                     '1896-04-10', '345-45-6780'],
                    ['123457', '456780', 'A346', 'Jack L. Ripper',
                     '1896-04-10', '345-45-6780'],
                ]:
                    writer.writerow(row)
            jurisdiction = 'test'
            event_type = 'test'
            with self.assertRaisesRegex(
                ValueError,
                expected_regex=r'.*line 4.*internal_event_id, location_id.*456780.*A345.*'
            ):
                copy_raw_table_to_db(full_s3_path, event_type, '123-456', engine)
def test_good_file(self, request_mock):
    with full_rig_with_s3() as (app, engine):
        upload_id = self.do_upload(app, request_mock)
        # okay, here's what we really want to test:
        # call the merge endpoint
        response = app.post(
            '/api/upload/merge_file?uploadId={}'.format(upload_id))
        response_data = json.loads(response.get_data().decode('utf-8'))
        assert response_data['status'] == 'success'
        # make sure that there is a new merged file on s3
        expected_s3_path = 's3://test-bucket/boone/jail_bookings/merged'
        # open in text mode; csv.reader requires strings, not bytes
        with open_sesame(expected_s3_path, 'rt') as expected_s3_file:
            reader = csv.reader(expected_s3_file)
            assert len([row for row in reader]) == 11
        # and make sure that the merge log has a record of this
        assert db_session.query(MergeLog).filter(
            MergeLog.upload_id == '123-456').one()
        # and make sure that the raw table is no longer there
        assert not table_exists(generate_raw_table_name(upload_id),
                                db_session.bind)
def copy_raw_table_to_db(full_path, event_type, upload_id, db_engine):
    goodtables_schema = load_schema_file(event_type)
    logging.info('Loaded schema: %s', goodtables_schema)
    table_name = generate_raw_table_name(upload_id)
    create_statement = create_statement_from_goodtables_schema(
        goodtables_schema, table_name)
    logging.info('Assembled create table statement: %s', create_statement)
    db_engine.execute(create_statement)
    logging.info('Successfully created table')
    primary_key = primary_key_statement(goodtables_schema['primaryKey'])
    with open_sesame(full_path, 'rb') as infile:
        conn = db_engine.raw_connection()
        cursor = conn.cursor()
        copy_stmt = 'copy "{}" from stdin with csv force not null {} header delimiter as \',\''.format(
            table_name, primary_key)
        try:
            cursor.copy_expert(copy_stmt, infile)
            conn.commit()
        except psycopg2.IntegrityError as e:
            error_message = str(e)
            conn.rollback()
            if 'duplicate key value violates unique constraint' not in error_message:
                raise
            # psycopg2 duplicate-key errors arrive as three lines:
            #   ERROR (the constraint), DETAIL (the offending key values),
            #   CONTEXT (which includes 'line N' for COPY).
            error_message_lines = error_message.split('\n')
            if len(error_message_lines) < 3:
                raise
            line_no_match = re.match(r'^.*(line \d+)', error_message_lines[2])
            if not line_no_match:
                raise
            line_no = line_no_match.group(1)
            raise ValueError(
                f"Duplicate key value found on {line_no}. {error_message_lines[1]}"
            )
        finally:
            conn.close()
    logging.info('Successfully loaded file')
    return table_name
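# `primary_key_statement` is not shown in this module. Its output is
# interpolated into the COPY statement's `force not null` clause above, so
# here is a minimal sketch consistent with that usage (an assumption, not
# the confirmed implementation):
def primary_key_statement(primary_key_columns):
    # Render e.g. ['internal_person_id', 'internal_event_id'] as
    # '"internal_person_id", "internal_event_id"' so that COPY never
    # converts empty values in the key columns to NULLs.
    return ', '.join('"{}"'.format(column) for column in primary_key_columns)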
def sync_upload_metadata(upload_id,
                         event_type,
                         jurisdiction,
                         flask_user_id,
                         given_filename,
                         local_filename,
                         db_session,
                         s3_upload_path=None,
                         validate_start_time=None,
                         validate_complete_time=None,
                         validate_status=None,
                         upload_start_time=None,
                         upload_complete_time=None,
                         upload_status=None):
    with open_sesame(local_filename, 'rb') as infile:
        # first pass counts rows, then rewind to hash the full contents
        num_rows = sum(1 for _ in infile)
        infile.seek(0)
        file_size = os.fstat(infile.fileno()).st_size
        file_hash = md5(infile.read()).hexdigest()
        write_upload_log(db_session=db_session,
                         upload_id=upload_id,
                         jurisdiction_slug=jurisdiction,
                         event_type_slug=event_type,
                         user_id=flask_user_id,
                         given_filename=given_filename,
                         upload_start_time=upload_start_time,
                         upload_complete_time=upload_complete_time,
                         upload_status=upload_status,
                         validate_start_time=validate_start_time,
                         validate_complete_time=validate_complete_time,
                         validate_status=validate_status,
                         num_rows=num_rows,
                         file_size=file_size,
                         file_hash=file_hash,
                         s3_upload_path=s3_upload_path)
def test_upload_to_s3():
    with moto.mock_s3():
        s3 = s3fs.S3FileSystem()
        s3.touch('test-bucket')
        with makeNamedTemporaryCSV([
            ['col1', 'col2'],
            ['val1_1', 'val1_2'],
            ['v\xedl2_1', 'val2_2'],
        ]) as filename:
            sample_config = {
                'raw_uploads_path':
                's3://test-bucket/{jurisdiction}/{event_type}/uploaded/{date}/{upload_id}'
            }
            with patch.dict('backend.utils.app_config', sample_config):
                final_upload_path = upload_path('boone', 'hmis', '123-567-abc')
                upload_to_storage(final_upload_path, filename)
                current_date = date.today().isoformat()
                expected_s3_path = 's3://test-bucket/boone/hmis/uploaded/{}/123-567-abc'.format(
                    current_date)
                with open_sesame(expected_s3_path, 'rb') as expected_s3_file:
                    content = expected_s3_file.read()
                assert 'val1_1' in content.decode('utf-8')
def test_copy_raw_table_to_db():
    # start with an s3 upload path that we assume to exist;
    # given the event type and jurisdiction, we expect the file to be copied
    # into a new table with the proper schema and the table name returned
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        with moto.mock_s3():
            s3 = s3fs.S3FileSystem()
            s3.touch('test-bucket')
            full_s3_path = 's3://test-bucket/123-456'
            # open in text mode; csv.writer requires strings, not bytes
            with open_sesame(full_s3_path, 'wt') as writefile:
                writer = csv.writer(writefile)
                for row in [
                    ['internal_person_id', 'internal_event_id', 'location_id',
                     'full_name', 'birthdate', 'ssn'],
                    ['123456', '456789', 'A345', 'Jack T. Ripper',
                     '1896-04-10', '345-45-6789'],
                    ['123457', '456780', 'A345', 'Jack L. Ripper',
                     '1896-04-10', '345-45-6780'],
                    ['123457', '456780', 'A346', 'Jack L. Ripper',
                     '1896-04-10', '345-45-6780'],
                ]:
                    writer.writerow(row)
            jurisdiction = 'test'
            event_type = 'test'
            written_raw_table = copy_raw_table_to_db(
                full_s3_path, event_type, '123-456', engine)
            assert sum(1 for _ in engine.execute(
                'select * from "{}"'.format(written_raw_table))) == 3
def upload_to_storage(full_path, local_filename):
    with open_sesame(full_path, 'wb') as outfile:
        with open_sesame(local_filename, 'rb') as infile:
            outfile.write(infile.read())
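# `open_sesame` itself is not defined in this module but is used throughout
# as a context manager that opens either local paths or s3:// paths with an
# open()-style mode string. A minimal sketch of that behavior, assuming an
# s3fs-backed implementation (the real helper may differ):
import s3fs
from contextlib import contextmanager


@contextmanager
def open_sesame(path, mode='rb'):
    # Route s3:// paths through s3fs; everything else through builtin open.
    # The mode string is passed through unchanged in both cases.
    if path.startswith('s3://'):
        s3 = s3fs.S3FileSystem()
        with s3.open(path, mode) as fh:
            yield fh
    else:
        with open(path, mode) as fh:
            yield fh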