Example 1
def test_match_finished(write_match_log_mock, write_matches_to_db_mock):
    s3 = s3fs.S3FileSystem()
    s3.touch('bucket')
    with open_sesame('s3://bucket/matcher/jur1/hmis_service_stays/matched',
                     'wb') as f:
        f.write(b'test')
    with open_sesame('s3://bucket/matcher/jur1/jail_bookings/matched',
                     'wb') as f:
        f.write(b'test')
    with open_sesame('s3://bucket/matcher/jur1/jail_booking_charges/matched',
                     'wb') as f:
        f.write(b'test')
    match_finished(
        matched_results_paths={
            'hmis_service_stays': 's3://bucket/matcher/jur1/hmis_service_stays/matched',
            'jail_bookings': 's3://bucket/matcher/jur1/jail_bookings/matched',
            'jail_booking_charges': 's3://bucket/matcher/jur1/jail_booking_charges/matched',
        },
        match_job_id='123abc',
        match_start_at=datetime.today(),
        match_complete_at=datetime.today(),
        match_status=True,
        match_runtime=timedelta(seconds=0),
        upload_id=None,
    )
    assert write_match_log_mock.called
    assert write_matches_to_db_mock.call_count == 3
    for call_args in write_matches_to_db_mock.call_args_list:
        assert call_args[1]['jurisdiction'] == 'jur1'
        assert call_args[1]['event_type'] in {
            'jail_bookings', 'jail_booking_charges', 'hmis_service_stays'
        }
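The two mock arguments in this test are injected by mock.patch decorators that the excerpt does not show. A hypothetical reconstruction (the dotted patch targets are placeholders, not the project's actual module paths):

from unittest.mock import patch

# the bottom-most decorator maps to the first test argument, so
# write_match_log must be patched last; module paths are illustrative
@patch('backend.tasks.write_matches_to_db')
@patch('backend.tasks.write_match_log')
def test_match_finished(write_match_log_mock, write_matches_to_db_mock):
    ...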
Example 2
def match_finished(matched_results_paths,
                   match_job_id,
                   match_start_at,
                   match_complete_at,
                   match_status,
                   match_runtime,
                   upload_id=None):
    try:
        logger.info('Writing to match log')
        write_match_log(db_session=db_session,
                        match_job_id=match_job_id,
                        match_start_at=match_start_at,
                        match_complete_at=match_complete_at,
                        match_status=match_status,
                        match_runtime=match_runtime,
                        upload_id=upload_id)
        logger.info('Writing matches to db')
        for event_type, filename in matched_results_paths.items():
            jurisdiction = filename.split('/')[-3]
            logger.info(
                'Writing matches from event type %s and filename %s to db. Parsed jurisdiction %s out of filename',
                event_type, filename, jurisdiction)
            with open_sesame(filename, 'rb') as matches_filehandle:
                write_matches_to_db(db_engine=engine,
                                    event_type=event_type,
                                    jurisdiction=jurisdiction,
                                    matches_filehandle=matches_filehandle)
    except Exception as e:
        logger.error('Error encountered during match_finished: %s', str(e))

    finally:
        logger.info('All done!')
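Every example on this page funnels file access through open_sesame, whose definition is not shown. A minimal sketch, assuming it simply dispatches on the URL scheme (s3fs for s3:// paths, the builtin open otherwise):

import s3fs


def open_sesame(path, *args, **kwargs):
    # hypothetical reimplementation for illustration only; the real helper
    # may differ (e.g. it may cache the S3FileSystem object or read
    # credentials from application config)
    if path.startswith('s3://'):
        return s3fs.S3FileSystem().open(path, *args, **kwargs)
    return open(path, *args, **kwargs)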
Example 3
    def test_good_file(self, request_mock):
        with full_rig_with_s3() as (app, engine):
            upload_id = self.do_upload(app, request_mock)
            # okay, here's what we really want to test.
            # call the merge endpoint
            response = app.post(
                '/api/upload/merge_file?uploadId={}'.format(upload_id))
            response_data = json.loads(response.get_data().decode('utf-8'))
            assert response_data['status'] == 'success'
            # make sure that there is a new merged file on s3
            expected_s3_path = 's3://test-bucket/boone/hmis_service_stays/merged'
            with open_sesame(expected_s3_path, 'rb') as expected_s3_file:
                # csv.reader needs a text stream in Python 3, so wrap the
                # binary handle before counting rows
                reader = csv.reader(
                    io.TextIOWrapper(expected_s3_file, encoding='utf-8'))
                assert len([row for row in reader]) == ROWS_IN_GOOD_HMIS_FILE

            # and make sure that the merge log has a record of this
            assert db_session.query(MergeLog).filter(
                MergeLog.upload_id == '123-456').one()

            # make sure that the master table has been bootstrapped
            master_table = generate_master_table_name('boone',
                                                      'hmis_service_stays')
            total_rows = db_session.query(
                'count(*) from {}'.format(master_table)).one()
            assert total_rows == (ROWS_IN_GOOD_HMIS_FILE - 1, )

            # make sure that we filled in some matched ids
            total_rows = db_session.query(
                'count(matched_id is not null) from {}'.format(
                    master_table)).one()
            assert total_rows == (ROWS_IN_GOOD_HMIS_FILE - 1, )
Example 4
def sync_merged_file_to_storage(jurisdiction, event_type, db_engine):
    full_path = merged_file_path(jurisdiction, event_type)
    table_name = generate_master_table_name(jurisdiction, event_type)
    with open_sesame(full_path, 'wb') as outfile:
        cursor = db_engine.raw_connection().cursor()
        copy_stmt = 'copy "{}" to stdout with csv header delimiter as \'|\''.format(
            table_name)
        cursor.copy_expert(copy_stmt, outfile)
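merged_file_path and generate_master_table_name are project helpers that this excerpt assumes. Hypothetical stand-ins, consistent with the paths asserted in the tests above but not confirmed by the source:

def merged_file_path(jurisdiction, event_type):
    # hypothetical: mirrors the 's3://<bucket>/<jurisdiction>/<event_type>/merged'
    # layout seen in the merge tests; the real helper presumably reads the
    # bucket name from application config
    return 's3://test-bucket/{}/{}/merged'.format(jurisdiction, event_type)


def generate_master_table_name(jurisdiction, event_type):
    # hypothetical naming scheme; only the (jurisdiction, event_type)
    # inputs are confirmed by the surrounding code
    return 'master_{}_{}'.format(jurisdiction, event_type)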
Example 5
    def test_good_file(self):
        with full_rig_with_s3() as (app, engine):
            response = app.post(
                '/api/upload/upload_file?jurisdiction=boone&eventType=hmis_service_stays',
                content_type='multipart/form-data',
                data={
                    'file_field': (open(GOOD_HMIS_FILE, 'rb'), 'myfile.csv')
                })
            response_data = json.loads(response.get_data().decode('utf-8'))
            assert response_data['status'] == 'validating'
            assert 'jobKey' in response_data
            assert 'message' in response_data

            job_key = response_data['jobKey']

            # get validation result and upload to s3
            response = app.get('/api/upload/validated_result/' + job_key)
            response_data = json.loads(response.get_data().decode('utf-8'))

            assert 'validation' in response_data
            assert response_data['validation']['status'] == 'valid'
            assert response_data['validation']['jobKey'] == job_key

            assert 'upload_result' in response_data
            assert 'rowCount' in response_data['upload_result']
            assert 'exampleRows' in response_data['upload_result']
            assert 'uploadId' in response_data['upload_result']
            assert 'fieldOrder' in response_data['upload_result']

            current_date = date.today().isoformat()
            expected_s3_path = 's3://test-bucket/boone/hmis_service_stays/uploaded/{}/{}'.format(
                current_date, response_data['upload_result']['uploadId'])
            with open_sesame(expected_s3_path) as expected_s3_file:
                with open_sesame(GOOD_HMIS_FILE) as source_file:
                    # we do not expect the file on s3 to be the same as the
                    # uploaded source file - missing columns should be filled in
                    s3_df = pd.read_csv(expected_s3_file)
                    source_df = pd.read_csv(source_file, sep='|')
                    assert source_df.equals(s3_df[source_df.columns.tolist()])

            assert db_session.query(Upload).filter(
                Upload.id == response_data['upload_result']['uploadId']).one()
Example 6
    def test_copy_raw_table_to_db_duplicate(self):
        # we create a file with duplicates; upon copying to the raw table,
        # we expect the duplicate-key error to be tripped and presented in
        # a user-friendly format
        with testing.postgresql.Postgresql() as postgresql:
            engine = create_engine(postgresql.url())
            with moto.mock_s3():
                s3 = s3fs.S3FileSystem()
                s3.touch('test-bucket')
                full_s3_path = 's3://test-bucket/123-456'
                with open_sesame(full_s3_path, 'wb') as writefile:
                    # csv.writer needs a text stream in Python 3; wrap the
                    # binary handle and flush before the file closes
                    text_handle = io.TextIOWrapper(
                        writefile, encoding='utf-8', newline='')
                    writer = csv.writer(text_handle)
                    for row in [
                        ['internal_person_id', 'internal_event_id',
                         'location_id', 'full_name', 'birthdate', 'ssn'],
                        ['123456', '456789', 'A345', 'Jack T. Ripper',
                         '1896-04-10', '345-45-6789'],
                        ['123457', '456780', 'A345', 'Jack L. Ripper',
                         '1896-04-10', '345-45-6780'],
                        ['123457', '456780', 'A345', 'Jack L. Ripper',
                         '1896-04-10', '345-45-6780'],
                        ['123457', '456780', 'A346', 'Jack L. Ripper',
                         '1896-04-10', '345-45-6780'],
                    ]:
                        writer.writerow(row)
                    text_handle.flush()
                jurisdiction = 'test'
                event_type = 'test'
                with self.assertRaisesRegex(
                        ValueError,
                        r'.*line 4.*internal_event_id, location_id.*456780.*A345.*'):
                    copy_raw_table_to_db(full_s3_path, event_type, '123-456',
                                         engine)
Example 7
    def test_good_file(self, request_mock):
        with full_rig_with_s3() as (app, engine):
            upload_id = self.do_upload(app, request_mock)
            # okay, here's what we really want to test.
            # call the merge endpoint
            response = app.post(
                '/api/upload/merge_file?uploadId={}'.format(upload_id))
            response_data = json.loads(response.get_data().decode('utf-8'))
            assert response_data['status'] == 'success'
            # make sure that there is a new merged file on s3
            expected_s3_path = 's3://test-bucket/boone/jail_bookings/merged'
            with open_sesame(expected_s3_path, 'rb') as expected_s3_file:
                # csv.reader needs a text stream in Python 3, so wrap the
                # binary handle before counting rows
                reader = csv.reader(
                    io.TextIOWrapper(expected_s3_file, encoding='utf-8'))
                assert len([row for row in reader]) == 11

            # and make sure that the merge log has a record of this
            assert db_session.query(MergeLog).filter(
                MergeLog.upload_id == '123-456').one()

            # and make sure that the raw table is no longer there
            assert not table_exists(generate_raw_table_name(upload_id),
                                    db_session.bind)
Example 8
def copy_raw_table_to_db(full_path, event_type, upload_id, db_engine):
    goodtables_schema = load_schema_file(event_type)
    logging.info('Loaded schema: %s', goodtables_schema)
    table_name = generate_raw_table_name(upload_id)
    create_statement = create_statement_from_goodtables_schema(
        goodtables_schema, table_name)
    logging.info('Assembled create table statement: %s', create_statement)
    db_engine.execute(create_statement)
    logging.info('Successfully created table')
    primary_key = primary_key_statement(goodtables_schema['primaryKey'])
    with open_sesame(full_path, 'rb') as infile:
        conn = db_engine.raw_connection()
        cursor = conn.cursor()
        copy_stmt = 'copy "{}" from stdin with csv force not null {}  header delimiter as \',\' '.format(
            table_name, primary_key)
        try:
            cursor.copy_expert(copy_stmt, infile)
            conn.commit()
        except psycopg2.IntegrityError as e:
            error_message = str(e)
            conn.rollback()
            if 'duplicate key value violates unique constraint' not in error_message:
                raise
            error_message_lines = error_message.split('\n')
            if len(error_message_lines) < 3:
                raise
            line_no_match = re.match(r'^.*(line \d+)',
                                     error_message_lines[2])
            if not line_no_match:
                raise
            line_no = line_no_match.group(1)
            raise ValueError(
                f"Duplicate key value found on {line_no}. {error_message_lines[1]}"
            )
        finally:
            conn.close()
    logging.info('Successfully loaded file')
    return table_name
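The duplicate-key branch above depends on the shape of psycopg2's error text. A representative message (illustrative; the exact wording varies with PostgreSQL version and constraint name) and the parsing it drives:

import re

# roughly what str(e) yields for a COPY that violates a unique constraint
error_message = (
    'duplicate key value violates unique constraint "raw_123_456_pkey"\n'
    'DETAIL:  Key (internal_event_id, location_id)=(456780, A345) already exists.\n'
    'CONTEXT:  COPY raw_123_456, line 4'
)
error_message_lines = error_message.split('\n')
# line index 2 is the CONTEXT line, which names the offending input line
line_no_match = re.match(r'^.*(line \d+)', error_message_lines[2])
assert line_no_match.group(1) == 'line 4'
# error_message_lines[1] (the DETAIL line) carries the key columns and
# values that the user-facing ValueError surfaces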
Example 9
def sync_upload_metadata(
    upload_id,
    event_type,
    jurisdiction,
    flask_user_id,
    given_filename,
    local_filename,
    db_session,
    s3_upload_path=None,
    validate_start_time=None,
    validate_complete_time=None,
    validate_status=None,
    upload_start_time=None,
    upload_complete_time=None,
    upload_status=None,
):
    with open_sesame(local_filename, 'rb') as infile:
        num_rows = sum(1 for _ in infile)
        infile.seek(0)
        file_size = os.fstat(infile.fileno()).st_size
        file_hash = md5(infile.read()).hexdigest()

        write_upload_log(db_session=db_session,
                         upload_id=upload_id,
                         jurisdiction_slug=jurisdiction,
                         event_type_slug=event_type,
                         user_id=flask_user_id,
                         given_filename=given_filename,
                         upload_start_time=upload_start_time,
                         upload_complete_time=upload_complete_time,
                         upload_status=upload_status,
                         validate_start_time=validate_start_time,
                         validate_complete_time=validate_complete_time,
                         validate_status=validate_status,
                         num_rows=num_rows,
                         file_size=file_size,
                         file_hash=file_hash,
                         s3_upload_path=s3_upload_path)
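One caveat: infile.read() holds the entire file in memory while hashing. A chunked sketch (not the project's code) computes the same summary with bounded memory:

from hashlib import md5


def summarize_file(infile, chunk_size=1024 * 1024):
    # one streaming pass over a binary handle: newline count, byte count,
    # and md5 digest, without loading the whole file at once
    digest, num_rows, file_size = md5(), 0, 0
    for chunk in iter(lambda: infile.read(chunk_size), b''):
        digest.update(chunk)
        file_size += len(chunk)
        num_rows += chunk.count(b'\n')
    return num_rows, file_size, digest.hexdigest()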
Example 10
def test_upload_to_s3():
    with moto.mock_s3():
        s3 = s3fs.S3FileSystem()
        s3.touch('test-bucket')
        with makeNamedTemporaryCSV([
            [u'col1', u'col2'],
            [u'val1_1', u'val1_2'],
            [u'v\xedl2_1', u'val2_2'],
        ]) as filename:
            sample_config = {
                'raw_uploads_path':
                's3://test-bucket/{jurisdiction}/{event_type}/uploaded/{date}/{upload_id}'
            }
            with patch.dict('webapp.utils.app_config', sample_config):
                final_upload_path = upload_path('boone', 'hmis', '123-567-abc')
                upload_to_storage(final_upload_path, filename)

        current_date = date.today().isoformat()
        expected_s3_path = 's3://test-bucket/boone/hmis/uploaded/{}/123-567-abc'.format(
            current_date)
        with open_sesame(expected_s3_path, 'rb') as expected_s3_file:
            content = expected_s3_file.read()
            assert 'val1_1' in content.decode('utf-8')
Example 11
def test_copy_raw_table_to_db():
    # start with an s3 upload path that we assume to exist;
    # given the event type and jurisdiction, we expect the raw data to be
    # copied into a new table with the proper schema and the table name returned
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        with moto.mock_s3():
            s3 = s3fs.S3FileSystem()
            s3.touch('test-bucket')
            full_s3_path = 's3://test-bucket/123-456'
            with open_sesame(full_s3_path, 'wb') as writefile:
                # csv.writer needs a text stream in Python 3; wrap the
                # binary handle and flush before the file closes
                text_handle = io.TextIOWrapper(
                    writefile, encoding='utf-8', newline='')
                writer = csv.writer(text_handle)
                for row in [
                    ['internal_person_id', 'internal_event_id',
                     'location_id', 'full_name', 'birthdate', 'ssn'],
                    ['123456', '456789', 'A345', 'Jack T. Ripper',
                     '1896-04-10', '345-45-6789'],
                    ['123457', '456780', 'A345', 'Jack L. Ripper',
                     '1896-04-10', '345-45-6780'],
                    ['123457', '456780', 'A346', 'Jack L. Ripper',
                     '1896-04-10', '345-45-6780'],
                ]:
                    writer.writerow(row)
                text_handle.flush()
            jurisdiction = 'test'
            event_type = 'test'
            written_raw_table = copy_raw_table_to_db(full_s3_path, event_type,
                                                     '123-456', engine)
            assert sum(1 for _ in engine.execute('select * from "{}"'.format(
                written_raw_table))) == 3
Example 12
def upload_to_storage(full_path, local_filename):
    with open_sesame(full_path, 'wb') as outfile:
        with open_sesame(local_filename, 'rb') as infile:
            outfile.write(infile.read())
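A quick usage sketch with hypothetical paths. Because infile.read() buffers the whole file, a streaming variant via shutil.copyfileobj keeps memory bounded for large uploads:

import shutil

# hypothetical destination and source paths
upload_to_storage(
    's3://test-bucket/boone/hmis_service_stays/uploaded/2018-01-01/123-456',
    '/tmp/hmis_upload.csv')


def upload_to_storage_streaming(full_path, local_filename):
    # same contract as upload_to_storage above, but copies in chunks
    with open_sesame(full_path, 'wb') as outfile:
        with open_sesame(local_filename, 'rb') as infile:
            shutil.copyfileobj(infile, outfile)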