@pytest.fixture()
def sis_api_profiles(app, student_tables):
    # Runs the SIS student API import under mocked S3, then returns the stored profile feeds.
    from nessie.externals import redshift
    from nessie.jobs.import_sis_student_api import ImportSisStudentApi
    with mock_s3(app):
        ImportSisStudentApi().run_wrapped()
    sql = """SELECT sid, feed FROM student_test.sis_api_profiles"""
    return redshift.fetch(sql)
@pytest.fixture()
def sis_api_last_registrations(app, metadata_db, student_tables):
    # Runs the registrations import under mocked S3, then returns the stored last-registration feeds.
    from nessie.externals import redshift
    from nessie.jobs.import_registrations import ImportRegistrations
    with mock_s3(app):
        ImportRegistrations().run_wrapped()
    sql = """SELECT sid, feed FROM student_test.student_last_registrations"""
    return redshift.fetch(sql)
def test_malformed_filenames(self, app, caplog):
    """Copies attachments even when filenames deviate from the expected SID_NOTEID_VERSION.ext pattern."""
    (bucket, source_prefix, dest_prefix) = get_s3_refs(app)
    datestamp = 'all'
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        with mock_s3(app, bucket=bucket) as m3:
            m3.Object(bucket, f'{source_prefix}/2019/08/28/12345678_00012_1_May_7_2019_email.pdf').put(Body=b'extra chars in my name lol')
            m3.Object(bucket, f'{source_prefix}/2019/08/28/23456789_00052_1.png.png').put(Body=b'somehow i got a redundant .ext')
            m3.Object(bucket, f'{source_prefix}/2019/08/29/23456789_00053_1._DEGREE_COMPLETION_LETTER').put(Body=b'original file name mistaken for the .ext')
            m3.Object(bucket, f'{source_prefix}/2019/08/29/34567890_00014_2..7.19_(2)-edited_(1)-2_(1)_(1).xls').put(Body=b'is this a versioning scheme?')

            MigrateSisAdvisingNoteAttachments().run(datestamp=datestamp)

            assert 'Copied 4 attachments to the destination folder.' in caplog.text
            assert object_exists(m3, bucket, f'{dest_prefix}/12345678/12345678_00012_1.pdf')
            assert object_exists(m3, bucket, f'{dest_prefix}/23456789/23456789_00052_1.png')
            assert object_exists(m3, bucket, f'{dest_prefix}/23456789/23456789_00053_1')
            assert object_exists(m3, bucket, f'{dest_prefix}/34567890/34567890_00014_2.xls')
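# A minimal sketch of two helpers used throughout these migration tests,
# assuming m3 is a boto3 S3 resource (as yielded by the mock_s3 context
# manager). The config key names in get_s3_refs are hypothetical; the real
# helpers live elsewhere in the test suite and may differ.
from botocore.exceptions import ClientError


def get_s3_refs(app):
    return (
        app.config['LOCH_S3_BUCKET'],
        app.config['LOCH_S3_ATTACHMENTS_SOURCE_PATH'],  # hypothetical config key
        app.config['LOCH_S3_ATTACHMENTS_DEST_PATH'],  # hypothetical config key
    )


def object_exists(m3, bucket, key):
    # Object.load() issues a HEAD request; a missing key raises ClientError.
    try:
        m3.Object(bucket, key).load()
        return True
    except ClientError:
        return False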
def test_canvas_sync_metadata(self, app, metadata_db):
    """When given a job id, updates metadata on file sync."""
    url = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
    key = 'canvas/sonnet_submission_dim/sonnet-xlv.txt'
    with mock_s3(app):
        with open(_get_fixtures_path() + '/sonnet_xlv.html', 'r') as file:
            responses.add(responses.GET, url, body=file.read(), headers={'Content-Length': '767'})

        # Run two successive sync jobs on the same file. The first succeeds; the second is skipped as a duplicate.
        metadata.create_canvas_sync_status('job_1', 'sonnet-xlv.txt', 'sonnet_submission_dim', url)
        result = SyncFileToS3().run(url=url, key=key, canvas_sync_job_id='job_1')
        assert result is True
        metadata.create_canvas_sync_status('job_2', 'sonnet-xlv.txt', 'sonnet_submission_dim', url)
        result = SyncFileToS3().run(url=url, key=key, canvas_sync_job_id='job_2')
        assert result is False

        schema = app.config['REDSHIFT_SCHEMA_METADATA']
        sync_metadata = redshift.fetch(f'SELECT * FROM {schema}.canvas_sync_job_status')
        snapshot_metadata = redshift.fetch(f'SELECT * FROM {schema}.canvas_synced_snapshots')

        assert len(sync_metadata) == 2
        assert sync_metadata[0]['job_id'] == 'job_1'
        assert sync_metadata[0]['destination_url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
        assert sync_metadata[0]['status'] == 'complete'
        assert sync_metadata[0]['source_size'] == 767
        assert sync_metadata[0]['destination_size'] == 767
        assert sync_metadata[0]['updated_at'] > sync_metadata[0]['created_at']
        assert sync_metadata[1]['job_id'] == 'job_2'
        assert sync_metadata[1]['destination_url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
        assert sync_metadata[1]['status'] == 'duplicate'
        assert sync_metadata[1]['source_size'] is None
        assert sync_metadata[1]['destination_size'] is None
        assert sync_metadata[1]['updated_at'] > sync_metadata[1]['created_at']

        assert len(snapshot_metadata) == 1
        assert snapshot_metadata[0]['filename'] == 'sonnet-xlv.txt'
        assert snapshot_metadata[0]['canvas_table'] == 'sonnet_submission_dim'
        assert snapshot_metadata[0]['url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
        assert snapshot_metadata[0]['size'] == 767
        assert snapshot_metadata[0]['created_at']
        assert snapshot_metadata[0]['deleted_at'] is None
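# A minimal sketch of the _get_fixtures_path helper used above, assuming
# fixture files live in a fixtures/ directory alongside this test module;
# the real helper may resolve the path differently.
import os


def _get_fixtures_path():
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'fixtures')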
@pytest.fixture()
def set_up_to_succeed(app, caplog):
    # Seeds matching source and destination attachments, so verification should pass.
    (bucket, source_prefix, dest_prefix) = get_s3_refs(app)
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        with mock_s3(app, bucket=bucket) as m3:
            m3.Object(bucket, f'{source_prefix}/2017/01/18/12345678_00012_1.pdf').put(Body=b'a note attachment')
            m3.Object(bucket, f'{source_prefix}/2018/12/22/23456789_00003_1.png').put(Body=b'another note attachment')
            m3.Object(bucket, f'{source_prefix}/2019/08/29/34567890_00014_2.xls').put(Body=b'yet another note attachment')
            m3.Object(bucket, f'{dest_prefix}/12345678/12345678_00012_1.pdf').put(Body=b'a note attachment')
            m3.Object(bucket, f'{dest_prefix}/23456789/23456789_00003_1.png').put(Body=b'another note attachment')
            m3.Object(bucket, f'{dest_prefix}/34567890/34567890_00014_2.xls').put(Body=b'yet another note attachment')
            yield
            assert 'No attachments missing on S3 when compared against the view.' in caplog.text
def test_run(self, app, metadata_db):
    """Uploads Canvas grade change logs to S3, then stores feeds in Redshift."""
    with mock_s3(app):
        with override_config(app, 'TEST_CANVAS_COURSE_IDS', [1492459, 1488704, 1491827]):
            result = ImportCanvasGradeChangeLog().run_wrapped()
            assert result
            assert 'Canvas grade change log import completed for term 2178: 3 succeeded, ' in result
            assert '0 failed.' in result
            assert_background_job_status('ImportCanvasGradeChangeLog')
            schema = app.config['RDS_SCHEMA_METADATA']
            count_results = rds.fetch(f'SELECT count(*) FROM {schema}.canvas_api_import_job_status')
            assert count_results[0]['count'] == 3
            canvas_status_results = rds.fetch(f'SELECT DISTINCT status FROM {schema}.canvas_api_import_job_status')
            assert len(canvas_status_results) == 1
            assert canvas_status_results[0]['status'] == 'created'
            sync_results = rds.fetch(f'SELECT * FROM {schema}.canvas_api_import_job_status LIMIT 1')
            assert sync_results[0]['job_id'].startswith('ImportCanvasGradeChangeLog_')
            assert sync_results[0]['course_id'] == '1492459'
            assert sync_results[0]['table_name'] == 'grade_change_log'
            assert sync_results[0]['details'] is None
            assert sync_results[0]['created_at']
            assert sync_results[0]['updated_at']
@pytest.fixture()
def set_up_to_fail(app, caplog):
    # Seeds a destination folder that is missing one source attachment and
    # contains one unexpected file, so verification should raise.
    (bucket, source_prefix, dest_prefix) = get_s3_refs(app)
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        with mock_s3(app, bucket=bucket) as m3:
            m3.Object(bucket, f'{source_prefix}/2017/01/18/12345678_00012_1.pdf').put(Body=b'a note attachment')
            m3.Object(bucket, f'{source_prefix}/2018/12/22/23456789_00003_1.png').put(Body=b'another note attachment')
            m3.Object(bucket, f'{dest_prefix}/12345678/12345678_00012_1.pdf').put(Body=b'a note attachment')
            m3.Object(bucket, f'{dest_prefix}/34567890/34567890_00014_2.xls').put(Body=b'yet another note attachment')
            m3.Object(bucket, f'{dest_prefix}/45678901/45678901_00192_4.xls').put(Body=b'bamboozled by a completely unexpected note attachment')
            with pytest.raises(BackgroundJobError) as e:
                yield
            assert 'Attachments verification found missing attachments or sync failures:' in str(e.value)
            assert "'attachment_sync_failure_count': 1" in str(e.value)
            assert "'missing_s3_attachments_count': 1" in str(e.value)
            assert "'attachment_sync_failures': ['sis-data/sis-sftp/incremental/advising-notes/attachment-files/2018/12/22/23456789_00003_1.png']" in str(e.value)
            assert "'missing_s3_attachments': ['23456789_00003_1.png']" in str(e.value)
            assert 'Attachments missing on S3 when compared against SIS notes views: 1' in caplog.text
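# Hypothetical usage sketch for the two fixtures above, assuming a
# verification job class named VerifySisAdvisingNoteAttachments (not shown in
# this module; the actual class name may differ). Each test body runs inside
# the fixture's mocked-S3 context; set_up_to_fail wraps its yield in
# pytest.raises, so the failing run's BackgroundJobError is asserted by the
# fixture itself.
def test_verification_passes(app, set_up_to_succeed):
    VerifySisAdvisingNoteAttachments().run()


def test_verification_raises(app, set_up_to_fail):
    VerifySisAdvisingNoteAttachments().run()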
def test_sync_canvas_snapshots(self, app, metadata_db, caplog):
    """Dispatches a complete sync job against fixtures."""
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        with mock_s3(app):
            result = SyncCanvasSnapshots().run_wrapped()
        assert 'Canvas snapshot sync job dispatched to workers' in result
        assert_background_job_status('sync')
        assert 'Dispatched S3 sync of snapshot quiz_dim-00000-0ab80c7c.gz' in caplog.text
        assert 'Dispatched S3 sync of snapshot requests-00098-b14782f5.gz' in caplog.text
        assert '311 successful dispatches, 0 failures' in caplog.text

        schema = app.config['RDS_SCHEMA_METADATA']
        count_results = rds.fetch(f'SELECT count(*) FROM {schema}.canvas_sync_job_status')
        assert count_results[0]['count'] == 311
        canvas_status_results = rds.fetch(f'SELECT DISTINCT status FROM {schema}.canvas_sync_job_status')
        assert len(canvas_status_results) == 1
        assert canvas_status_results[0]['status'] == 'created'
        sync_results = rds.fetch(f'SELECT * FROM {schema}.canvas_sync_job_status LIMIT 1')
        assert sync_results[0]['job_id'].startswith('sync_')
        assert sync_results[0]['filename'] == 'account_dim-00000-5eb7ee9e.gz'
        assert sync_results[0]['canvas_table'] == 'account_dim'
        assert 'account_dim/part-00505-5c40f1f3-b611-4f64-a007-67b775e984fe.c000.txt.gz' in sync_results[0]['source_url']
        assert sync_results[0]['destination_url'] is None
        assert sync_results[0]['details'] is None
        assert sync_results[0]['created_at']
        assert sync_results[0]['updated_at']
def test_import_student_photos(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_student_photos import ImportStudentPhotos
    caplog.set_level(logging.DEBUG)
    with capture_app_logs(app):
        with mock_s3(app):
            result = ImportStudentPhotos().run_wrapped()
            assert result == 'Student photo import completed: 1 succeeded, 9 had no photo available, 0 failed.'
            response = s3.get_keys_with_prefix('cal1card-data/photos')
            assert len(response) == 1
            assert response[0] == 'cal1card-data/photos/61889.jpg'
            success_rows = rds.fetch(f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'success'")
            assert len(success_rows) == 1
            assert success_rows[0]['sid'] == '11667051'
            failure_rows = rds.fetch(f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'failure'")
            assert len(failure_rows) == 0
            not_found_rows = rds.fetch(f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'photo_not_found'")
            assert len(not_found_rows) == 9
def test_import_registrations(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_registrations import ImportRegistrations
    rows = redshift.fetch('SELECT * FROM student_test.student_term_gpas')
    assert len(rows) == 0
    rows = redshift.fetch('SELECT * FROM student_test.student_last_registrations')
    assert len(rows) == 0
    caplog.set_level(logging.DEBUG)
    with capture_app_logs(app):
        with mock_s3(app):
            result = ImportRegistrations().run_wrapped()
        assert result == 'Registrations import completed: 2 succeeded, 8 failed.'

        rows = redshift.fetch('SELECT * FROM student_test.student_term_gpas ORDER BY sid')
        assert len(rows) == 11
        for row in rows[0:6]:
            assert row['sid'] == '11667051'
        for row in rows[7:10]:
            assert row['sid'] == '1234567890'
        row_2168 = next(r for r in rows if r['term_id'] == '2168')
        assert row_2168['gpa'] == Decimal('3.000')
        assert row_2168['units_taken_for_gpa'] == Decimal('8.0')

        rows = redshift.fetch('SELECT * FROM student_test.student_last_registrations ORDER BY sid')
        assert len(rows) == 2
        assert rows[0]['sid'] == '11667051'
        assert rows[1]['sid'] == '1234567890'
        feed = json.loads(rows[1]['feed'], strict=False)
        assert feed['term']['id'] == '2172'
        assert feed['academicLevels'][0]['level']['description'] == 'Sophomore'

        rows = redshift.fetch('SELECT * FROM student_test.student_api_demographics ORDER BY sid')
        assert len(rows) == 2
        assert rows[0]['sid'] == '11667051'
        assert rows[1]['sid'] == '1234567890'
        feed = json.loads(rows[1]['feed'], strict=False)
        assert feed['gender']['genderOfRecord']['description'] == 'Female'
def test_run_with_all_param(self, app, caplog):
    """When 'all' is provided, copies all files."""
    (bucket, source_prefix, dest_prefix) = get_s3_refs(app)
    datestamp = 'all'
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        with mock_s3(app, bucket=bucket) as m3:
            m3.Object(bucket, f'{source_prefix}/2019/08/28/12345678_00012_1.pdf').put(Body=b'a note attachment')
            m3.Object(bucket, f'{source_prefix}/2019/08/28/23456789_00003_1.png').put(Body=b'another note attachment')
            m3.Object(bucket, f'{source_prefix}/2019/08/29/34567890_00014_2.xls').put(Body=b'ok to copy me')

            response = MigrateSisAdvisingNoteAttachments().run(datestamp=datestamp)

            assert 'Will copy files from /sis-data/sis-sftp/incremental/advising-notes/attachment-files.' in caplog.text
            assert 'Copied 3 attachments to the destination folder.' in caplog.text
            assert response == 'SIS advising note attachment migration complete for sis-data/sis-sftp/incremental/advising-notes/attachment-files.'
            assert object_exists(m3, bucket, f'{dest_prefix}/12345678/12345678_00012_1.pdf')
            assert object_exists(m3, bucket, f'{dest_prefix}/23456789/23456789_00003_1.png')
            assert object_exists(m3, bucket, f'{dest_prefix}/34567890/34567890_00014_2.xls')
def test_import_degree_progress(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_degree_progress import ImportDegreeProgress
    with mock_s3(app):
        result = ImportDegreeProgress().run_wrapped()
        assert result == 'SIS degree progress API import job completed: 1 succeeded, 9 returned no information, 0 failed.'
        rows = redshift.fetch(f"SELECT * FROM {student_schema()}.{student_schema_table('degree_progress')}")
        assert len(rows) == 1
        assert rows[0]['sid'] == '11667051'
        feed = json.loads(rows[0]['feed'])
        assert feed['requirements']['entryLevelWriting']['status'] == 'Satisfied'
def test_update_manifests(self, app):
    """Updates manifests in S3."""
    from nessie.jobs.create_sis_schema import CreateSisSchema
    with mock_s3(app):
        daily_path = get_s3_sis_daily_path()
        historical_path = app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical'
        self._upload_data_to_s3(daily_path, historical_path)
        assert CreateSisSchema().update_manifests()
        self._assert_complete_manifest(app, daily_path, historical_path)
def test_fallback_update_manifests(self, app):
    """Uses yesterday's news if today's is unavailable."""
    from nessie.jobs.create_sis_schema import CreateSisSchema
    with mock_s3(app):
        yesterday = datetime.now() - timedelta(days=1)
        daily_path = get_s3_sis_daily_path(yesterday)
        historical_path = app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical'
        self._upload_data_to_s3(daily_path, historical_path)
        assert CreateSisSchema().update_manifests()
        self._assert_complete_manifest(app, daily_path, historical_path)
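# A minimal sketch of the _upload_data_to_s3 helper (a method on the test
# class) shared by the manifest tests, inferred from the inline variant of
# test_fallback_update_manifests later in this module. The real helper
# evidently also writes term-specific files (the abort test below deletes
# enrollments-2178.gz), and _assert_complete_manifest presumably performs the
# manifest checks shown inline in that later variant; treat these payloads
# and key names as illustrative.
def _upload_data_to_s3(self, daily_path, historical_path):
    s3.upload_data('some new course data', f'{daily_path}/courses/courses-aaa.gz')
    s3.upload_data('some more new course data', f'{daily_path}/courses/courses-bbb.gz')
    s3.upload_data('some new enrollment data', f'{daily_path}/enrollments/enrollments-ccc.gz')
    s3.upload_data('some old course data', f'{historical_path}/courses/courses-ddd.gz')
    s3.upload_data('some old enrollment data', f'{historical_path}/enrollments/enrollments-eee.gz')
    s3.upload_data('some perfectly antique enrollment data', f'{historical_path}/enrollments/enrollments-fff.gz')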
def test_run_with_no_param(self, mock_datetime, app, caplog, metadata_db, prior_job_status):
    """When no parameter is provided, copies new files since the last successful run."""
    (bucket, source_prefix, dest_prefix) = get_s3_refs(app)
    mock_datetime.utcnow.return_value = datetime(year=2019, month=8, day=29, hour=5, minute=21)
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        with mock_s3(app, bucket=bucket) as m3:
            m3.Object(bucket, f'{source_prefix}/2019/08/25/45678912_00027_1.pdf').put(Body=b"i've already been copied")
            m3.Object(bucket, f'{source_prefix}/2019/08/26/12345678_00012_1.pdf').put(Body=b'a note attachment')
            m3.Object(bucket, f'{source_prefix}/2019/08/28/23456789_00003_1.png').put(Body=b'another note attachment')
            m3.Object(bucket, f'{source_prefix}/2019/08/29/34567890_00014_2.xls').put(Body=b"don't copy me")

            response = MigrateSisAdvisingNoteAttachments().run()

            assert 'Will copy files from /sis-data/sis-sftp/incremental/advising-notes/attachment-files/2019/08/25.' not in caplog.text
            assert 'Will copy files from /sis-data/sis-sftp/incremental/advising-notes/attachment-files/2019/08/26.' in caplog.text
            assert 'Will copy files from /sis-data/sis-sftp/incremental/advising-notes/attachment-files/2019/08/27.' in caplog.text
            assert 'Will copy files from /sis-data/sis-sftp/incremental/advising-notes/attachment-files/2019/08/28.' in caplog.text
            assert 'Will copy files from /sis-data/sis-sftp/incremental/advising-notes/attachment-files/2019/08/29.' not in caplog.text
            assert 'Copied 1 attachments to the destination folder.' in caplog.text
            assert 'Copied 0 attachments to the destination folder.' in caplog.text
            assert response == (
                'SIS advising note attachment migration complete for '
                'sis-data/sis-sftp/incremental/advising-notes/attachment-files/2019/08/26, '
                'sis-data/sis-sftp/incremental/advising-notes/attachment-files/2019/08/27, '
                'sis-data/sis-sftp/incremental/advising-notes/attachment-files/2019/08/28.'
            )
            assert not object_exists(m3, bucket, f'{dest_prefix}/45678912/45678912_00027_1.xls')
            assert object_exists(m3, bucket, f'{dest_prefix}/12345678/12345678_00012_1.pdf')
            assert object_exists(m3, bucket, f'{dest_prefix}/23456789/23456789_00003_1.png')
            assert not object_exists(m3, bucket, f'{dest_prefix}/34567890/34567890_00014_2.xls')
def test_aborts_on_missing_term(self, app, caplog):
    """Aborts with an error when an expected term enrollment file is missing."""
    from nessie.jobs.create_sis_schema import CreateSisSchema
    with mock_s3(app):
        daily_path = get_s3_sis_daily_path()
        historical_path = app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical'
        self._upload_data_to_s3(daily_path, historical_path)
        s3.delete_objects([f'{daily_path}/enrollments/enrollments-2178.gz'])
        with capture_app_logs(app):
            with pytest.raises(BackgroundJobError) as e:
                CreateSisSchema().update_manifests()
            assert 'Expected filename enrollments-2178.gz not found in S3, aborting' in str(e.value)
def test_import_sis_student_api_v1(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_sis_student_api import ImportSisStudentApi
    with override_config(app, 'STUDENT_V1_API_PREFERRED', True):
        with mock_s3(app):
            result = ImportSisStudentApi().run_wrapped()
            assert result == 'SIS student API V1 import job completed: 4 succeeded, 6 failed.'
            rows = redshift.fetch('SELECT * FROM student_test.sis_api_profiles_v1 ORDER BY sid')
            assert len(rows) == 4
            assert rows[0]['sid'] == '11667051'
            assert rows[1]['sid'] == '1234567890'
            assert rows[2]['sid'] == '2345678901'
            assert rows[3]['sid'] == '5000000000'
            feed = json.loads(rows[0]['feed'], strict=False)
            assert feed['names'][0]['familyName'] == 'Bear'
def test_canvas_sync_metadata(self, app, metadata_db):
    """Makes an API call and puts the result in S3."""
    with mock_s3(app):
        bucket = app.config['LOCH_S3_BUCKET']
        path = '/api/v1/audit/grade_change/courses/7654321'
        s3_key = f'{bucket}/grade_change_log/grade_change_log_7654321'
        result = ImportCanvasApiData().run_wrapped(
            course_id='7654321',
            path=path,
            s3_key=s3_key,
            job_id='ImportCanvasGradeChangeLog_123',
        )
        assert result is True
        # The job appends a numeric suffix to the key; the first result file is _0.json.
        assert s3.object_exists(f'{s3_key}_0.json') is True
def test_import_sis_student_api(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_sis_student_api import ImportSisStudentApi
    with mock_s3(app):
        result = ImportSisStudentApi().run_wrapped()
        assert result == 'SIS student API import job completed: 2 succeeded, 6 failed.'
        rows = redshift.fetch('SELECT * FROM student_test.sis_api_profiles ORDER BY sid')
        assert len(rows) == 2
        assert rows[0]['sid'] == '11667051'
        assert rows[1]['sid'] == '2345678901'
        feed = json.loads(rows[0]['feed'], strict=False)
        assert feed['names'][0]['familyName'] == 'Bear'
def test_import_term_gpas(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_term_gpas import ImportTermGpas
    caplog.set_level(logging.DEBUG)
    with capture_app_logs(app):
        with mock_s3(app):
            result = ImportTermGpas().run_wrapped()
        assert result == 'Term GPA import completed: 1 succeeded, 0 returned no registrations, 7 failed.'
        rows = redshift.fetch('SELECT * FROM student_test.student_term_gpas')
        assert len(rows) == 7
        for row in rows:
            assert row['sid'] == '11667051'
        row_2178 = next(r for r in rows if r['term_id'] == '2178')
        assert row_2178['gpa'] == Decimal('3.000')
        assert row_2178['units_taken_for_gpa'] == Decimal('8.0')
def test_import_sis_enrollments_api(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_sis_enrollments_api import ImportSisEnrollmentsApi
    with mock_s3(app):
        result = ImportSisEnrollmentsApi().run_wrapped()
        assert result == 'SIS enrollments API import completed for term 2178: 1 succeeded, 7 returned no enrollments, 0 failed.'
        rows = redshift.fetch('SELECT * FROM student_test.sis_api_drops_and_midterms')
        assert len(rows) == 1
        assert rows[0]['sid'] == '11667051'
        feed = json.loads(rows[0]['feed'])
        assert feed['droppedPrimarySections'][0]['displayName'] == 'MUSIC 41C'
        assert feed['droppedPrimarySections'][0]['component'] == 'TUT'
        assert feed['droppedPrimarySections'][0]['sectionNumber'] == '002'
        assert feed['midtermGrades']['90100'] == 'D+'
def test_fallback_update_manifests(self, app):
    """Uses yesterday's news if today's is unavailable."""
    with mock_s3(app):
        yesterday = datetime.now() - timedelta(days=1)
        daily_path = get_s3_sis_daily_path(yesterday)
        historical_path = app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical'
        manifest_path = app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests'
        s3.upload_data('some new course data', f'{daily_path}/courses/courses-aaa.gz')
        s3.upload_data('some more new course data', f'{daily_path}/courses/courses-bbb.gz')
        s3.upload_data('some new enrollment data', f'{daily_path}/enrollments/enrollments-ccc.gz')
        s3.upload_data('some old course data', f'{historical_path}/courses/courses-ddd.gz')
        s3.upload_data('some old enrollment data', f'{historical_path}/enrollments/enrollments-eee.gz')
        s3.upload_data('some perfectly antique enrollment data', f'{historical_path}/enrollments/enrollments-fff.gz')
        assert CreateSisSchema().update_manifests()

        courses_manifest = json.loads(s3.get_object_text(manifest_path + '/courses.json'))
        assert len(courses_manifest['entries']) == 3
        assert courses_manifest['entries'][0]['url'] == f's3://{app.config["LOCH_S3_BUCKET"]}/{daily_path}/courses/courses-aaa.gz'
        assert courses_manifest['entries'][0]['meta']['content_length'] == 20

        enrollments_manifest = json.loads(s3.get_object_text(manifest_path + '/enrollments.json'))
        assert len(enrollments_manifest['entries']) == 3
        assert enrollments_manifest['entries'][2]['url'] == f's3://{app.config["LOCH_S3_BUCKET"]}/{historical_path}/enrollments/enrollments-fff.gz'
        assert enrollments_manifest['entries'][2]['meta']['content_length'] == 38
def test_import_sis_student_api(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_sis_student_api import ImportSisStudentApi
    initial_rows = redshift.fetch('SELECT * FROM student_test.sis_api_profiles ORDER BY sid')
    assert len(initial_rows) == 0
    with mock_s3(app):
        result = ImportSisStudentApi().run_wrapped()
        assert result == 'SIS student API import job completed: 3 succeeded, 7 failed.'
        rows = redshift.fetch('SELECT * FROM student_test.sis_api_profiles ORDER BY sid')
        assert len(rows) == 3
        assert rows[0]['sid'] == '11667051'
        feed = json.loads(rows[0]['feed'], strict=False)
        assert feed['names'][0]['familyName'] == 'Bear'
        assert feed['registrations'][0]['term']['id'] == '2178'
        assert rows[1]['sid'] == '1234567890'
        feed = json.loads(rows[1]['feed'], strict=False)
        # Needed to test proper sis_profile merging of last_registrations table.
        assert not feed.get('registrations')
        assert rows[2]['sid'] == '2345678901'
        feed = json.loads(rows[2]['feed'], strict=False)
        assert feed['registrations'][0]['term']['id'] == '2178'
def test_import_registrations_batch_mode(self, app, metadata_db, student_tables, caplog):
    """In batch mode, cycles through the least-recently-imported SIDs."""
    from nessie.jobs.import_registrations import ImportRegistrations
    with mock_s3(app):
        ImportRegistrations().run_wrapped()
        rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
        assert len(rows) == 10
        with override_config(app, 'CYCLICAL_API_IMPORT_BATCH_SIZE', 9):

            def _success_history_after_batch_import():
                result = ImportRegistrations().run_wrapped(load_mode='batch')
                assert result == 'Registrations import completed: 1 succeeded, 8 failed.'
                rows = rds.fetch("SELECT * FROM nessie_metadata_test.registration_import_status WHERE status = 'success' ORDER BY updated_at")
                assert len(rows) == 2
                assert rows[0]['updated_at'] < rows[1]['updated_at']
                return (rows[0]['sid'], rows[1]['sid'])

            sid_1, sid_2 = _success_history_after_batch_import()
            assert _success_history_after_batch_import() == (sid_2, sid_1)
            assert _success_history_after_batch_import() == (sid_1, sid_2)
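# Design note on the batch-mode assertions above: with a batch size of 9 and
# 10 SIDs on record, each run targets the 9 least-recently-imported SIDs and
# skips the most recent success. Of the two importable SIDs, whichever
# succeeded last run is excluded this run, so the pair swaps positions in the
# success history on every run, giving the alternating (sid_2, sid_1),
# (sid_1, sid_2) sequence the test verifies.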
def test_list_keys_matching_prefix(self, app):
    """Lists keys matching prefix."""
    bucket = app.config['LOCH_S3_BUCKET']
    prefix = app.config['LOCH_S3_CANVAS_DATA_PATH_CURRENT_TERM'] + '/requests'
    with mock_s3(app) as m:
        m.Object(bucket, f'{prefix}/requests-aaa.gz').put(Body=b'some data')
        m.Object(bucket, f'{prefix}/requests-bbb.gz').put(Body=b'some more data')
        m.Object(bucket, f'{prefix}/requests-ccc.gz').put(Body=b'yet more data')
        m.Object(bucket, 'another-prefix/requests-ddd.gz').put(Body=b'utterly unrelated data')

        response = s3.get_keys_with_prefix(prefix)
        assert len(response) == 3
        assert f'{prefix}/requests-aaa.gz' in response
        assert f'{prefix}/requests-bbb.gz' in response
        assert f'{prefix}/requests-ccc.gz' in response
def test_first_time_run_with_no_param(self, mock_datetime, app, caplog, metadata_db):
    """When no parameter is provided and there is no prior successful run, copies all files."""
    (bucket, source_prefix, dest_prefix) = get_s3_refs(app)
    mock_datetime.utcnow.return_value = datetime(year=2019, month=8, day=29, hour=5, minute=21)
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        with mock_s3(app, bucket=bucket) as m3:
            m3.Object(bucket, f'{source_prefix}/2019/08/28/12345678_00012_1.pdf').put(Body=b'a note attachment')
            m3.Object(bucket, f'{source_prefix}/2019/08/28/23456789_00003_1.png').put(Body=b'another note attachment')
            m3.Object(bucket, f'{source_prefix}/2019/08/29/34567890_00014_2.xls').put(Body=b'ok to copy me')

            response = MigrateSisAdvisingNoteAttachments().run()

            assert 'Will copy files from /sis-data/sis-sftp/incremental/advising-notes/attachment-files/.' in caplog.text
            assert 'Copied 3 attachments to the destination folder.' in caplog.text
            assert response == 'SIS advising note attachment migration complete for sis-data/sis-sftp/incremental/advising-notes/attachment-files/.'
            assert object_exists(m3, bucket, f'{dest_prefix}/12345678/12345678_00012_1.pdf')
            assert object_exists(m3, bucket, f'{dest_prefix}/23456789/23456789_00003_1.png')
            assert object_exists(m3, bucket, f'{dest_prefix}/34567890/34567890_00014_2.xls')
def test_run_with_invalid_param(self, app, caplog):
    """When an invalid value is provided, the job completes but copies zero files."""
    (bucket, source_prefix, dest_prefix) = get_s3_refs(app)
    datestamp = 'wrong!#$&'
    caplog.set_level(logging.INFO)
    with capture_app_logs(app):
        with mock_s3(app, bucket=bucket) as m3:
            m3.Object(bucket, f'{source_prefix}/2019/08/28/12345678_00012_1.pdf').put(Body=b'a note attachment')

            response = MigrateSisAdvisingNoteAttachments().run(datestamp=datestamp)

            assert 'Will copy files from /sis-data/sis-sftp/incremental/advising-notes/attachment-files/wrong!#$&.' in caplog.text
            assert 'Copied 0 attachments to the destination folder.' in caplog.text
            assert response == 'SIS advising note attachment migration complete for sis-data/sis-sftp/incremental/advising-notes/attachment-files/wrong!#$&.'
            assert not object_exists(m3, bucket, f'{dest_prefix}/12345678/12345678_00012_1.pdf')
def test_metadata_tracked(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_registrations import ImportRegistrations
    rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
    assert len(rows) == 0
    caplog.set_level(logging.DEBUG)
    with capture_app_logs(app):
        with mock_s3(app):
            ImportRegistrations().run_wrapped()
            rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
            assert len(rows) == 10
            assert len([r for r in rows if r['status'] == 'failure']) == 8
            assert next(r['status'] for r in rows if r['sid'] == '11667051') == 'success'

            result = ImportRegistrations().run_wrapped()
            assert result == 'Registrations import completed: 0 succeeded, 8 failed.'
            result = ImportRegistrations().run_wrapped(load_mode='all')
            assert result == 'Registrations import completed: 2 succeeded, 8 failed.'

            rds.execute("DELETE FROM nessie_metadata_test.registration_import_status WHERE sid = '11667051'")
            result = ImportRegistrations().run_wrapped()
            assert result == 'Registrations import completed: 1 succeeded, 8 failed.'
            assert next(r['status'] for r in rows if r['sid'] == '11667051') == 'success'

            rds.execute("UPDATE nessie_metadata_test.registration_import_status SET status='failure' WHERE sid = '11667051'")
            result = ImportRegistrations().run_wrapped()
            assert result == 'Registrations import completed: 1 succeeded, 8 failed.'
            assert next(r['status'] for r in rows if r['sid'] == '11667051') == 'success'
def test_generate_demographics_feeds(self, app, student_tables):
    """Builds JSON feeds and uploads to S3."""
    from nessie.jobs.create_edl_schema import CreateEdlSchema
    with override_config(app, 'FEATURE_FLAG_ENTERPRISE_DATA_LAKE', True):
        with mock_s3(app):
            CreateEdlSchema().generate_demographics_feeds()
            rows = redshift.fetch(f'SELECT * FROM {student_schema()}.student_demographics')
            assert len(rows) == 11

            assert rows[0]['sid'] == '11667051'
            feed = json.loads(rows[0]['feed'])
            assert feed['gender'] == 'Female'
            assert feed['ethnicities'] == ['African-American / Black', 'Chinese / Chinese-American', 'East Indian / Pakistani']
            assert feed['nationalities'] == ['Singapore']
            assert feed['underrepresented'] is True
            assert feed['visa']['visa_type'] == 'PR'
            assert feed['visa']['visa_status'] == 'A'

            assert rows[1]['sid'] == '1234567890'
            feed = json.loads(rows[1]['feed'])
            assert feed['gender'] == 'Male'
            assert feed['ethnicities'] == ['Mexican / Mexican-American / Chicano', 'White']
            assert feed['nationalities'] == ['Iran (Islamic Republic Of)']
            assert feed['underrepresented'] is True
            assert feed['visa']['visa_type'] == 'F1'
            assert feed['visa']['visa_status'] == 'A'

            assert rows[2]['sid'] == '2345678901'
            feed = json.loads(rows[2]['feed'])
            assert feed['gender'] == 'Female'
            assert feed['ethnicities'] == ['White']
            assert feed['nationalities'] == ['Taiwan']
            assert feed['underrepresented'] is False
            assert feed['visa']['visa_type'] is None
            assert feed['visa']['visa_status'] is None

            assert rows[3]['sid'] == '3456789012'
            feed = json.loads(rows[3]['feed'])
            assert feed['gender'] == 'Decline to State'
            assert feed['ethnicities'] == ['American Indian / Alaska Native', 'Filipino / Filipino-American']
            assert feed['nationalities'] == ['Korea, Republic of']
            assert feed['underrepresented'] is True
            assert feed['visa']['visa_type'] == 'J1'
            assert feed['visa']['visa_status'] == 'G'

            assert rows[4]['sid'] == '5000000000'
            feed = json.loads(rows[4]['feed'])
            assert feed['gender'] == 'Female'
            assert feed['ethnicities'] == ['Not Specified']
            assert feed['nationalities'] == []
            assert feed['underrepresented'] is False
            assert feed['visa']['visa_type'] is None
            assert feed['visa']['visa_status'] is None

            assert rows[7]['sid'] == '8901234567'
            feed = json.loads(rows[7]['feed'])
            assert feed['gender'] == 'Decline to State'
            assert feed['ethnicities'] == ['Not Specified']
            assert feed['nationalities'] == []
            assert feed['underrepresented'] is False
            assert feed['visa']['visa_type'] is None
            assert feed['visa']['visa_status'] is None

            assert rows[9]['sid'] == '9000000000'
            feed = json.loads(rows[9]['feed'])
            assert feed['gender'] == 'Nonbinary'
            assert feed['ethnicities'] == ['African-American / Black', 'Other Asian', 'Pacific Islander']
            assert feed['nationalities'] == ["Lao People's Democratic Rep", 'Saint Kitts and Nevis']
            assert feed['underrepresented'] is True
            assert feed['visa']['visa_type'] is None
            assert feed['visa']['visa_status'] is None
def test_resync_canvas_snapshots(self, app, metadata_db, caplog):
    """Dispatches a complete resync job against fixtures."""
    caplog.set_level(logging.INFO)
    snapshots = canvas_data.get_snapshots()['files']

    def mock_metadata(job_id, snapshot, status, destination_size):
        metadata.create_canvas_sync_status(job_id, snapshot['filename'], snapshot['table'], snapshot['url'])
        key = '/'.join([get_s3_canvas_daily_path(), snapshot['table'], snapshot['filename']])
        metadata.update_canvas_sync_status(job_id, key, status, source_size=1048576, destination_size=destination_size)

    old_sync_job = 'sync_152550000'
    latest_sync_job = 'sync_152560000'

    # The older job should be ignored by the resync.
    for snapshot in snapshots[0:5]:
        mock_metadata(old_sync_job, snapshot, 'complete', 1048576)
    for snapshot in snapshots[5:10]:
        mock_metadata(old_sync_job, snapshot, 'error', None)

    # The latest job synced five files successfully and ran into three problems.
    for snapshot in snapshots[10:15]:
        mock_metadata(latest_sync_job, snapshot, 'complete', 1048576)
    stalled = snapshots[15]
    errored = snapshots[16]
    size_discrepancy = snapshots[17]
    mock_metadata(latest_sync_job, stalled, 'streaming', None)
    mock_metadata(latest_sync_job, errored, 'error', None)
    mock_metadata(latest_sync_job, size_discrepancy, 'complete', 65536)

    schema = app.config['RDS_SCHEMA_METADATA']
    with capture_app_logs(app):
        assert rds.fetch(f'SELECT count(*) FROM {schema}.canvas_sync_job_status')[0]['count'] == 18
        with mock_s3(app):
            result = ResyncCanvasSnapshots().run_wrapped()
        assert 'Canvas snapshot resync job dispatched to workers' in result
        assert_background_job_status('resync')
        assert f"Dispatched S3 resync of snapshot {stalled['filename']}" in caplog.text
        assert f"Dispatched S3 resync of snapshot {errored['filename']}" in caplog.text
        assert f"Dispatched S3 resync of snapshot {size_discrepancy['filename']}" in caplog.text
        assert '3 successful dispatches, 0 failures' in caplog.text
        assert rds.fetch(f'SELECT count(*) FROM {schema}.canvas_sync_job_status')[0]['count'] == 21

        resync_results = rds.fetch(f"SELECT * FROM {schema}.canvas_sync_job_status WHERE job_id LIKE 'resync%'")
        assert len(resync_results) == 3
        urls = []
        for r in resync_results:
            assert r['job_id'].startswith('resync_')
            assert r['filename']
            assert r['canvas_table']
            assert r['created_at']
            assert r['updated_at']
            urls.append(r['source_url'])
        assert stalled['url'] in urls
        assert errored['url'] in urls
        assert size_discrepancy['url'] in urls