def _import_calnet_attributes(advisor_ids):
    """Fetch advisor attributes from CalNet LDAP by CSID, stash them as a TSV in S3,
    and COPY them into the Redshift advisor_attributes table.

    :param advisor_ids: iterable of advisor CSIDs to look up in CalNet.
    :return: truthy result of the Redshift COPY on success, falsy on copy failure.
    :raises BackgroundJobError: if the S3 upload fails.
    """
    calnet_attributes = calnet.client(app).search_csids(advisor_ids)
    calnet_row_count = len(calnet_attributes)
    # CalNet may not know every CSID we asked for; log the difference but continue.
    if len(advisor_ids) != calnet_row_count:
        ldap_csids = [person['csid'] for person in calnet_attributes]
        missing = set(advisor_ids) - set(ldap_csids)
        app.logger.warning(f'Looked for {len(advisor_ids)} advisor CSIDs but only found {calnet_row_count} : missing {missing}')
    advisor_rows = []
    for index, a in enumerate(calnet_attributes):
        sid = a['csid']
        app.logger.info(f'CalNet import: Fetch attributes of advisor {sid} ({index + 1} of {calnet_row_count})')
        first_name, last_name = calnet.split_sortable_name(a)
        # Column order must match the advisor_attributes table definition.
        data = [
            a['uid'],
            sid,
            first_name,
            last_name,
            a['title'],
            calnet.get_dept_code(a),
            a['email'],
            a['campus_email'],
        ]
        advisor_rows.append(encoded_tsv_row(data))
    s3_key = f'{get_s3_calnet_daily_path()}/advisors/advisors.tsv'
    app.logger.info(f'Will stash {len(advisor_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(advisor_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    # Truncate-then-COPY: the table is fully replaced by the freshly uploaded TSV.
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_advisor_internal}.advisor_attributes;
        COPY {redshift_schema_advisor_internal}.advisor_attributes
            FROM '{loch_s3_calnet_data_path}/advisors/advisors.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    was_successful = redshift.execute(query)
    app.logger.info('Advisor attributes imported.' if was_successful else 'Error on Redshift copy: aborting job.')
    return was_successful
def run(self):
    """Recreate the external CalNet schema in Redshift from templated DDL.

    :return: completion message on success.
    :raises BackgroundJobError: if the DDL script fails.
    """
    app.logger.info('Starting CalNet schema creation job...')
    schema = app.config['REDSHIFT_SCHEMA_CALNET']
    # Drop first so the template can recreate the external schema from scratch.
    redshift.drop_external_schema(schema)
    bucket = app.config['LOCH_S3_BUCKET']
    data_path = app.config['LOCH_S3_CALNET_DATA_PATH']
    snapshot_path = f's3://{bucket}/{data_path}/sids'
    ddl = resolve_sql_template(
        'create_calnet_schema.template.sql',
        sid_snapshot_path=snapshot_path,
    )
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError('CalNet schema creation job failed.')
    verify_external_schema(schema, ddl)
    return 'CalNet schema creation job completed.'
def refresh_current_term_index(self):
    """Recompute the current term and the nearest future term whose advance
    enrollment period has begun, then rewrite the RDS current_term_index table.
    """
    today = datetime.now(pytz.utc).astimezone(
        pytz.timezone(app.config['TIMEZONE'])).date()
    current_term = self.get_sis_current_term(today)
    if current_term:
        term_id = current_term['term_id']
        # Check if the advance enrollment period has started for the next two upcoming terms.
        future_term_id = term_id
        for _ in range(2):
            term_id = next_term_id(term_id)
            term = self.get_sis_term_for_id(term_id)
            # Days before term start that enrollment opens, keyed on the term id's
            # last digit — presumably 2=spring, 5=summer, 8=fall; confirm against
            # the SIS term-code convention.
            advance_enrollment_period = 0
            if term_id[3] == '2':
                advance_enrollment_period = 95
            elif term_id[3] == '5':
                advance_enrollment_period = 124
            elif term_id[3] == '8':
                advance_enrollment_period = 140
            if term['term_begins'] - timedelta(
                    days=advance_enrollment_period) < today:
                future_term_id = term_id
        with rds.transaction() as transaction:
            # Single-row table: truncate then insert the fresh values.
            transaction.execute(
                f'TRUNCATE {rds_schema}.current_term_index')
            columns = ['current_term_name', 'future_term_name']
            values = tuple([
                current_term['term_name'],
                term_name_for_sis_id(future_term_id),
            ])
            # NOTE(review): interpolating the Python tuple repr into SQL relies on
            # repr() producing valid quoting; a term name containing a quote would
            # break the statement — consider bind parameters.
            if transaction.execute(
                f'INSERT INTO {rds_schema}.current_term_index ({", ".join(columns)}) VALUES {values} '
            ):
                transaction.commit()
            else:
                transaction.rollback()
                raise BackgroundJobError(
                    'Error refreshing RDS current term index.')
def refresh_student_enrollment_term(self, term_id, enrollment_term_map):
    """Write per-SID enrollment-term feeds to a temp TSV, load the file into the
    staging table, then refresh the destination table within one transaction.

    :param term_id: SIS term id all feeds belong to.
    :param enrollment_term_map: mapping of SID -> term feed (JSON-serializable).
    :raises BackgroundJobError: if the final transaction commit fails.
    """
    with tempfile.TemporaryFile() as feed_file:
        for sid, term_feed in enrollment_term_map.items():
            row = encoded_tsv_row([sid, term_id, json.dumps(term_feed)])
            feed_file.write(row + b'\n')
        # Clear any previously staged rows for this term before uploading the fresh file.
        drop_staged_enrollment_term(term_id)
        write_file_to_staging('student_enrollment_terms', feed_file, len(enrollment_term_map), term_id)
    with redshift.transaction() as transaction:
        refresh_from_staging('student_enrollment_terms', term_id, None, transaction, truncate_staging=False)
        if not transaction.commit():
            raise BackgroundJobError(f'Final transaction commit failed on enrollment term refresh (term_id={term_id}).')
def run(self):
    """Build the SIS Advising Notes schemas, falling back to yesterday's S3 drop
    when today's advising-notes extract has not yet landed.

    :return: completion message on success.
    :raises BackgroundJobError: when neither today's nor yesterday's data exists.
    """
    app.logger.info('Starting SIS Advising Notes schema creation job...')
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    notes_path = get_s3_sis_sysadm_daily_path()
    if not s3.get_keys_with_prefix(f'{notes_path}/advising-notes', bucket=bucket):
        # The sysadm copy runs on its own schedule and may predate midnight.
        notes_path = get_s3_sis_sysadm_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(f'{notes_path}/advising-notes', bucket=bucket):
            raise BackgroundJobError('No timely SIS advising notes data found, aborting')
        app.logger.info('Falling back to yesterday\'s SIS advising notes data')
    app.logger.info('Executing SQL...')
    schema = app.config['REDSHIFT_SCHEMA_SIS_ADVISING_NOTES']
    redshift.drop_external_schema(schema)
    self.create_historical_tables(schema)
    self.create_internal_schema(schema, notes_path)
    app.logger.info('Redshift schema created. Creating RDS indexes...')
    self.create_indexes()
    app.logger.info('RDS indexes created.')
    return 'SIS Advising Notes schema creation job completed.'
def generate_feeds(self):
    """Generate non-advisee profile and enrollment tables in Redshift, then
    refresh the RDS index copies when both loads produced data.

    :return: summary message with profile and enrollment counts.
    :raises BackgroundJobError: when the RDS refresh fails.
    """
    sids = [row['sid'] for row in queries.get_fetched_non_advisees()]
    profile_count = self.generate_student_profile_table(sids)
    enrollment_count = self.generate_student_enrollments_table(sids)
    if not (profile_count and enrollment_count):
        # Nothing landed in Redshift, so there is nothing to mirror into RDS.
        app.logger.warning(
            'No non-advisee data loaded into Redshift; will not refresh RDS copies.'
        )
    else:
        rds_ddl = resolve_sql_template(
            'update_rds_indexes_student_profiles_hist_enr.template.sql')
        if not rds.execute(rds_ddl):
            raise BackgroundJobError(
                'Failed to refresh RDS copies of non-advisee data.')
        app.logger.info('RDS indexes updated.')
    return f'Generated {profile_count} non-advisee profiles, {enrollment_count} enrollments.'
def create_lrs_caliper_relationalize_job(self):
    """Create the Glue ETL job that relationalizes LRS Canvas Caliper data.

    :return: True when the Glue API response carries a job name; False when a
        response arrives without one.
    :raises BackgroundJobError: when the Glue API returns no response at all.
    """
    job_name = app.config['LRS_CANVAS_GLUE_JOB_NAME']
    glue_role = app.config['LRS_GLUE_SERVICE_ROLE']
    job_command = {
        'Name': 'glueetl',
        'ScriptLocation': 's3://{}/{}'.format(
            app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET'],
            app.config['LRS_CANVAS_GLUE_JOB_SCRIPT_PATH'],
        ),
    }
    default_arguments = {
        '--LRS_INCREMENTAL_TRANSIENT_BUCKET': app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET'],
        '--LRS_CANVAS_CALIPER_SCHEMA_PATH': app.config['LRS_CANVAS_CALIPER_SCHEMA_PATH'],
        '--LRS_CANVAS_CALIPER_INPUT_DATA_PATH': app.config['LRS_CANVAS_CALIPER_INPUT_DATA_PATH'],
        '--LRS_GLUE_TEMP_DIR': app.config['LRS_GLUE_TEMP_DIR'],
        '--LRS_CANVAS_CALIPER_EXPLODE_OUTPUT_PATH': app.config['LRS_CANVAS_CALIPER_EXPLODE_OUTPUT_PATH'],
        # Bookmarks disabled: every run reprocesses the full input set.
        '--job-bookmark-option': 'job-bookmark-disable',
    }
    response = glue.create_glue_job(job_name, glue_role, job_command, default_arguments)
    if not response:
        raise BackgroundJobError('Failed to create Glue job.')
    # .get avoids a KeyError when the response lacks 'Name'; that case falls
    # through to the False return, which the original code implied but could
    # never reach via a missing key.
    if response.get('Name'):
        app.logger.info(f'Response : {response}')
        # Bug fix: log the job name itself, not the entire API response.
        app.logger.info(f"Glue Job created successfully with Job Name : {response['Name']}")
        return True
    return False
def unload_to_etl(self, schema, bucket, timestamped=True):
    """UNLOAD LRS statements from Redshift to S3, then verify the unloaded count.

    :param schema: Redshift schema containing the statements table.
    :param bucket: destination S3 bucket name.
    :param timestamped: when True, write under a date-partitioned, timestamped
        key prefix; otherwise write to the fixed 'statements' prefix.
    :raises BackgroundJobError: if the UNLOAD statement fails.
    """
    s3_url = 's3://' + bucket + '/' + app.config['LRS_CANVAS_INCREMENTAL_ETL_PATH_REDSHIFT']
    if timestamped:
        # %Y/%m/%d yields a date-partitioned layout; the rest of the format
        # embeds the full timestamp in the filename prefix.
        s3_url += '/' + localize_datetime(datetime.now()).strftime('%Y/%m/%d/statements_%Y%m%d_%H%M%S_')
    else:
        s3_url += '/statements'
    redshift_iam_role = app.config['REDSHIFT_IAM_ROLE']
    # NOTE(review): the DELIMITER AS ' ' literal appears whitespace-mangled in
    # this copy of the file; confirm against version control whether the
    # delimiter character is a space or a tab.
    if not redshift.execute(
        f"""
        UNLOAD ('SELECT statement FROM {schema}.statements')
            TO '{s3_url}'
            IAM_ROLE '{redshift_iam_role}'
            ENCRYPTED
            DELIMITER AS ' '
            NULL AS ''
            ALLOWOVERWRITE
            PARALLEL OFF
            MAXFILESIZE 1 gb
        """,
    ):
        raise BackgroundJobError(f'Error executing Redshift unload to {s3_url}.')
    self.verify_unloaded_count(s3_url)
def run(self, datestamp=None):
    """Validate that SIS advising note attachments were migrated to S3 intact.

    :param datestamp: optional datestamp constraining which source paths to check.
    :return: success message when nothing is missing or failed.
    :raises BackgroundJobError: detailing sync failures and missing attachments.
    """
    s3_attachment_sync_failures = []
    missing_s3_attachments = []
    app.logger.info('Starting SIS Advising Note attachments validation job...')
    dest_prefix = app.config['LOCH_S3_ADVISING_NOTE_ATTACHMENT_DEST_PATH']
    for source_prefix in self.source_paths(datestamp):
        app.logger.info(f'Will validate files from {source_prefix}.')
        s3_attachment_sync_failures.extend(self.verify_attachment_migration(source_prefix, dest_prefix))
    # NOTE(review): line structure was lost in this copy of the file. This call is
    # placed after the loop because it depends only on dest_prefix — confirm the
    # original placement against version control.
    missing_s3_attachments = self.find_missing_notes_view_attachments(dest_prefix)
    if s3_attachment_sync_failures or missing_s3_attachments:
        verification_results = {
            'attachment_sync_failure_count': len(s3_attachment_sync_failures),
            'missing_s3_attachments_count': len(missing_s3_attachments),
            'attachment_sync_failures': s3_attachment_sync_failures,
            'missing_s3_attachments': missing_s3_attachments,
        }
        raise BackgroundJobError(f'Attachments verification found missing attachments or sync failures: {verification_results}.')
    else:
        return 'Note attachment verification completed successfully. No missing attachments or sync failures found.'
def run(self):
    """Generate per-student ASC athletics profiles from Redshift rows, stash them
    in S3, COPY into Redshift, refresh RDS indexes, and purge inactive memberships
    for students active in at least one sport.
    """
    app.logger.info('Starting ASC profile generation job...')
    # Ordered by sid so groupby below yields one group per student.
    asc_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER by sid, UPPER(team_name)',
        schema=asc_schema_identifier,
    )
    profile_rows = []
    sids_for_inactive_deletion = []
    for sid, rows_for_student in groupby(asc_rows, operator.itemgetter('sid')):
        rows_for_student = list(rows_for_student)
        # Since BOAC believes (falsely) that isActiveAsc and statusAsc are attributes of a student, not
        # a team membership, a bit of brutal simplification is needed. Students who are active in at least
        # one sport have inactive team memberships dropped.
        any_active_athletics = reduce(
            operator.or_, [r['active'] for r in rows_for_student], False)
        if any_active_athletics:
            rows_for_student = [r for r in rows_for_student if r['active']]
            sids_for_inactive_deletion.append(sid)
        # The first remaining row supplies the student-level flags.
        athletics_profile = {
            'athletics': [],
            'inIntensiveCohort': rows_for_student[0]['intensive'],
            'isActiveAsc': rows_for_student[0]['active'],
            'statusAsc': rows_for_student[0]['status_asc'],
        }
        for row in rows_for_student:
            athletics_profile['athletics'].append({
                'groupCode': row['group_code'],
                'groupName': row['group_name'],
                'name': row['group_name'],
                'teamCode': row['team_code'],
                'teamName': row['team_name'],
            })
        profile_rows.append(
            encoded_tsv_row([sid, json.dumps(athletics_profile)]))
    s3_key = f'{get_s3_asc_daily_path()}/athletics_profiles.tsv'
    app.logger.info(
        f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_asc}.student_profiles;
        COPY {redshift_schema_asc}.student_profiles
            FROM '{loch_s3_asc_data_path}/athletics_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        # Unlike most failures in this job, a copy failure returns False
        # instead of raising.
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(asc_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    if sids_for_inactive_deletion:
        # Drop the now-redundant inactive memberships from both stores.
        redshift.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion, ),
        )
        rds.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion, ),
        )
    return 'ASC profile generation complete.'
def index_appointment_advisors(self):
    """Run the templated RDS DDL that indexes SIS appointment advisors.

    :raises BackgroundJobError: when the DDL execution fails.
    """
    ddl = resolve_sql_template('index_sis_appointment_advisors.template.sql')
    if not rds.execute(ddl):
        raise BackgroundJobError('Failed to index appointment advisors.')
    app.logger.info('Indexed appointment advisors.')
def _rollback():
    # Closure helper: `transaction` and `table` are captured from the enclosing
    # scope. Undo the in-flight transaction, then abort the job with a
    # descriptive error.
    transaction.rollback()
    raise BackgroundJobError(
        f'Failed to populate table {student_schema()}.{table} from staging schema.'
    )
def run(self):
    """Create the COE schema: recreate the external schema in Redshift, build
    per-student profile JSON feeds, stash them in S3, COPY into Redshift, and
    refresh RDS indexes.
    """
    app.logger.info('Starting COE schema creation job...')
    # NOTE(review): external_schema and internal_schema_identifier are resolved
    # from enclosing (module) scope, not defined in this method.
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
    # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
    # ought to set up proper staging in conjunction with verification. It's also possible that a persistent
    # external schema isn't needed.
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('COE external schema created.')
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('COE external schema creation failed.')
    # Ordered by sid so groupby below yields one group per student.
    coe_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER by sid',
        schema=internal_schema_identifier,
    )
    profile_rows = []
    index = 1
    for sid, rows_for_student in groupby(coe_rows, operator.itemgetter('sid')):
        # NOTE(review): the denominator is the total row count, not the distinct
        # student count, so the "N of M" log may overstate M.
        app.logger.info(
            f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})'
        )
        index += 1
        # Only the first row per student feeds the profile.
        row_for_student = list(rows_for_student)[0]
        coe_profile = {
            'advisorUid': row_for_student.get('advisor_ldap_uid'),
            'gender': row_for_student.get('gender'),
            'ethnicity': row_for_student.get('ethnicity'),
            'minority': row_for_student.get('minority'),
            'didPrep': row_for_student.get('did_prep'),
            'prepEligible': row_for_student.get('prep_eligible'),
            'didTprep': row_for_student.get('did_tprep'),
            'tprepEligible': row_for_student.get('tprep_eligible'),
            'sat1read': row_for_student.get('sat1read'),
            'sat1math': row_for_student.get('sat1math'),
            'sat2math': row_for_student.get('sat2math'),
            'inMet': row_for_student.get('in_met'),
            'gradTerm': row_for_student.get('grad_term'),
            'gradYear': row_for_student.get('grad_year'),
            'probation': row_for_student.get('probation'),
            'status': row_for_student.get('status'),
        }
        profile_rows.append(encoded_tsv_row([sid, json.dumps(coe_profile)]))
    s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
    app.logger.info(
        f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_coe}.student_profiles
            FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(coe_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    return 'COE internal schema created.'
def update_manifests(self):
    """Rebuild the S3 manifest files that list the SIS course and enrollment
    extracts Redshift should load.

    :return: True only when both manifest uploads succeed.
    :raises BackgroundJobError: if no timely SIS data exists in S3, or an
        expected per-term extract file is missing.
    """
    app.logger.info('Updating manifests...')
    # Because the SIS S3 copy is managed by a different application running on a different schedule,
    # it may have been made before midnight by Nessie-time.
    s3_sis_daily = get_s3_sis_daily_path()
    if not s3.get_keys_with_prefix(s3_sis_daily):
        s3_sis_daily = get_s3_sis_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_sis_daily):
            raise BackgroundJobError('No timely SIS S3 data found')
        else:
            app.logger.info('Falling back to SIS S3 daily data for yesterday')
    courses_daily = s3.get_keys_with_prefix(s3_sis_daily + '/courses', full_objects=True)
    courses_historical = s3.get_keys_with_prefix(
        app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/courses', full_objects=True)
    enrollments_daily = s3.get_keys_with_prefix(s3_sis_daily + '/enrollments', full_objects=True)
    enrollments_historical = s3.get_keys_with_prefix(
        app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/enrollments', full_objects=True)

    def deduplicate(prefix, s3list):
        # Keep one object per .gz filename; for duplicate filenames the
        # later entry in s3list wins. Then verify every expected per-term
        # extract is present.
        filename_map = {}
        for s3obj in s3list:
            m = re.match(r'.+/(.+\.gz)', s3obj['Key'])
            if m:
                filename_map[m[1]] = s3obj
        for term_id in reverse_term_ids(include_future_terms=True):
            filename = f'{prefix}-{term_id}.gz'
            if filename not in filename_map:
                # Bug fix: report the actual missing filename instead of the
                # literal placeholder '(unknown)'.
                raise BackgroundJobError(
                    f'Expected filename {filename} not found in S3, aborting'
                )
        return list(filename_map.values())

    all_courses = deduplicate('courses', courses_daily + courses_historical)
    all_enrollments = deduplicate(
        'enrollments', enrollments_daily + enrollments_historical)

    def to_manifest_entry(_object):
        return {
            'url': f"s3://{app.config['LOCH_S3_BUCKET']}/{_object['Key']}",
            'meta': {
                'content_length': _object['Size'],
            },
        }

    def to_manifest(objects):
        return {
            'entries': [to_manifest_entry(o) for o in objects],
        }

    courses_manifest = json.dumps(to_manifest(all_courses))
    enrollments_manifest = json.dumps(to_manifest(all_enrollments))
    courses_result = s3.upload_data(
        courses_manifest,
        app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/courses.json')
    enrollments_result = s3.upload_data(
        enrollments_manifest,
        app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/enrollments.json')
    return courses_result and enrollments_result
def run(self, load_mode='new'):
    """Import registrations/demographics feeds per SID and merge them into Redshift and RDS.

    :param load_mode: 'new' imports only SIDs never imported before; 'batch'
        adds the SIDs with the oldest imports up to CYCLICAL_API_IMPORT_BATCH_SIZE;
        'all' imports every student.
    """
    all_sids = [row['sid'] for row in get_all_student_ids()]
    previous_backfills = {row['sid'] for row in get_sids_with_registration_imports()}
    if load_mode == 'new':
        sids = list(set(all_sids).difference(previous_backfills))
    elif load_mode == 'batch':
        new_sids = list(set(all_sids).difference(previous_backfills))
        limit = app.config['CYCLICAL_API_IMPORT_BATCH_SIZE'] - len(new_sids)
        if limit > 0:
            # Fill the remaining batch capacity with the stalest previous imports.
            oldest_backfills = [row['sid'] for row in get_active_sids_with_oldest_registration_imports(limit=limit)]
            sids = new_sids + oldest_backfills
        else:
            sids = new_sids
    elif load_mode == 'all':
        sids = all_sids
    # NOTE(review): any other load_mode leaves `sids` unbound and raises
    # NameError on the next line — consider validating load_mode explicitly.
    app.logger.info(f'Starting registrations/demographics import job for {len(sids)} students...')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
        'api_demographics': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids)
    # A backfill run that produced zero successes but some failures indicates a
    # systemic problem, so abort rather than truncate staging tables.
    if load_mode != 'new' and (len(successes) == 0) and (len(failures) > 0):
        raise BackgroundJobError('Failed to import registration histories: aborting job.')
    for key in rows.keys():
        s3_key = f'{get_s3_sis_api_daily_path(use_edl_if_feature_flag=True)}/{key}.tsv'
        app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows[key], s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')
        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(f'TRUNCATE {student_schema()}_staging.student_{key}'):
            raise BackgroundJobError('Error truncating old staging rows: aborting job.')
        if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.student_{key}', s3_key):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
        # Upsert pattern: delete destination rows present in staging, insert the
        # staged rows, then clear staging.
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {student_schema}.student_{table_key}
                WHERE sid IN
                (SELECT sid FROM {student_schema}_staging.student_{table_key});
            INSERT INTO {student_schema}.student_{table_key}
                (SELECT * FROM {student_schema}_staging.student_{table_key});
            TRUNCATE TABLE {student_schema}_staging.student_{table_key};
            """,
            table_key=key,
            student_schema=student_schema(),
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(sids, rows['term_gpas'], transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Failed to refresh RDS indexes.')
    update_registration_import_status(successes, failures)
    return (
        f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
    )
def run(self):
    """Import team and student-athlete data from the ASC API into S3 and Redshift.

    :return: status dict with the feed's sync date and row count.
    :raises BackgroundJobError: on API errors, empty results, SyncDate
        conflicts, or S3/Redshift load failures.
    """
    app.logger.info(
        'ASC import: Fetch team and student athlete data from ASC API')
    api_results = get_asc_feed()
    if 'error' in api_results:
        raise BackgroundJobError(
            'ASC import: Error from external API: {}'.format(
                api_results['error']))
    elif not api_results:
        raise BackgroundJobError('ASC import: API returned zero students')
    sync_date = api_results[0]['SyncDate']
    # Every row in a feed must carry the same SyncDate; first vs. last is a
    # cheap consistency check for a mixed batch.
    if sync_date != api_results[-1]['SyncDate']:
        raise BackgroundJobError(
            f'ASC import: SyncDate conflict in ASC API: {api_results[0]} vs. {api_results[-1]}'
        )
    rows = []
    for r in api_results:
        # Only current-academic-year rows that name a sport are imported.
        if r['AcadYr'] == app.config['ASC_THIS_ACAD_YR'] and r['SportCode']:
            asc_code = r['SportCodeCore']
            if asc_code in SPORT_TRANSLATIONS:
                group_code = r['SportCode']
                data = [
                    r['SID'],
                    # 'Yes'/'No' flags become 'True'/'False' strings for the TSV.
                    str(r.get('ActiveYN', 'No') == 'Yes'),
                    str(r.get('IntensiveYN', 'No') == 'Yes'),
                    r.get('SportStatus', ''),
                    group_code,
                    _unambiguous_group_name(r['Sport'], group_code),
                    SPORT_TRANSLATIONS[asc_code],
                    r['SportCore'],
                ]
                rows.append(encoded_tsv_row(data))
            else:
                # Sport code not in the translation table: log and skip the row.
                sid = r['SID']
                app.logger.error(
                    f'ASC import: Unmapped asc_code {asc_code} has ActiveYN for sid={sid}'
                )
    s3_key = f'{get_s3_asc_daily_path()}/asc_api_raw_response_{sync_date}.tsv'
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Copy data in S3 file to Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_asc}.students;
        COPY {redshift_schema_asc}.students
            FROM 's3://{s3_bucket}/{s3_key}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        s3_bucket=app.config['LOCH_S3_BUCKET'],
        s3_key=s3_key,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    status = {
        'this_sync_date': sync_date,
        'api_results_count': len(api_results),
    }
    app.logger.info(
        f'ASC import: Successfully completed import job: {str(status)}')
    return status
def run(self, cleanup=True):
    """Dispatch S3 sync tasks for the latest Canvas Data snapshots.

    All snapshots are synced except for the 'requests' table, where only
    partial snapshots or those dated on/after the configured cutoff qualify.

    :param cleanup: when True, delete obsolete 'requests' objects from S3
        after dispatching.
    :return: summary message with dispatch success/failure counts.
    :raises BackgroundJobError: when the snapshot listing cannot be retrieved.
    """
    job_id = self.generate_job_id()
    app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')
    snapshot_response = canvas_data.get_snapshots()
    if not snapshot_response:
        raise BackgroundJobError(
            'Error retrieving Canvas data snapshots, aborting job.')
    snapshots = snapshot_response.get('files', [])

    def should_sync(snapshot):
        # For tables other than requests, sync all snapshots.
        # For the requests table, sync snapshots that are partial or later than the configured cutoff date.
        def after_cutoff_date(url):
            # Bug fix: raw string for the regex — '\d' in a plain literal is an
            # invalid escape sequence (SyntaxWarning on Python 3.12+).
            match = re.search(r'requests/(20\d{6})', url)
            return match is not None and (
                match[1] >= app.config['LOCH_CANVAS_DATA_REQUESTS_CUTOFF_DATE'])
        return snapshot['table'] != 'requests' or snapshot[
            'partial'] is True or after_cutoff_date(snapshot['url'])

    snapshots_to_sync = [s for s in snapshots if should_sync(s)]
    app.logger.info(
        f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.'
    )
    success = 0
    failure = 0
    for snapshot in snapshots_to_sync:
        metadata.create_canvas_sync_status(
            job_id=job_id,
            filename=snapshot['filename'],
            canvas_table=snapshot['table'],
            source_url=snapshot['url'],
        )
        # 'requests' snapshots are filed under the current-term path; all other
        # tables go under the daily path.
        if snapshot['table'] == 'requests':
            key_components = [
                berkeley.s3_canvas_data_path_current_term(),
                snapshot['table'],
                snapshot['filename'],
            ]
        else:
            key_components = [
                get_s3_canvas_daily_path(),
                snapshot['table'],
                snapshot['filename'],
            ]
        key = '/'.join(key_components)
        response = dispatch('sync_file_to_s3', data={
            'canvas_sync_job_id': job_id,
            'url': snapshot['url'],
            'key': key,
        })
        if not response:
            app.logger.error('Failed to dispatch S3 sync of snapshot ' + snapshot['filename'])
            metadata.update_canvas_sync_status(
                job_id, key, 'error',
                details=f'Failed to dispatch: {response}')
            failure += 1
        else:
            app.logger.info('Dispatched S3 sync of snapshot ' + snapshot['filename'])
            success += 1
    if cleanup:
        app.logger.info('Will remove obsolete snapshots from S3.')
        current_snapshot_filenames = [
            s['filename'] for s in snapshots_to_sync
        ]
        requests_prefix = berkeley.s3_canvas_data_path_current_term() + '/requests'
        delete_result = s3.delete_objects_with_prefix(
            requests_prefix, whitelist=current_snapshot_filenames)
        if not delete_result:
            # Best-effort cleanup: log the failure but do not fail the job.
            app.logger.error('Cleanup of obsolete snapshots failed.')
    return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
def run(self, load_mode='batch'):
    """Import historical (non-advisee) registration data in bounded batches.

    :param load_mode: 'new' imports every SID lacking a registration import;
        'batch' caps the run at HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE SIDs.
    """
    new_sids = [
        row['sid'] for row in get_non_advisees_without_registration_imports()
    ]
    # Owing to the size of the non-advisee population, a one-shot load of all these slow feeds may not
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    #
    # (With the logic change in NS-1155 to pre-screen SIDs for student affiliation in CalNet, the cutoff is less
    # likely to be triggered.)
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        if max_batch >= len(new_sids):
            sids = new_sids
        else:
            sids = new_sids[0:(max_batch)]
    # NOTE(review): any other load_mode leaves `sids` unbound (NameError below).
    app.logger.info(
        f'Starting import of historical registration data for {len(sids)} students...'
    )
    redshift.execute('VACUUM; ANALYZE;')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.get_registration_data_per_sids(
        rows, sids, include_demographics=False)
    for key in rows.keys():
        if len(rows[key]) > 0:
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(
                f'Upload {key} data to s3:{s3_key}. The file represents {len(rows[key])} students.'
            )
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError(
                    f'Error during S3 upload: {s3_key}. Aborting job.')
            staging_table = f'{student_schema()}_staging.hist_enr_{key}'
            if not redshift.execute(f'TRUNCATE {staging_table}'):
                raise BackgroundJobError(
                    'Error truncating old staging rows: aborting job.')
            app.logger.info(
                f'Populate {staging_table} (Redshift table) with s3:{s3_key}'
            )
            if not redshift.copy_tsv_from_s3(staging_table, s3_key):
                raise BackgroundJobError(
                    'Error on Redshift copy: aborting job.')
            app.logger.info(
                f'Insert student data into {student_schema()}.hist_enr_{key}'
            )
            # Upsert pattern: delete destination rows present in staging,
            # insert the staged rows, then clear staging.
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {student_schema}.hist_enr_{table_key}
                    WHERE sid IN
                    (SELECT sid FROM {student_schema}_staging.hist_enr_{table_key});
                INSERT INTO {student_schema}.hist_enr_{table_key}
                    (SELECT * FROM {student_schema}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {student_schema}_staging.hist_enr_{table_key};
                """,
                table_key=key,
                student_schema=student_schema(),
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError(
                    'Error inserting staging entries into destination: aborting job.'
                )
    redshift.execute('VACUUM; ANALYZE;')
    return (
        f'Finished import of historical registration data: {len(successes)} successes and {len(failures)} failures.'
    )
def run(self, csids=None):
    """Import SIS degree progress feeds and merge them into Redshift via a
    temporary external (Spectrum) staging table.

    :param csids: optional list of CSIDs; defaults to all known students.
    :return: summary message with success / no-information / failure counts.
    """
    if not csids:
        all_sids = get_all_student_ids()
        if all_sids:
            csids = [row['sid'] for row in all_sids]
    app.logger.info(
        f'Starting SIS degree progress API import job for {len(csids)} students...'
    )
    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1
    # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
    # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
    # alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(
            f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})'
        )
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
        elif feed == {}:
            # Empty dict means the API answered but had no degree progress data.
            app.logger.info(
                f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(
                f'SIS get_degree_progress failed for SID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(
        f'TRUNCATE {self.redshift_schema}_staging.sis_api_degree_progress'
    ):
        raise BackgroundJobError(
            'Error truncating old staging rows: aborting job.')
    # Load path: temporary external table over the S3 TSV -> staging table ->
    # destination table, tearing down the temporary schema afterward.
    query = resolve_sql_template_string(
        """
        CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp
        FROM data catalog
        DATABASE '{redshift_schema_student}_staging_ext_tmp'
        IAM_ROLE '{redshift_iam_role}'
        CREATE EXTERNAL DATABASE IF NOT EXISTS;
        CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress (
            sid VARCHAR,
            feed VARCHAR(MAX)
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\t'
        STORED AS TEXTFILE
        LOCATION '{loch_s3_sis_api_data_path}/degree_progress';
        DELETE FROM {redshift_schema_student}_staging.sis_api_degree_progress
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}_staging.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
        DROP TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress;
        DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute_ddl_script(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    redshift.execute('VACUUM; ANALYZE;')
    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )
def refresh_all_from_staging(tables):
    """Refresh every given table from its staging counterpart inside a single
    Redshift transaction, committing once at the end.

    :param tables: iterable of table names to refresh.
    :raises BackgroundJobError: when the final commit fails.
    """
    with redshift.transaction() as transaction:
        for table_name in tables:
            refresh_from_staging(table_name, None, None, transaction)
        committed = transaction.commit()
        if not committed:
            raise BackgroundJobError(f'Final transaction commit failed for {redshift_schema()}.')
def run(self):
    """Rebuild the external Canvas Data schema in Redshift Spectrum/Glue from
    the schema definitions published by the Canvas data API.
    """
    # Retrieve latest schema definitions from Canvas data API
    response = canvas_data.get_canvas_data_schema()
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    redshift_iam_role = app.config['REDSHIFT_IAM_ROLE']
    canvas_schema = []
    # Parse and isolate table and column details
    for key, value in response['schema'].items():
        for column in value['columns']:
            # Not every column has description and length.
            description = None
            if 'description' in column:
                description = column['description']
            length = None
            if 'length' in column:
                length = column['length']
            canvas_schema.append([
                value['tableName'],
                column['name'],
                column['type'],
                description,
                length,
            ])
    # Create a dataframe
    schema_df = pd.DataFrame(canvas_schema)
    schema_df.columns = [
        'table_name',
        'column_name',
        'column_type',
        'column_description',
        'column_length',
    ]
    # The schema definitions received from Canvas are Redshift compliant. We update
    # certain column types to match Glue and Spectrum data types.
    schema_df['glue_type'] = schema_df['column_type'].replace({
        'enum': 'varchar',
        'guid': 'varchar',
        'text': 'varchar(max)',
        'date': 'timestamp',
        'datetime': 'timestamp',
    })
    # Quote column names that collide with SQL reserved words.
    schema_df['transformed_column_name'] = schema_df[
        'column_name'].replace({
            'default': '"default"',
            'percent': '"percent"',
        })
    # Create Hive compliant storage descriptors
    canvas_external_catalog_ddl = self.generate_external_catalog(
        external_schema, schema_df)
    # Clean up and recreate refreshed tables on Glue using Spectrum
    redshift.drop_external_schema(external_schema)
    redshift.create_external_schema(external_schema, redshift_iam_role)
    if redshift.execute_ddl_script(canvas_external_catalog_ddl):
        app.logger.info('Canvas schema creation job completed.')
    else:
        app.logger.error('Canvas schema creation job failed.')
        raise BackgroundJobError('Canvas schema creation job failed.')
    self.verify_external_data_catalog()
    return 'Canvas external schema created and verified.'
def run(self):
    """Generate Canvas Caliper analytics tables from the latest exploded
    Caliper extracts, after sanity-checking event timestamps against the
    previously transformed data.
    """
    app.logger.info('Start generating canvas caliper analytics')
    redshift_schema_caliper_analytics = app.config[
        'REDSHIFT_SCHEMA_CALIPER']
    redshift_schema_lrs_external = app.config['REDSHIFT_SCHEMA_LRS']
    canvas_caliper_explode_table = 'canvas_caliper_explode'
    # Because the Caliper incrementals are provided by a Glue job running on a different schedule, the latest batch
    # may have been delivered before last midnight UTC.
    s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path()
    if not s3.get_keys_with_prefix(s3_caliper_daily_path):
        s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path(
            datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_caliper_daily_path):
            raise BackgroundJobError('No timely S3 Caliper extracts found')
        else:
            app.logger.info(
                'Falling back S3 Caliper extracts for yesterday')
    s3_caliper_daily_url = s3.build_s3_url(s3_caliper_daily_path)
    resolved_ddl_caliper_explode = resolve_sql_template(
        'create_lrs_canvas_explode_table.template.sql',
        canvas_caliper_explode_table=canvas_caliper_explode_table,
        loch_s3_caliper_explode_url=s3_caliper_daily_url,
    )
    redshift.drop_external_schema(redshift_schema_lrs_external)
    if redshift.execute_ddl_script(resolved_ddl_caliper_explode):
        app.logger.info(
            'Caliper explode schema and table successfully created.')
    else:
        raise BackgroundJobError(
            'Caliper explode schema and table creation failed.')

    # Sanity-check event times from the latest Caliper batch against previously transformed event times.
    def datetime_from_query(query):
        # Fetch a single MIN/MAX timestamp; abort the job if the query returns nothing.
        response = redshift.fetch(query)
        timestamp = response and response[0] and response[0].get(
            'timestamp')
        if not timestamp:
            raise BackgroundJobError(
                f'Timestamp query failed to return data for comparison; aborting job: {query}'
            )
        if isinstance(timestamp, str):
            # The driver may return the value as an ISO-8601 string rather than a datetime.
            timestamp = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
        return timestamp

    earliest_untransformed = datetime_from_query(
        f'SELECT MIN(timestamp) AS timestamp FROM {redshift_schema_lrs_external}.{canvas_caliper_explode_table}',
    )
    latest_transformed = datetime_from_query(
        f'SELECT MAX(timestamp) AS timestamp FROM {redshift_schema_caliper_analytics}.canvas_caliper_user_requests',
    )
    if not earliest_untransformed or not latest_transformed:
        return False
    # Difference (seconds) between the new batch's earliest event and the last
    # transformed event must fall within the configured tolerance window.
    timestamp_diff = (earliest_untransformed - latest_transformed).total_seconds()
    lower_bound_tolerance, upper_bound_tolerance = app.config[
        'LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE']
    if timestamp_diff < lower_bound_tolerance or timestamp_diff > upper_bound_tolerance:
        raise BackgroundJobError(
            f'Unexpected difference between Caliper timestamps: latest transformed {latest_transformed}, '
            f'earliest untransformed {earliest_untransformed}',
        )
    resolved_ddl_caliper_analytics = resolve_sql_template(
        'generate_caliper_analytics.template.sql')
    if redshift.execute_ddl_script(resolved_ddl_caliper_analytics):
        return 'Caliper analytics tables successfully created.'
    else:
        raise BackgroundJobError(
            'Caliper analytics tables creation failed.')
def create_historical_tables(self, external_schema):
    """Load historical SIS advising notes via the DDL template, then verify the external schema."""
    ddl = resolve_sql_template('create_sis_advising_notes_historical_schema.template.sql')
    # Fail fast if the script does not run cleanly; verification only makes sense on success.
    if not redshift.execute_ddl_script(ddl):
        raise BackgroundJobError('SIS Advising Notes schema creation job failed to load historical data.')
    verify_external_schema(external_schema, ddl)
def run(self, term_id=None):
    """Import Canvas enrollments API feeds for every enrolled course site in a term.

    :param term_id: SIS term id; defaults to the current term.
    :return: summary string with success/failure counts.
    :raises BackgroundJobError: on S3 upload or Redshift load failure.
    """
    if not term_id:
        term_id = current_term_id()
    canvas_course_ids = [
        row['canvas_course_id']
        for row in get_enrolled_canvas_sites_for_term(term_id)
    ]
    app.logger.info(
        f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...'
    )
    rows = []
    success_count = 0
    failure_count = 0
    # enumerate(start=1) replaces the previous hand-maintained index counter.
    for index, course_id in enumerate(canvas_course_ids, start=1):
        app.logger.info(
            f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})'
        )
        feed = canvas_api.get_course_enrollments(course_id)
        if feed:
            success_count += 1
            for enrollment in feed:
                user_id = enrollment.get('user_id')
                # Null activity timestamps become empty TSV fields.
                last_activity_at = enrollment.get('last_activity_at') or ''
                rows.append(
                    encoded_tsv_row([
                        course_id, user_id, term_id, last_activity_at,
                        json.dumps(enrollment)
                    ]))
        else:
            failure_count += 1
            app.logger.error(
                f'Canvas enrollments API import failed for course id {course_id}.'
            )
    s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments/canvas_api_enrollments_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    # Stage via a temporary external table, then swap the term's rows into the destination table.
    query = resolve_sql_template_string(
        """
        CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
            DATABASE '{redshift_schema_student}_staging_ext_tmp'
            IAM_ROLE '{redshift_iam_role}'
            CREATE EXTERNAL DATABASE IF NOT EXISTS;
        CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments (
            course_id VARCHAR,
            user_id VARCHAR,
            term_id VARCHAR,
            last_activity_at TIMESTAMP,
            feed VARCHAR
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\t'
        STORED AS TEXTFILE
        LOCATION '{loch_s3_sis_api_data_path}/canvas_api_enrollments';
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        INSERT INTO {redshift_schema_student}_staging.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments);
        DROP TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments;
        DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;
        DELETE FROM {redshift_schema_student}.canvas_api_enrollments
            WHERE term_id = '{term_id}'
            AND course_id IN
            (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute_ddl_script(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return (
        f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{failure_count} failed.')
def create_indexes(self):
    """Create the E&I advising notes indexes in RDS from the SQL template."""
    ddl = resolve_sql_template('index_e_i_advising_notes.template.sql')
    # Guard clause: abort the job if index creation fails.
    if not rds.execute(ddl):
        raise BackgroundJobError('E&I Advising Notes schema creation job failed to create indexes.')
    app.logger.info('Created E&I Advising Notes RDS indexes.')
def run(self, truncate_lrs=True):
    """Replicate incremental LRS statements to S3 via a DMS full-load task.

    :param truncate_lrs: if True (default), truncate the incremental LRS table after migration.
    :return: summary string describing statements migrated and destination buckets.
    :raises BackgroundJobError: on any failed step (task start, comparison fetch, S3 keys, truncate).
    """
    app.logger.info('Starting DMS replication task...')
    task_id = app.config['LRS_CANVAS_INCREMENTAL_REPLICATION_TASK_ID']
    self.transient_bucket = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET']
    self.transient_path = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_PATH']
    self.delete_old_incrementals()
    response = dms.start_replication_task(task_id)
    if not response:
        # Bug fix: this message lacked the f-string prefix, so {response} was emitted literally.
        raise BackgroundJobError(f'Failed to start DMS replication task (response={response}).')
    # Poll until DMS reports the task stopped; a full-load-only finish is the expected stop reason.
    while True:
        response = dms.get_replication_task(task_id)
        if response.get('Status') == 'stopped':
            if response.get('StopReason') == 'Stop Reason FULL_LOAD_ONLY_FINISHED':
                app.logger.info('DMS replication task completed')
                break
            else:
                raise BackgroundJobError(f'Replication task stopped for unexpected reason: {response}')
        sleep(10)
    # Capture the LRS statement count for later comparison/reporting.
    lrs_response = lrs.fetch('select count(*) from statements')
    if lrs_response:
        self.lrs_statement_count = lrs_response[0][0]
    else:
        raise BackgroundJobError('Failed to retrieve LRS statements for comparison.')
    transient_keys = s3.get_keys_with_prefix(self.transient_path, bucket=self.transient_bucket)
    if not transient_keys:
        raise BackgroundJobError('Could not retrieve S3 keys from transient bucket.')
    self.verify_and_unload_transient()
    # Timestamped destination path keeps each migration batch distinct.
    timestamp_path = localize_datetime(datetime.now()).strftime('%Y/%m/%d/%H%M%S')
    destination_path = app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_PATH'] + '/' + timestamp_path
    for destination_bucket in app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']:
        self.migrate_transient_to_destination(
            transient_keys,
            destination_bucket,
            destination_path,
        )
    if truncate_lrs:
        if lrs.execute('TRUNCATE statements'):
            app.logger.info('Truncated incremental LRS table.')
        else:
            raise BackgroundJobError('Failed to truncate incremental LRS table.')
    return (
        # Bug fix: added the missing space before '(buckets=' in the status message.
        f'Migrated {self.lrs_statement_count} statements to S3 '
        f"(buckets={app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']}, path={destination_path})"
    )
def generate_feeds(self):
    """Generate merged student profile and enrollment-term feeds.

    Builds student profile tables, uploads advisee maps to S3, queues per-term
    enrollment jobs on worker nodes, waits for them to finish, then refreshes
    RDS indexes and enrollment terms. Raises BackgroundJobError if any stage
    fails (including errors reported by the queued term jobs).
    """
    # Translation between canvas_user_id and UID/SID is needed to merge Canvas analytics data and SIS enrollment-based data.
    advisees_by_canvas_id = {}
    advisees_by_sid = {}
    self.successes = []
    self.failures = []
    # generate_student_profile_tables populates the two advisee maps as a side effect.
    profile_tables = self.generate_student_profile_tables(
        advisees_by_canvas_id, advisees_by_sid)
    if not profile_tables:
        raise BackgroundJobError(
            'Failed to generate student profile tables.')
    feed_path = app.config['LOCH_S3_BOAC_ANALYTICS_DATA_PATH'] + '/feeds/'
    s3.upload_json(advisees_by_canvas_id,
                   feed_path + 'advisees_by_canvas_id.json')
    upload_student_term_maps(advisees_by_sid)
    # Avoid processing Canvas analytics data for future terms and pre-CS terms.
    for term_id in (future_term_ids() + legacy_term_ids()):
        enrollment_term_map = s3.get_object_json(
            feed_path + f'enrollment_term_map_{term_id}.json')
        if enrollment_term_map:
            GenerateMergedEnrollmentTerm().refresh_student_enrollment_term(
                term_id, enrollment_term_map)
    canvas_integrated_term_ids = reverse_term_ids()
    app.logger.info(
        f'Will queue analytics generation for {len(canvas_integrated_term_ids)} terms on worker nodes.'
    )
    result = queue_merged_enrollment_term_jobs(self.job_id,
                                               canvas_integrated_term_ids)
    if not result:
        raise BackgroundJobError('Failed to queue enrollment term jobs.')
    refresh_all_from_staging(profile_tables)
    self.update_redshift_academic_standing()
    self.update_rds_profile_indexes()
    app.logger.info(
        'Profile generation complete; waiting for enrollment term generation to finish.'
    )
    # Poll the queued term jobs until none remain in 'created' or 'started' status.
    while True:
        sleep(1)
        enrollment_results = get_merged_enrollment_term_job_status(
            self.job_id)
        if not enrollment_results:
            raise BackgroundJobError('Failed to refresh RDS indexes.')
        any_pending_job = next(
            (row for row in enrollment_results
             if row['status'] == 'created' or row['status'] == 'started'),
            None)
        if not any_pending_job:
            break
    app.logger.info('Exporting analytics data for archival purposes.')
    unload_enrollment_terms([current_term_id(), future_term_id()])
    app.logger.info('Refreshing enrollment terms in RDS.')
    # Refresh enrollment terms atomically: commit only if the refresh succeeds.
    with rds.transaction() as transaction:
        if self.refresh_rds_enrollment_terms(None, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS enrollment terms.')
        else:
            transaction.rollback()
            raise BackgroundJobError(
                'Failed to refresh RDS enrollment terms.')
    # Summarize per-term results; any term job reporting 'error' fails the whole run.
    status_string = f'Generated merged profiles ({len(self.successes)} successes, {len(self.failures)} failures).'
    errored = False
    for row in enrollment_results:
        status_string += f" {row['details']}"
        if row['status'] == 'error':
            errored = True
    truncate_staging_table('student_enrollment_terms')
    if errored:
        raise BackgroundJobError(status_string)
    else:
        return status_string
def run(self, cleanup=True):
    """Dispatch S3 sync of full Canvas 'requests' snapshots to workers, optionally pruning obsolete snapshots."""
    job_id = self.generate_job_id()
    app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')
    snapshot_response = canvas_data.get_snapshots()
    if not snapshot_response:
        raise BackgroundJobError(
            'Error retrieving Canvas data snapshots, aborting job.')
    all_snapshots = snapshot_response.get('files', [])
    # Only full (non-partial) dumps of the 'requests' table are synced.
    snapshots_to_sync = [
        s for s in all_snapshots
        if s['table'] == 'requests' and s['partial'] is False
    ]
    app.logger.info(
        f'Will sync {len(snapshots_to_sync)} of {len(all_snapshots)} available files from Canvas Data.'
    )
    success = 0
    failure = 0
    for snapshot in snapshots_to_sync:
        filename = snapshot['filename']
        metadata.create_canvas_sync_status(
            job_id=job_id,
            filename=filename,
            canvas_table=snapshot['table'],
            source_url=snapshot['url'],
        )
        key = '/'.join([
            app.config['LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'],
            snapshot['table'],
            filename,
        ])
        response = dispatch(
            'sync_file_to_s3',
            data={
                'canvas_sync_job_id': job_id,
                'url': snapshot['url'],
                'key': key,
            },
        )
        if response:
            app.logger.info('Dispatched S3 sync of snapshot ' + filename)
            success += 1
        else:
            app.logger.error('Failed to dispatch S3 sync of snapshot ' + filename)
            metadata.update_canvas_sync_status(
                job_id, key, 'error', details=f'Failed to dispatch: {response}')
            failure += 1
    if cleanup:
        # Remove snapshots no longer present in the current sync set.
        app.logger.info('Will remove obsolete snapshots from S3.')
        keep_filenames = [s['filename'] for s in snapshots_to_sync]
        requests_prefix = app.config['LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'] + '/requests'
        if not s3.delete_objects_with_prefix(requests_prefix, whitelist=keep_filenames):
            app.logger.error('Cleanup of obsolete snapshots failed.')
    return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
def create_rds_indexes(self):
    """Apply the advisor-schema index template to RDS."""
    # Abort the job immediately if the index DDL fails.
    if not rds.execute(resolve_sql_template('index_advisors.template.sql')):
        raise BackgroundJobError('Failed to create RDS indexes for advisor schema.')
    app.logger.info('Created RDS indexes for advisor schema.')
def run(self, load_mode='batch'):
    """Import SIS registration feeds for non-advisees who lack registration imports.

    :param load_mode: 'new' loads every outstanding SID; 'batch' (default) caps the run
                      at HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE SIDs.
    :return: summary string with success/failure counts.
    :raises BackgroundJobError: on S3 upload or Redshift staging/copy failure.
    :raises ValueError: on an unrecognized load_mode.
    """
    new_sids = [
        row['sid']
        for row in get_non_advisees_without_registration_imports()
    ]

    # The size of the non-advisee population makes it unlikely that a one-shot load of all these slow feeds will
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        # Slicing covers both cases: when max_batch >= len(new_sids), the whole list is taken.
        sids = new_sids[:max_batch]
    else:
        # Bug fix: an unrecognized load_mode previously left `sids` unbound, causing an
        # UnboundLocalError at first use instead of a clear error.
        raise ValueError(f'Unrecognized load_mode {load_mode}')

    app.logger.info(
        f'Starting registrations import job for {len(sids)} non-advisees...'
    )
    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.load_concurrently(rows, sids)
    if len(successes) > 0:
        for key in rows.keys():
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(
                f'Will stash {len(successes)} feeds in S3: {s3_key}')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError(
                    'Error on S3 upload: aborting job.')
            app.logger.info('Will copy S3 feeds into Redshift...')
            if not redshift.execute(
                    f'TRUNCATE {self.redshift_schema}_staging.hist_enr_{key}'):
                raise BackgroundJobError(
                    'Error truncating old staging rows: aborting job.')
            if not redshift.copy_tsv_from_s3(
                    f'{self.redshift_schema}_staging.hist_enr_{key}', s3_key):
                raise BackgroundJobError(
                    'Error on Redshift copy: aborting job.')
            # Swap staged rows into the destination table, then clear staging.
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {redshift_schema_student}.hist_enr_{table_key}
                    WHERE sid IN
                    (SELECT sid FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                INSERT INTO {redshift_schema_student}.hist_enr_{table_key}
                    (SELECT * FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {redshift_schema_student}_staging.hist_enr_{table_key};
                """,
                table_key=key,
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError(
                    'Error inserting staging entries into destination: aborting job.'
                )
    return (
        f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
    )