def import_advisor_attributes(self):
    csid_results = redshift.fetch(
        resolve_sql_template_string(
            'SELECT DISTINCT advisor_sid FROM {redshift_schema_advisor_internal}.advisor_students',
        ),
    )
    csids = [r['advisor_sid'] for r in csid_results]
    all_attributes = calnet.client(app).search_csids(csids)
    if len(csids) != len(all_attributes):
        ldap_csids = [person['csid'] for person in all_attributes]
        missing = set(csids) - set(ldap_csids)
        app.logger.warning(f'Looked for {len(csids)} advisor CSIDs but only found {len(all_attributes)}: missing {missing}')
    advisor_rows = []
    total_count = len(all_attributes)
    for index, a in enumerate(all_attributes):
        sid = a['csid']
        app.logger.info(f'CalNet import: Fetch attributes of advisor {sid} ({index + 1} of {total_count})')
        first_name, last_name = calnet.split_sortable_name(a)
        data = [
            a['uid'],
            sid,
            first_name,
            last_name,
            a['title'],
            calnet.get_dept_code(a),
            a['email'],
            a['campus_email'],
        ]
        advisor_rows.append(encoded_tsv_row(data))
    s3_key = f'{get_s3_calnet_daily_path()}/advisors/advisors.tsv'
    app.logger.info(f'Will stash {len(advisor_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(advisor_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_advisor_internal}.advisor_attributes;
        COPY {redshift_schema_advisor_internal}.advisor_attributes
            FROM '{loch_s3_calnet_data_path}/advisors/advisors.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
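# Note: `encoded_tsv_row` is called throughout these jobs but is not defined in this excerpt.
# A minimal sketch, assuming it simply joins stringified values with tabs and UTF-8-encodes the
# result for s3.upload_tsv_rows; the real helper (in nessie.lib.util) may handle None values and
# embedded delimiters differently.
def encoded_tsv_row(values):
    return '\t'.join('' if v is None else str(v) for v in values).encode()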
def import_advisor_attributes(self):
    if self.feature_flag_edl:
        sql = resolve_sql_template_string("""
            SELECT DISTINCT advisor_id FROM {redshift_schema_edl_external}.student_advisor_data
            WHERE academic_career_cd = 'UGRD' AND advisor_id ~ '[0-9]+'
        """)
        advisor_ids = [row['advisor_id'] for row in redshift.fetch(sql)]
    else:
        sql = resolve_sql_template_string(
            'SELECT DISTINCT advisor_sid FROM {redshift_schema_advisor_internal}.advisor_students',
        )
        advisor_ids = [row['advisor_sid'] for row in redshift.fetch(sql)]
    return _import_calnet_attributes(advisor_ids)
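# Assumption: `_import_calnet_attributes` is the shared helper that the EDL-aware variant above
# delegates to; its body is not shown in this excerpt. A sketch of its likely shape, following the
# pre-EDL implementation of import_advisor_attributes (CalNet lookup, TSV rows, S3 stash). It
# assumes the same module-level names (app, calnet, s3, encoded_tsv_row, get_s3_calnet_daily_path).
def _import_calnet_attributes(advisor_ids):
    attributes = calnet.client(app).search_csids(advisor_ids)
    if len(advisor_ids) != len(attributes):
        missing = set(advisor_ids) - {a['csid'] for a in attributes}
        app.logger.warning(f'Looked for {len(advisor_ids)} advisor CSIDs but only found {len(attributes)}: missing {missing}')
    rows = []
    for a in attributes:
        first_name, last_name = calnet.split_sortable_name(a)
        rows.append(encoded_tsv_row([
            a['uid'], a['csid'], first_name, last_name, a['title'],
            calnet.get_dept_code(a), a['email'], a['campus_email'],
        ]))
    s3_key = f'{get_s3_calnet_daily_path()}/advisors/advisors.tsv'
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    # ...followed by the same TRUNCATE/COPY into {redshift_schema_advisor_internal}.advisor_attributes
    # as in the earlier implementation.
    return f'Advisor attributes imported: {len(rows)}'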
def run(self):
    app.logger.info('Starting BOA manually added advisees import job...')
    feed = get_manually_added_advisees()
    if feed.get('error'):
        raise BackgroundJobError(f"Error from BOA manually-added-advisees API: {feed.get('error')}")
    rows = [advisee['sid'].encode() for advisee in feed.get('feed')]
    s3_key = f'{get_s3_boa_api_daily_path()}/manually-added-advisees/manually-added-advisees.tsv'
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Copying data from S3 file to Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_advisee}.non_current_students;
        COPY {redshift_schema_advisee}.non_current_students
            FROM 's3://{s3_bucket}/{s3_key}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        s3_bucket=app.config['LOCH_S3_BUCKET'],
        s3_key=s3_key,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    status = f'Imported {len(rows)} non-current students.'
    app.logger.info(f'BOA manually added advisees import job completed: {status}')
    return status
def run(self, csids=None):
    if app.config['STUDENT_V1_API_PREFERRED']:
        return self.run_v1(csids)
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows, failure_count = self.load_concurrently(csids)
    if (len(rows) == 0) and (failure_count > 0):
        raise BackgroundJobError('Failed to import SIS student API feeds: aborting job.')
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_profiles'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.sis_api_profiles', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return f'SIS student API import job completed: {len(rows)} succeeded, {failure_count} failed.'
def run(self, sids=None):
    if not sids:
        sids = [row['sid'] for row in get_unfetched_non_advisees()]
    app.logger.info(f'Starting SIS student API import job for {len(sids)} non-advisees...')
    with tempfile.TemporaryFile() as feed_file:
        saved_sids, failure_count = self.load_concurrently(sids, feed_file)
        if saved_sids:
            student_schema.truncate_staging_table('sis_api_profiles_hist_enr')
            student_schema.write_file_to_staging('sis_api_profiles_hist_enr', feed_file, len(saved_sids))
    if saved_sids:
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema_student}.sis_api_profiles_hist_enr
                WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles_hist_enr);
            INSERT INTO {redshift_schema_student}.sis_api_profiles_hist_enr
                (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles_hist_enr);
            TRUNCATE {redshift_schema_student}_staging.sis_api_profiles_hist_enr;
            """,
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return f'SIS student API non-advisee import job completed: {len(saved_sids)} succeeded, {failure_count} failed.'
def student_tables(app):
    """Use Postgres to mock the Redshift student schemas on local test runs."""
    from nessie.externals import rds, redshift
    from nessie.lib.util import resolve_sql_template_string, resolve_sql_template
    rds.execute(resolve_sql_template('create_rds_indexes.template.sql'))
    fixture_path = f"{app.config['BASE_DIR']}/fixtures"
    with open(f'{fixture_path}/students.sql', 'r') as sql_file:
        student_sql = sql_file.read()
    params = {}
    for key in [
        'sis_api_drops_and_midterms_11667051_2178',
        'sis_degree_progress_11667051',
        'sis_student_api_11667051',
        'sis_student_api_2345678901',
    ]:
        with open(f'{fixture_path}/{key}.json', 'r') as f:
            feed = f.read()
            if key.startswith('sis_student_api'):
                feed = json.dumps(json.loads(feed)['apiResponse']['response']['any']['students'][0])
            params[key] = feed
    redshift.execute(resolve_sql_template_string(student_sql), params=params)
    yield
    for schema in ['asc_test', 'coe_test', 'student_test']:
        rds.execute(f'DROP SCHEMA {schema} CASCADE')
        redshift.execute(f'DROP SCHEMA {schema} CASCADE')
def update_merged_feed_status(term_id, successes, failures):
    term_id = term_id or 'all'
    redshift.execute(
        'DELETE FROM {schema}.merged_feed_status WHERE sid = ANY(%s) AND term_id = %s',
        schema=_schema(),
        params=((successes + failures), term_id),
    )
    now = datetime.utcnow().isoformat()
    success_records = ['\t'.join([sid, term_id, 'success', now]) for sid in successes]
    failure_records = ['\t'.join([sid, term_id, 'failure', now]) for sid in failures]
    rows = success_records + failure_records
    s3_key = f'{get_s3_sis_api_daily_path()}/merged_feed_status.tsv'
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error uploading merged feed status updates to S3.')
        return
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_metadata}.merged_feed_status
            FROM '{loch_s3_sis_api_data_path}/merged_feed_status.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t'
            TIMEFORMAT 'YYYY-MM-DDTHH:MI:SS';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error copying merged feed status updates to Redshift.')
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS student API for SID {csid} ({index} of {len(csids)})')
        feed = sis_student_api.get_student(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        else:
            failure_count += 1
            app.logger.error(f'SIS student API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_profiles'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_profiles', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return f'SIS student API import job completed: {success_count} succeeded, {failure_count} failed.'
def update_redshift(self, term_ids, transaction):
    if not transaction.execute(
        f'DELETE FROM {self.destination_schema}.sis_terms WHERE term_id = ANY(%s)',
        params=(term_ids,),
    ):
        return False
    template = """COPY {redshift_schema_sis_internal}.sis_terms
        FROM '{loch_s3_sis_api_data_path}/terms.tsv'
        IAM_ROLE '{redshift_iam_role}'
        DELIMITER '\\t';"""
    if not transaction.execute(resolve_sql_template_string(template)):
        return False
    return True
def unload_enrollment_terms(term_ids):
    query = resolve_sql_template_string(
        """
        UNLOAD ('SELECT *, GETDATE() AS analytics_generated_at
            FROM {schema}.student_enrollment_terms
            WHERE term_id=ANY(\'\'{{{term_ids}}}\'\')')
            TO '{loch_s3_boac_analytics_incremental_path}/student_enrollment_terms'
            IAM_ROLE '{redshift_iam_role}'
            ENCRYPTED
            DELIMITER AS '\\t'
            ALLOWOVERWRITE
            GZIP;
        """,
        schema=student_schema(),
        term_ids=','.join(term_ids),
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift unload: aborting job.')
def upload_to_staging(self, table):
    rows = self.rows[table]
    s3_key = f'{get_s3_sis_api_daily_path()}/staging_{table}.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/staging_{table}.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=self.staging_schema,
        table=table,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
def upload_to_staging(table, rows, term_id=None):
    if term_id:
        tsv_filename = f'staging_{table}_{term_id}.tsv'
    else:
        tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_sis_api_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/{tsv_filename}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=staging_schema(),
        table=table,
        tsv_filename=tsv_filename,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
def upload_file_to_staging(table, term_file, row_count, term_id):
    tsv_filename = f'staging_{table}_{term_id}.tsv' if term_id else f'staging_{table}.tsv'
    s3_key = f'{get_s3_sis_api_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {row_count} feeds in S3: {s3_key}')
    # Be kind; rewind
    term_file.seek(0)
    if not s3.upload_data(term_file, s3_key):
        raise BackgroundJobError(f'Failed upload {row_count} records to s3:{s3_key}. Aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/{tsv_filename}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=staging_schema(),
        table=table,
        tsv_filename=tsv_filename,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
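# Illustrative only: how a caller with an in-memory list of encoded TSV rows might hand them to
# upload_file_to_staging above via a temporary file (the non-advisee profile import uses the same
# tempfile pattern). The function name stage_rows_via_tempfile is hypothetical.
import tempfile

def stage_rows_via_tempfile(table, rows, term_id=None):
    with tempfile.TemporaryFile() as term_file:
        for row in rows:
            term_file.write(row + b'\n')
        # upload_file_to_staging rewinds the file before streaming it to S3.
        upload_file_to_staging(table, term_file, row_count=len(rows), term_id=term_id)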
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1
    # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
    # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
    # alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_degree_progress'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_degree_progress', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )
def run(self, csids=None):
    if not csids:
        all_sids = get_all_student_ids()
        if all_sids:
            csids = [row['sid'] for row in all_sids]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1
    # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
    # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
    # alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_degree_progress'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    query = resolve_sql_template_string(
        """
        CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
            DATABASE '{redshift_schema_student}_staging_ext_tmp'
            IAM_ROLE '{redshift_iam_role}'
            CREATE EXTERNAL DATABASE IF NOT EXISTS;
        CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress (
            sid VARCHAR,
            feed VARCHAR(MAX)
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\t'
        STORED AS TEXTFILE
        LOCATION '{loch_s3_sis_api_data_path}/degree_progress';
        DELETE FROM {redshift_schema_student}_staging.sis_api_degree_progress
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}_staging.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
        DROP TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress;
        DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute_ddl_script(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    redshift.execute('VACUUM; ANALYZE;')
    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )
def run(self, term_id=None):
    if not term_id:
        term_id = current_term_id()
    canvas_course_ids = [row['canvas_course_id'] for row in get_enrolled_canvas_sites_for_term(term_id)]
    app.logger.info(f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for course_id in canvas_course_ids:
        app.logger.info(
            f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})',
        )
        feed = canvas_api.get_course_enrollments(course_id)
        if feed:
            success_count += 1
            for enrollment in feed:
                user_id = str(enrollment.get('user_id'))
                last_activity_at = str(enrollment.get('last_activity_at') or '')
                rows.append('\t'.join([str(course_id), user_id, str(term_id), last_activity_at, json.dumps(enrollment)]))
        else:
            failure_count += 1
            app.logger.error(f'Canvas enrollments API import failed for course id {course_id}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        COPY {redshift_schema_student}_staging.canvas_api_enrollments
            FROM '{loch_s3_sis_api_data_path}/canvas_api_enrollments_{term_id}.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t'
            TIMEFORMAT 'YYYY-MM-DDTHH:MI:SSZ';
        DELETE FROM {redshift_schema_student}.canvas_api_enrollments
            WHERE term_id = '{term_id}'
            AND course_id IN
            (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{failure_count} failed.'
    )
def run(self):
    app.logger.info('Starting ASC profile generation job...')
    asc_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER by sid, UPPER(team_name)',
        schema=asc_schema_identifier,
    )
    profile_rows = []
    sids_for_inactive_deletion = []
    for sid, rows_for_student in groupby(asc_rows, operator.itemgetter('sid')):
        rows_for_student = list(rows_for_student)
        # Since BOAC believes (falsely) that isActiveAsc and statusAsc are attributes of a student, not
        # a team membership, a bit of brutal simplification is needed. Students who are active in at least
        # one sport have inactive team memberships dropped.
        any_active_athletics = reduce(operator.or_, [r['active'] for r in rows_for_student], False)
        if any_active_athletics:
            rows_for_student = [r for r in rows_for_student if r['active']]
            sids_for_inactive_deletion.append(sid)
        athletics_profile = {
            'athletics': [],
            'inIntensiveCohort': rows_for_student[0]['intensive'],
            'isActiveAsc': rows_for_student[0]['active'],
            'statusAsc': rows_for_student[0]['status_asc'],
        }
        for row in rows_for_student:
            athletics_profile['athletics'].append({
                'groupCode': row['group_code'],
                'groupName': row['group_name'],
                'name': row['group_name'],
                'teamCode': row['team_code'],
                'teamName': row['team_name'],
            })
        profile_rows.append(encoded_tsv_row([sid, json.dumps(athletics_profile)]))
    s3_key = f'{get_s3_asc_daily_path()}/athletics_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_asc}.student_profiles;
        COPY {redshift_schema_asc}.student_profiles
            FROM '{loch_s3_asc_data_path}/athletics_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(asc_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    if sids_for_inactive_deletion:
        redshift.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion,),
        )
        rds.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion,),
        )
    return 'ASC profile generation complete.'
def run(self):
    app.logger.info('Starting COE schema creation job...')
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
    # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
    # ought to set up proper staging in conjunction with verification. It's also possible that a persistent
    # external schema isn't needed.
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('COE external schema created.')
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('COE external schema creation failed.')
    coe_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER by sid',
        schema=internal_schema_identifier,
    )
    profile_rows = []
    index = 1
    for sid, rows_for_student in groupby(coe_rows, operator.itemgetter('sid')):
        app.logger.info(f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})')
        index += 1
        row_for_student = list(rows_for_student)[0]
        coe_profile = {
            'advisorUid': row_for_student.get('advisor_ldap_uid'),
            'gender': row_for_student.get('gender'),
            'ethnicity': row_for_student.get('ethnicity'),
            'minority': row_for_student.get('minority'),
            'didPrep': row_for_student.get('did_prep'),
            'prepEligible': row_for_student.get('prep_eligible'),
            'didTprep': row_for_student.get('did_tprep'),
            'tprepEligible': row_for_student.get('tprep_eligible'),
            'sat1read': row_for_student.get('sat1read'),
            'sat1math': row_for_student.get('sat1math'),
            'sat2math': row_for_student.get('sat2math'),
            'inMet': row_for_student.get('in_met'),
            'gradTerm': row_for_student.get('grad_term'),
            'gradYear': row_for_student.get('grad_year'),
            'probation': row_for_student.get('probation'),
            'status': row_for_student.get('status'),
        }
        profile_rows.append(encoded_tsv_row([sid, json.dumps(coe_profile)]))
    s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_coe}.student_profiles
            FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(coe_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    return 'COE internal schema created.'
def run(self, csids=None, term_id=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    if not term_id:
        term_id = current_term_id()
    app.logger.info(f'Starting SIS enrollments API import job for term {term_id}, {len(csids)} students...')
    rows = []
    success_count = 0
    no_enrollments_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS enrollments API for SID {csid}, term {term_id} ({index} of {len(csids)})')
        feed = sis_enrollments_api.get_drops_and_midterms(csid, term_id)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), str(term_id), json.dumps(feed)]))
        elif feed is False:
            app.logger.info(f'SID {csid} returned no enrollments for term {term_id}.')
            no_enrollments_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS enrollments API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/drops_and_midterms_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(
        f"DELETE FROM {self.destination_schema}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}'",
    ):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_drops_and_midterms', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_drops_and_midterms
            WHERE term_id = '{term_id}'
            AND sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.sis_api_drops_and_midterms
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{no_enrollments_count} returned no enrollments, {failure_count} failed.'
    )
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting term GPA import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_registrations_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching term GPAs for SID {csid}, ({index} of {len(csids)})')
        feed = sis_student_api.get_term_gpas(csid)
        if feed:
            success_count += 1
            for term_id, term_data in feed.items():
                rows.append(
                    '\t'.join([
                        str(csid),
                        str(term_id),
                        str(term_data.get('gpa') or '0'),
                        str(term_data.get('unitsTakenForGpa') or '0'),
                    ]),
                )
        elif feed == {}:
            app.logger.info(f'No registrations found for SID {csid}.')
            no_registrations_count += 1
        else:
            failure_count += 1
            app.logger.error(f'Term GPA import failed for SID {csid}.')
        index += 1
    if success_count == 0:
        app.logger.error('Failed to import term GPAs: aborting job.')
        return False
    s3_key = f'{get_s3_sis_api_daily_path()}/term_gpas.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.student_term_gpas'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.student_term_gpas', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string("""
        DELETE FROM {redshift_schema_student}.student_term_gpas
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.student_term_gpas);
        INSERT INTO {redshift_schema_student}.student_term_gpas
            (SELECT * FROM {redshift_schema_student}_staging.student_term_gpas);
        TRUNCATE TABLE {redshift_schema_student}_staging.student_term_gpas;
    """)
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error inserting staging entries into destination: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(csids, rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            app.logger.error('Failed to refresh RDS indexes.')
            return False
    return (
        f'Term GPA import completed: {success_count} succeeded, '
        f'{no_registrations_count} returned no registrations, {failure_count} failed.'
    )
def run(self):
    app.logger.info('ASC import: Fetch team and student athlete data from ASC API')
    api_results = get_asc_feed()
    if 'error' in api_results:
        raise BackgroundJobError('ASC import: Error from external API: {}'.format(api_results['error']))
    elif not api_results:
        raise BackgroundJobError('ASC import: API returned zero students')
    sync_date = api_results[0]['SyncDate']
    if sync_date != api_results[-1]['SyncDate']:
        raise BackgroundJobError(f'ASC import: SyncDate conflict in ASC API: {api_results[0]} vs. {api_results[-1]}')
    rows = []
    for r in api_results:
        if r['AcadYr'] == app.config['ASC_THIS_ACAD_YR'] and r['SportCode']:
            asc_code = r['SportCodeCore']
            if asc_code in SPORT_TRANSLATIONS:
                group_code = r['SportCode']
                data = [
                    r['SID'],
                    str(r.get('ActiveYN', 'No') == 'Yes'),
                    str(r.get('IntensiveYN', 'No') == 'Yes'),
                    r.get('SportStatus', ''),
                    group_code,
                    _unambiguous_group_name(r['Sport'], group_code),
                    SPORT_TRANSLATIONS[asc_code],
                    r['SportCore'],
                ]
                rows.append(encoded_tsv_row(data))
            else:
                sid = r['SID']
                app.logger.error(f'ASC import: Unmapped asc_code {asc_code} has ActiveYN for sid={sid}')
    s3_key = f'{get_s3_asc_daily_path()}/asc_api_raw_response_{sync_date}.tsv'
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Copy data in S3 file to Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_asc}.students;
        COPY {redshift_schema_asc}.students
            FROM 's3://{s3_bucket}/{s3_key}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        s3_bucket=app.config['LOCH_S3_BUCKET'],
        s3_key=s3_key,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    status = {
        'this_sync_date': sync_date,
        'api_results_count': len(api_results),
    }
    app.logger.info(f'ASC import: Successfully completed import job: {str(status)}')
    return status
def run(self, load_mode='new'):
    all_sids = [row['sid'] for row in get_all_student_ids()]
    previous_backfills = {row['sid'] for row in get_sids_with_registration_imports()}
    if load_mode == 'new':
        sids = list(set(all_sids).difference(previous_backfills))
    elif load_mode == 'batch':
        new_sids = list(set(all_sids).difference(previous_backfills))
        limit = app.config['CYCLICAL_API_IMPORT_BATCH_SIZE'] - len(new_sids)
        if limit > 0:
            oldest_backfills = [row['sid'] for row in get_active_sids_with_oldest_registration_imports(limit=limit)]
            sids = new_sids + oldest_backfills
        else:
            sids = new_sids
    elif load_mode == 'all':
        sids = all_sids
    app.logger.info(f'Starting registrations/demographics import job for {len(sids)} students...')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
        'api_demographics': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids)
    if load_mode != 'new' and (len(successes) == 0) and (len(failures) > 0):
        raise BackgroundJobError('Failed to import registration histories: aborting job.')
    for key in rows.keys():
        s3_key = f'{get_s3_sis_api_daily_path(use_edl_if_feature_flag=True)}/{key}.tsv'
        app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows[key], s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')
        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(f'TRUNCATE {student_schema()}_staging.student_{key}'):
            raise BackgroundJobError('Error truncating old staging rows: aborting job.')
        if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.student_{key}', s3_key):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {student_schema}.student_{table_key}
                WHERE sid IN (SELECT sid FROM {student_schema}_staging.student_{table_key});
            INSERT INTO {student_schema}.student_{table_key}
                (SELECT * FROM {student_schema}_staging.student_{table_key});
            TRUNCATE TABLE {student_schema}_staging.student_{table_key};
            """,
            table_key=key,
            student_schema=student_schema(),
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(sids, rows['term_gpas'], transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Failed to refresh RDS indexes.')
    update_registration_import_status(successes, failures)
    return f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
def run(self, term_id=None):
    if not term_id:
        term_id = current_term_id()
    canvas_course_ids = [row['canvas_course_id'] for row in get_enrolled_canvas_sites_for_term(term_id)]
    app.logger.info(f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for course_id in canvas_course_ids:
        app.logger.info(
            f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})',
        )
        feed = canvas_api.get_course_enrollments(course_id)
        if feed:
            success_count += 1
            for enrollment in feed:
                user_id = enrollment.get('user_id')
                last_activity_at = enrollment.get('last_activity_at') or ''
                rows.append(encoded_tsv_row([course_id, user_id, term_id, last_activity_at, json.dumps(enrollment)]))
        else:
            failure_count += 1
            app.logger.error(f'Canvas enrollments API import failed for course id {course_id}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments/canvas_api_enrollments_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
            DATABASE '{redshift_schema_student}_staging_ext_tmp'
            IAM_ROLE '{redshift_iam_role}'
            CREATE EXTERNAL DATABASE IF NOT EXISTS;
        CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments (
            course_id VARCHAR,
            user_id VARCHAR,
            term_id VARCHAR,
            last_activity_at TIMESTAMP,
            feed VARCHAR
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\t'
        STORED AS TEXTFILE
        LOCATION '{loch_s3_sis_api_data_path}/canvas_api_enrollments';
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        INSERT INTO {redshift_schema_student}_staging.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments);
        DROP TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments;
        DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;
        DELETE FROM {redshift_schema_student}.canvas_api_enrollments
            WHERE term_id = '{term_id}'
            AND course_id IN
            (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute_ddl_script(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return (
        f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{failure_count} failed.'
    )
def run(self, load_mode='batch'):
    new_sids = [row['sid'] for row in get_non_advisees_without_registration_imports()]
    # The size of the non-advisee population makes it unlikely that a one-shot load of all these slow feeds will
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        if max_batch >= len(new_sids):
            sids = new_sids
        else:
            sids = new_sids[0:max_batch]
    app.logger.info(f'Starting registrations import job for {len(sids)} non-advisees...')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.load_concurrently(rows, sids)
    if len(successes) > 0:
        for key in rows.keys():
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError('Error on S3 upload: aborting job.')
            app.logger.info('Will copy S3 feeds into Redshift...')
            if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.hist_enr_{key}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')
            if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.hist_enr_{key}', s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {redshift_schema_student}.hist_enr_{table_key}
                    WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                INSERT INTO {redshift_schema_student}.hist_enr_{table_key}
                    (SELECT * FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {redshift_schema_student}_staging.hist_enr_{table_key};
                """,
                table_key=key,
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    return f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
def run(self, load_mode='batch'):
    new_sids = [row['sid'] for row in get_non_advisees_without_registration_imports()]
    # Owing to the size of the non-advisee population, a one-shot load of all these slow feeds may not
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    #
    # (With the logic change in NS-1155 to pre-screen SIDs for student affiliation in CalNet, the cutoff is less
    # likely to be triggered.)
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        if max_batch >= len(new_sids):
            sids = new_sids
        else:
            sids = new_sids[0:max_batch]
    app.logger.info(f'Starting import of historical registration data for {len(sids)} students...')
    redshift.execute('VACUUM; ANALYZE;')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids, include_demographics=False)
    for key in rows.keys():
        if len(rows[key]) > 0:
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(f'Upload {key} data to s3:{s3_key}. The file represents {len(rows[key])} students.')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError(f'Error during S3 upload: {s3_key}. Aborting job.')
            staging_table = f'{student_schema()}_staging.hist_enr_{key}'
            if not redshift.execute(f'TRUNCATE {staging_table}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')
            app.logger.info(f'Populate {staging_table} (Redshift table) with s3:{s3_key}')
            if not redshift.copy_tsv_from_s3(staging_table, s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')
            app.logger.info(f'Insert student data into {student_schema()}.hist_enr_{key}')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {student_schema}.hist_enr_{table_key}
                    WHERE sid IN (SELECT sid FROM {student_schema}_staging.hist_enr_{table_key});
                INSERT INTO {student_schema}.hist_enr_{table_key}
                    (SELECT * FROM {student_schema}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {student_schema}_staging.hist_enr_{table_key};
                """,
                table_key=key,
                student_schema=student_schema(),
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    redshift.execute('VACUUM; ANALYZE;')
    return f'Finished import of historical registration data: {len(successes)} successes and {len(failures)} failures.'