def generate_student_profile_table(self, non_advisee_sids):
    profile_count = 0
    with tempfile.TemporaryFile() as feed_file, tempfile.TemporaryFile() as index_file, tempfile.TemporaryFile() as names_file:
        tables = {
            'student_profiles_hist_enr': feed_file,
            'student_profile_index_hist_enr': index_file,
            'student_names_hist_enr': names_file,
        }
        # Work in batches so as not to overload memory.
        for i in range(0, len(non_advisee_sids), BATCH_QUERY_MAXIMUM):
            sids = non_advisee_sids[i:i + BATCH_QUERY_MAXIMUM]
            profile_count += self.collect_merged_profiles(sids, feed_file, index_file, names_file)
        if profile_count:
            with redshift.transaction() as transaction:
                for table_name, data in tables.items():
                    student_schema.truncate_staging_table(table_name)
                    student_schema.write_file_to_staging(table_name, data, profile_count)
                    student_schema.refresh_from_staging(
                        table_name,
                        None,
                        non_advisee_sids,
                        transaction,
                    )
    app.logger.info('Non-advisee profile generation complete.')
    return profile_count
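
# For context only: a minimal sketch of what a staging writer like
# student_schema.write_file_to_staging could look like, assuming the common pattern of
# stashing the temp file in S3 and COPYing it into the Redshift staging table. The
# s3.upload_file and redshift.copy_tsv_from_s3 helpers and the staging_schema() call are
# assumptions for illustration, not the project's confirmed API.
def write_file_to_staging_sketch(table, _file, row_count, term_id=None):
    suffix = f'_{term_id}' if term_id else ''
    s3_key = f'{get_s3_sis_api_daily_path()}/staging_{table}{suffix}.tsv'
    app.logger.info(f'Will stash {row_count} rows in S3: {s3_key}')
    _file.seek(0)  # Rewind the temp file before streaming it to S3.
    if not s3.upload_file(_file, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    if not redshift.copy_tsv_from_s3(f'{staging_schema()}.{table}', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')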
def refresh_all_from_staging(tables):
    with redshift.transaction() as transaction:
        for table in tables:
            refresh_from_staging(table, None, None, transaction)
        if not transaction.commit():
            raise BackgroundJobError(f'Final transaction commit failed for {student_schema()}.')
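
# For context only: refresh_from_staging presumably runs a delete-then-insert inside the
# caller's transaction, scoped by term and/or sids when given. This is a hedged sketch of
# that pattern; the transaction.execute signature and the schema helpers are assumptions.
def refresh_from_staging_sketch(table, term_id, sids, transaction):
    conditions = []
    if term_id:
        conditions.append(f"term_id = '{term_id}'")
    if sids:
        quoted_sids = ', '.join([f"'{sid}'" for sid in sids])
        conditions.append(f'sid IN ({quoted_sids})')
    where_clause = f" WHERE {' AND '.join(conditions)}" if conditions else ''
    # Delete only the rows about to be replaced, then copy in the freshly staged rows.
    transaction.execute(f'DELETE FROM {student_schema()}.{table}{where_clause}')
    transaction.execute(f'INSERT INTO {student_schema()}.{table} (SELECT * FROM {staging_schema()}.{table})')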
def generate_student_enrollments_table(self, non_advisee_sids):
    # Split all S3/Redshift operations by term in hope of not overloading memory or other resources.
    # (Using finer-grained batches of SIDs would probably involve replacing the staging table by a Spectrum
    # external table; see the sketch after this function.)
    total_count = 0
    table_name = 'student_enrollment_terms_hist_enr'
    student_schema.truncate_staging_table(table_name)
    for term_id in reverse_term_ids(include_future_terms=True, include_legacy_terms=True):
        with tempfile.TemporaryFile() as feed_file:
            term_count = self.collect_merged_enrollments(non_advisee_sids, term_id, feed_file)
            if term_count:
                student_schema.write_file_to_staging(
                    table_name,
                    feed_file,
                    term_count,
                    term_id,
                )
        if term_count:
            with redshift.transaction() as transaction:
                student_schema.refresh_from_staging(
                    table_name,
                    term_id,
                    non_advisee_sids,
                    transaction,
                )
            total_count += term_count
    app.logger.info('Non-advisee term enrollment generation complete.')
    return total_count
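
# For reference only: the Spectrum alternative mentioned in the comment above would mean
# pointing an external table at the staged TSVs in S3 instead of COPYing them into Redshift.
# Roughly standard Redshift Spectrum DDL; the external schema name, column list, and S3
# location below are placeholders, not the project's actual layout.
def create_spectrum_staging_table_sketch():
    redshift.execute("""
        CREATE EXTERNAL TABLE student_staging_ext.student_enrollment_terms_hist_enr (
            sid VARCHAR(80),
            term_id VARCHAR(4),
            enrollment_term VARCHAR(65535)
        )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t'
        STORED AS TEXTFILE
        LOCATION 's3://example-bucket/staging/student_enrollment_terms_hist_enr/'
    """)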
def generate_student_enrollments_table(self, non_advisee_sids):
    table_name = 'student_enrollment_terms_hist_enr'
    truncate_staging_table(table_name)
    with tempfile.TemporaryFile() as feed_file:
        row_count = self.generate_term_feeds(non_advisee_sids, feed_file)
        if row_count:
            write_file_to_staging(table_name, feed_file, row_count)
            with redshift.transaction() as transaction:
                refresh_from_staging(
                    table_name,
                    term_id=None,
                    sids=non_advisee_sids,
                    transaction=transaction,
                )
    app.logger.info('Non-advisee term enrollment generation complete.')
    return row_count
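
# For context only: truncate_staging_table is presumably a thin wrapper over TRUNCATE,
# along the lines of this sketch; the staging_schema() helper is an assumption.
def truncate_staging_table_sketch(table):
    # Clear any leftover rows from a previous run before writing a fresh batch.
    redshift.execute(f'TRUNCATE {staging_schema()}.{table}')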
def refresh_student_enrollment_term(self, term_id, enrollment_term_map):
    with tempfile.TemporaryFile() as enrollment_term_file:
        for (sid, sid_term_feed) in enrollment_term_map.items():
            enrollment_term_file.write(encoded_tsv_row([sid, term_id, json.dumps(sid_term_feed)]) + b'\n')
        drop_staged_enrollment_term(term_id)
        write_file_to_staging('student_enrollment_terms', enrollment_term_file, len(enrollment_term_map), term_id)
    with redshift.transaction() as transaction:
        refresh_from_staging('student_enrollment_terms', term_id, None, transaction, truncate_staging=False)
        if not transaction.commit():
            raise BackgroundJobError(f'Final transaction commit failed on enrollment term refresh (term_id={term_id}).')
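
# For context only: encoded_tsv_row presumably renders a list of values as a single UTF-8
# encoded, tab-separated line, roughly as below. A minimal sketch; the real helper may also
# escape embedded tabs and newlines.
def encoded_tsv_row_sketch(elements):
    return '\t'.join(['' if e is None else str(e) for e in elements]).encode()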
def run(self, term_ids=None):
    if not term_ids:
        term_ids = reverse_term_ids()
    app.logger.info(f'Starting SIS terms API import job for {len(term_ids)} terms...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for term_id in term_ids:
        app.logger.info(f'Fetching SIS terms API for term id {term_id} ({index} of {len(term_ids)})')
        feed = sis_terms_api.get_term(term_id)
        if feed:
            success_count += 1
            for academic_career_term in feed:
                for session in academic_career_term.get('sessions', []):
                    rows.append(
                        '\t'.join([
                            academic_career_term.get('id', ''),
                            academic_career_term.get('name', ''),
                            academic_career_term.get('academicCareer', {}).get('code', ''),
                            academic_career_term.get('beginDate', ''),
                            academic_career_term.get('endDate', ''),
                            session.get('id', ''),
                            session.get('name', ''),
                            session.get('beginDate', ''),
                            session.get('endDate', ''),
                        ]),
                    )
        else:
            failure_count += 1
            app.logger.error(f'SIS terms API import failed for term id {term_id}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/terms.tsv'
    app.logger.info(f'Will stash {len(rows)} rows from {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    with redshift.transaction() as transaction:
        if self.update_redshift(term_ids, transaction):
            transaction.commit()
            app.logger.info('Updated Redshift.')
        else:
            transaction.rollback()
            app.logger.error('Failed to update Redshift.')
            return False
    with rds.transaction() as transaction:
        if self.update_rds(rows, term_ids, transaction):
            transaction.commit()
            app.logger.info('Updated RDS.')
        else:
            transaction.rollback()
            app.logger.error('Failed to update RDS.')
            return False
    return f'SIS terms API import job completed: {success_count} succeeded, {failure_count} failed.'
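
# For reference: each line of terms.tsv built above carries nine tab-separated fields, in
# order: term id, term name, academic career code, term begin date, term end date,
# session id, session name, session begin date, session end date. (Descriptive labels
# only; the Redshift column names may differ.)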
def generate_feeds(self, term_id=None, sids=None):
    """Loop through all records stored in the Calnet external schema and write merged student data to the internal student schema."""
    calnet_profiles = self.fetch_calnet_profiles(sids)
    # Jobs targeted toward a specific sid set (such as backfills) may return no CalNet profiles. Warn, don't error.
    if not calnet_profiles:
        app.logger.warning(f'No CalNet profiles returned, aborting job. (sids={sids})')
        return False
    # Jobs for non-current terms generate enrollment feeds only.
    if term_id and term_id != berkeley.current_term_id():
        tables = ['student_enrollment_terms']
    else:
        tables = [
            'student_profiles',
            'student_academic_status',
            'student_majors',
            'student_enrollment_terms',
            'student_holds',
        ]
    # In-memory storage for generated feeds prior to TSV output.
    self.rows = {
        'student_profiles': [],
        'student_academic_status': [],
        'student_majors': [],
        'student_enrollment_terms': [],
        'student_holds': [],
    }
    # Track the results of course-level queries to avoid requerying.
    self.canvas_site_map = {}
    # Remove any old data from staging tables.
    for table in tables:
        redshift.execute(
            'TRUNCATE {schema}.{table}',
            schema=self.staging_schema_identifier,
            table=psycopg2.sql.Identifier(table),
        )
    app.logger.info(f'Will generate feeds for {len(calnet_profiles)} students (term_id={term_id}).')
    successes = []
    failures = []
    index = 1
    for sid, profile_group in groupby(calnet_profiles, operator.itemgetter('sid')):
        app.logger.info(f'Generating feeds for sid {sid} ({index} of {len(calnet_profiles)})')
        index += 1
        merged_profile = self.generate_or_fetch_merged_profile(term_id, sid, list(profile_group)[0])
        if merged_profile:
            self.generate_merged_enrollment_terms(merged_profile, term_id)
            self.parse_holds(sid)
            successes.append(sid)
        else:
            failures.append(sid)
    for table in tables:
        if not self.rows[table]:
            continue
        self.upload_to_staging(table)
        if not self.verify_table(table):
            return False
    with redshift.transaction() as transaction:
        for table in tables:
            if not self.refresh_from_staging(table, term_id, sids, transaction):
                app.logger.error(f'Failed to refresh {self.destination_schema}.{table} from staging.')
                return False
        if not transaction.commit():
            app.logger.error(f'Final transaction commit failed for {self.destination_schema}.')
            return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(sids, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            app.logger.error('Failed to refresh RDS indexes.')
            return False
    update_merged_feed_status(term_id, successes, failures)
    app.logger.info('Updated merged feed status.')
    return f'Merged profile generation complete: {len(successes)} successes, {len(failures)} failures.'
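
# For context only: a hedged sketch of what verify_table might check after upload_to_staging,
# assuming it compares the staging table's row count against the rows generated in memory.
# The redshift.fetch call and the plain staging_schema attribute are assumptions for
# illustration, not the project's confirmed API.
def verify_table_sketch(self, table):
    result = redshift.fetch(f'SELECT COUNT(*) AS count FROM {self.staging_schema}.{table}')
    if result and result[0]['count'] == len(self.rows[table]):
        return True
    app.logger.error(f'Row count mismatch on staging table {table}; aborting job.')
    return False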