def create_schema(self):
    base_s3_key = app.config['LOCH_S3_E_I_DATA_PATH']
    external_schema = app.config['REDSHIFT_SCHEMA_E_I_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    # Flatten E&I-sourced JSON files into two schema-friendly JSON files.
    notes = []
    topics = []
    for key in s3.get_keys_with_prefix(base_s3_key):
        if key.endswith('.json'):
            notes_json = s3.get_object_json(key)
            if notes_json and 'notes' in notes_json:
                notes += notes_json['notes']
    for note in notes:
        topics += _extract_topics(note)
    if s3.upload_json(obj=notes, s3_key=f'{base_s3_key}/aggregated_notes/data.json') \
            and s3.upload_json(obj=topics, s3_key=f'{base_s3_key}/aggregated_topics/data.json'):
        # Create schema
        app.logger.info('Executing SQL...')
        resolved_ddl = resolve_sql_template('create_e_i_advising_notes_schema.template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('E&I Advising Notes schema creation job failed.')
    else:
        raise BackgroundJobError('Failed to upload aggregated E&I advising notes and topics.')
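# The aggregation loop above calls a module-level _extract_topics helper that is not
# shown. A minimal sketch, assuming each note dict carries an 'id' plus an optional
# 'topics' list (field names are assumptions, not confirmed by the source):
def _extract_topics(note):
    # Pair each topic with its parent note id so topics can later be joined back to notes.
    return [{'id': note.get('id'), 'topic': topic} for topic in (note.get('topics') or [])]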
def run(self):
    app.logger.info('Starting Canvas schema creation job...')
    canvas_path = get_s3_canvas_daily_path()
    if not s3.get_keys_with_prefix(canvas_path):
        canvas_path = get_s3_canvas_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(canvas_path):
            raise BackgroundJobError('No timely Canvas data found, aborting')
        else:
            app.logger.info("Falling back to yesterday's Canvas data")
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
    s3_canvas_data_url = s3_prefix + canvas_path
    s3_canvas_data_path_current_term = s3_prefix + berkeley.s3_canvas_data_path_current_term()
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_canvas_schema.template.sql',
        loch_s3_canvas_data_path_today=s3_canvas_data_url,
        loch_s3_canvas_data_path_current_term=s3_canvas_data_path_current_term,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Canvas schema creation job completed.'
    else:
        raise BackgroundJobError('Canvas schema creation job failed.')
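# The fallback above assumes get_s3_canvas_daily_path accepts an optional datetime and
# defaults to today. A minimal sketch; the config key and date format are assumptions:
def get_s3_canvas_daily_path(cutoff=None):
    when = cutoff or datetime.now()
    return app.config['LOCH_S3_CANVAS_DATA_PATH_DAILY'] + '/' + when.strftime('%Y/%m/%d')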
def create_internal_schema(self, external_schema, daily_path):
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    s3_data_url = f's3://{bucket}/{daily_path}/advising-notes'
    resolved_ddl = resolve_sql_template(
        'create_sis_advising_notes_schema.template.sql',
        loch_s3_sis_notes_path_today=s3_data_url,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('SIS Advising Notes schema creation job failed to load incremental data and create internal schema.')
def create_historical_tables(self, external_schema):
    resolved_ddl = resolve_sql_template('create_sis_advising_notes_historical_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('SIS Advising Notes schema creation job failed to load historical data.')
def create_schema(self):
    app.logger.info('Executing SQL...')
    # NOTE: external_schema was referenced but never assigned in the original; the
    # config key below is an assumption, following the pattern of the other jobs.
    external_schema = app.config['REDSHIFT_SCHEMA_SIS_TERMS']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_sis_terms_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('SIS terms schema creation job failed.')
def run(self):
    app.logger.info('Starting EDW schema creation job...')
    app.logger.info('Executing SQL...')
    external_schema = app.config['REDSHIFT_SCHEMA_EDW']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_edw_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'EDW schema creation job completed.'
    else:
        raise BackgroundJobError('EDW schema creation job failed.')
def create_schema(self):
    external_schema = app.config['REDSHIFT_SCHEMA_DATA_SCIENCE_ADVISING']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_data_science_advising_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('Data Science Advising schema creation job failed.')
def create_schema(self):
    external_schema = app.config['REDSHIFT_SCHEMA_ADVISOR']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_advisor_schema.template.sql',
        advisor_data_path=self.s3_path(),
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('Advisor schema creation job failed.')
def create_external_schema(self):
    if not self.feature_flag_edl:
        # External schema is necessary when pulling term definitions from S3.
        redshift.drop_external_schema(self.redshift_schema)
        sql_template = 'create_terms_schema.template.sql'
        app.logger.info(f'Executing {sql_template}...')
        resolved_ddl = resolve_sql_template(sql_template)
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(self.redshift_schema, resolved_ddl)
        else:
            raise BackgroundJobError('SIS terms schema creation job failed.')
def run(self):
    app.logger.info('Starting Gradescope schema creation job...')
    app.logger.info('Executing SQL...')
    # TODO: Add ingest-frequency and dynamic path-creation logic once ingest mechanisms
    # are confirmed. For now, add a top-level hierarchy to catalog the bulk snapshot.
    external_schema = app.config['REDSHIFT_SCHEMA_GRADESCOPE']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_gradescope_schema_template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Gradescope schema creation job completed.'
    else:
        raise BackgroundJobError('Gradescope schema creation job failed.')
def run(self):
    app.logger.info('Starting SIS schema creation job...')
    if not self.update_manifests():
        app.logger.info('Error updating manifests, will not execute schema creation SQL')
        return False
    app.logger.info('Executing SQL...')
    external_schema = app.config['REDSHIFT_SCHEMA_SIS']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_sis_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('SIS schema creation job failed.')
    return 'SIS schema creation job completed.'
def run(self):
    app.logger.info('Starting OUA Slate schema creation job...')
    app.logger.info('Executing SQL...')
    self.migrate_oua_sftp_data()
    external_schema = app.config['REDSHIFT_SCHEMA_OUA']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_oua_schema_template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        self.create_rds_tables_and_indexes()
        app.logger.info('OUA Slate RDS indexes created.')
        return 'OUA schema creation job completed.'
    else:
        raise BackgroundJobError('OUA Slate schema creation job failed.')
def create_schema(self):
    external_schema = app.config['REDSHIFT_SCHEMA_ASC_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    asc_data_sftp_path = '/'.join([
        f"s3://{app.config['LOCH_S3_BUCKET']}",
        app.config['LOCH_S3_ASC_DATA_SFTP_PATH'],
    ])
    resolved_ddl = resolve_sql_template(
        'create_asc_advising_notes_schema.template.sql',
        asc_data_sftp_path=asc_data_sftp_path,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('ASC Advising Notes schema creation job failed.')
def create_schema(self):
    app.logger.info('Executing SQL...')
    redshift.drop_external_schema(self.external_schema)
    s3_sis_daily = get_s3_sis_sysadm_daily_path()
    if not s3.get_keys_with_prefix(s3_sis_daily):
        s3_sis_daily = _get_yesterdays_advisor_data()
    s3_path = '/'.join([f"s3://{app.config['LOCH_S3_BUCKET']}", s3_sis_daily, 'advisors'])
    sql_filename = 'edl_create_advisor_schema.template.sql' if self.feature_flag_edl else 'create_advisor_schema.template.sql'
    resolved_ddl = resolve_sql_template(sql_filename, advisor_data_path=s3_path)
    if not redshift.execute_ddl_script(resolved_ddl):
        raise BackgroundJobError(f'Redshift execute_ddl_script failed on {sql_filename}')
    verify_external_schema(self.external_schema, resolved_ddl)
    app.logger.info('Redshift schema created.')
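# The fallback above calls a module-level _get_yesterdays_advisor_data helper that is
# not shown. A minimal sketch, assuming get_s3_sis_sysadm_daily_path accepts a datetime
# (mirroring the Canvas job's daily-path fallback) and that the job should abort when
# yesterday's snapshot is also missing:
def _get_yesterdays_advisor_data():
    s3_sis_daily = get_s3_sis_sysadm_daily_path(datetime.now() - timedelta(days=1))
    if not s3.get_keys_with_prefix(s3_sis_daily):
        raise BackgroundJobError('No timely SIS advisor data found in S3, aborting')
    app.logger.info("Falling back to yesterday's SIS sysadm advisor data")
    return s3_sis_daily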
def run(self):
    app.logger.info('Starting Canvas API schema creation job...')
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS_API']
    s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
    s3_canvas_api_data_url = s3_prefix + get_s3_canvas_api_path(transformed=True)
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_canvas_api_schema.template.sql',
        loch_s3_canvas_api_data_path=s3_canvas_api_data_url,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Canvas API schema creation job completed.'
    else:
        raise BackgroundJobError('Canvas API schema creation job failed.')
def run(self):
    app.logger.info('Starting YCBM schema creation job...')
    external_schema = app.config['REDSHIFT_SCHEMA_YCBM']
    redshift.drop_external_schema(external_schema)
    sis_source_schema = app.config['REDSHIFT_SCHEMA_EDL'] if app.config['FEATURE_FLAG_EDL_SIS_VIEWS'] else app.config['REDSHIFT_SCHEMA_SIS']
    resolved_ddl = resolve_sql_template(
        'create_ycbm_schema.template.sql',
        redshift_schema_sis=sis_source_schema,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'YCBM schema creation job completed.'
    else:
        raise BackgroundJobError('YCBM schema creation job failed.')
def run(self):
    app.logger.info('Starting SIS schema creation job...')
    if not self.update_manifests():
        app.logger.info('Error updating manifests, will not execute schema creation SQL')
        return False
    app.logger.info('Executing SQL...')
    # NOTE: external_schema was referenced but never assigned in the original; the
    # assignment below follows the otherwise-identical SIS schema job above.
    external_schema = app.config['REDSHIFT_SCHEMA_SIS']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_sis_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl, is_zero_count_acceptable=feature_flag_edl())
    else:
        raise BackgroundJobError('SIS schema creation job failed.')
    return 'SIS schema creation job completed.'
def run(self):
    app.logger.info('Starting CalNet schema creation job...')
    external_schema = app.config['REDSHIFT_SCHEMA_CALNET']
    redshift.drop_external_schema(external_schema)
    sid_snapshot_path = '/'.join([
        f"s3://{app.config['LOCH_S3_BUCKET']}",
        app.config['LOCH_S3_CALNET_DATA_PATH'],
        'sids',
    ])
    resolved_ddl = resolve_sql_template(
        'create_calnet_schema.template.sql',
        sid_snapshot_path=sid_snapshot_path,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'CalNet schema creation job completed.'
    else:
        raise BackgroundJobError('CalNet schema creation job failed.')
def run(self):
    app.logger.info('Starting Canvas schema creation job...')
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_canvas_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('Canvas schema creation job completed.')
        return verify_external_schema(external_schema, resolved_ddl)
    else:
        app.logger.error('Canvas schema creation job failed.')
        return False
def run(self):
    app.logger.info('Starting Undergrads schema creation job...')
    # NOTE: external_schema was referenced but never assigned in the original; the
    # config key below is an assumption, following the pattern of the other jobs.
    external_schema = app.config['REDSHIFT_SCHEMA_UNDERGRADS']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_undergrads_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('Undergrads external schema created.')
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('Undergrads external schema creation failed.')
    undergrads_rows = redshift.fetch(f'SELECT * FROM {external_schema}.students ORDER BY sid')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(undergrads_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    return 'Undergrads internal schema created.'
def run(self):
    app.logger.info('Starting OUA Slate schema creation job...')
    app.logger.info('Executing SQL...')
    s3_protected_bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    oua_slate_sftp_path = app.config['LOCH_S3_SLATE_DATA_SFTP_PATH'] + '/' + self.get_sftp_date_offset() + '/'
    oua_daily_dest_path = get_s3_oua_daily_path() + '/admissions/'
    # List keys under the SFTP prefix and migrate any CSV files to the daily OUA location.
    keys = s3.get_keys_with_prefix(oua_slate_sftp_path, full_objects=False, bucket=s3_protected_bucket)
    if len(keys) > 0:
        for source_key in keys:
            if source_key.endswith('.csv'):
                destination_key = source_key.replace(oua_slate_sftp_path, oua_daily_dest_path)
                if not s3.copy(s3_protected_bucket, source_key, s3_protected_bucket, destination_key):
                    raise BackgroundJobError(f'Copy from SFTP location {source_key} to daily OUA destination {destination_key} failed.')
        external_schema = app.config['REDSHIFT_SCHEMA_OUA']
        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template('create_oua_schema_template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
            self.create_rds_tables_and_indexes()
            app.logger.info('OUA Slate RDS indexes created.')
            return 'OUA schema creation job completed.'
        else:
            raise BackgroundJobError('OUA Slate schema creation job failed.')
    else:
        return 'No OUA files found in SFTP location today. Skipping OUA data refresh.'
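# The SFTP path above is built with a get_sftp_date_offset method that is not shown.
# A minimal sketch of that method on the job class, assuming the Slate drop folders
# are named by date and that any offset comes from config (both assumptions):
def get_sftp_date_offset(self):
    offset_days = app.config.get('LOCH_S3_SLATE_DATA_SFTP_DATE_OFFSET', 0)
    return (datetime.now() + timedelta(days=offset_days)).strftime('%Y/%m/%d')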
def run(self):
    app.logger.info('Starting COE schema creation job...')
    # NOTE: external_schema and internal_schema_identifier were referenced but never
    # assigned in the original; the config keys below are assumptions, following the
    # naming pattern of the other jobs.
    external_schema = app.config['REDSHIFT_SCHEMA_COE_EXTERNAL']
    internal_schema_identifier = psycopg2.sql.Identifier(app.config['REDSHIFT_SCHEMA_COE'])
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
    # TODO: This DDL drops and recreates the internal schema before the external schema
    # is verified. We ought to set up proper staging in conjunction with verification.
    # It's also possible that a persistent external schema isn't needed.
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('COE external schema created.')
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('COE external schema creation failed.')
    coe_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER BY sid',
        schema=internal_schema_identifier,
    )
    profile_rows = []
    index = 1
    for sid, rows_for_student in groupby(coe_rows, operator.itemgetter('sid')):
        app.logger.info(f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})')
        index += 1
        row_for_student = list(rows_for_student)[0]
        coe_profile = {
            'advisorUid': row_for_student.get('advisor_ldap_uid'),
            'gender': row_for_student.get('gender'),
            'ethnicity': row_for_student.get('ethnicity'),
            'minority': row_for_student.get('minority'),
            'didPrep': row_for_student.get('did_prep'),
            'prepEligible': row_for_student.get('prep_eligible'),
            'didTprep': row_for_student.get('did_tprep'),
            'tprepEligible': row_for_student.get('tprep_eligible'),
            'sat1read': row_for_student.get('sat1read'),
            'sat1math': row_for_student.get('sat1math'),
            'sat2math': row_for_student.get('sat2math'),
            'inMet': row_for_student.get('in_met'),
            'gradTerm': row_for_student.get('grad_term'),
            'gradYear': row_for_student.get('grad_year'),
            'probation': row_for_student.get('probation'),
            'status': row_for_student.get('status'),
        }
        profile_rows.append(encoded_tsv_row([sid, json.dumps(coe_profile)]))
    s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_coe}.student_profiles
            FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(coe_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    return 'COE internal schema created.'
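# The profile rows above are built with an encoded_tsv_row helper that is not shown.
# A minimal sketch, assuming it tab-joins string-coerced values and encodes the row
# to bytes for the TSV upload (both assumptions):
def encoded_tsv_row(values):
    return '\t'.join('' if v is None else str(v) for v in values).encode()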