def verify_migration(self, incremental_url, incremental_schema):
    redshift.drop_external_schema(incremental_schema)
    resolved_ddl_transient = resolve_sql_template(
        'create_lrs_statements_table.template.sql',
        redshift_schema_lrs_external=incremental_schema,
        loch_s3_lrs_statements_path=incremental_url,
    )
    if redshift.execute_ddl_script(resolved_ddl_transient):
        app.logger.info(f"LRS incremental schema '{incremental_schema}' created.")
    else:
        raise BackgroundJobError(f"LRS incremental schema '{incremental_schema}' creation failed.")

    redshift_response = redshift.fetch(f'select count(*) from {incremental_schema}.statements')
    if redshift_response:
        redshift_statement_count = redshift_response[0].get('count')
    else:
        raise BackgroundJobError(f"Failed to verify LRS incremental schema '{incremental_schema}'.")

    if redshift_statement_count == self.lrs_statement_count:
        app.logger.info(f'Verified {redshift_statement_count} rows migrated from LRS to {incremental_url}.')
    else:
        raise BackgroundJobError(
            f'Discrepancy between LRS ({self.lrs_statement_count} statements)'
            f' and {incremental_url} ({redshift_statement_count} statements).',
        )
def run(self):
    app.logger.info('Starting Canvas schema creation job...')
    canvas_path = get_s3_canvas_daily_path()
    if not s3.get_keys_with_prefix(canvas_path):
        canvas_path = get_s3_canvas_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(canvas_path):
            raise BackgroundJobError('No timely Canvas data found, aborting')
        else:
            app.logger.info('Falling back to yesterday\'s Canvas data')
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
    s3_canvas_data_url = s3_prefix + canvas_path
    s3_canvas_data_path_current_term = s3_prefix + berkeley.s3_canvas_data_path_current_term()
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_canvas_schema.template.sql',
        loch_s3_canvas_data_path_today=s3_canvas_data_url,
        loch_s3_canvas_data_path_current_term=s3_canvas_data_path_current_term,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Canvas schema creation job completed.'
    else:
        raise BackgroundJobError('Canvas schema creation job failed.')
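# A minimal, standalone sketch of the "fall back to yesterday's extract" pattern used by this
# job (and by the SIS advising notes and Caliper jobs below). The names path_for_date and
# keys_exist are hypothetical stand-ins for get_s3_canvas_daily_path and s3.get_keys_with_prefix;
# only the control flow is illustrated here.
from datetime import datetime, timedelta


def resolve_daily_path(path_for_date, keys_exist):
    path = path_for_date(datetime.now())
    if not keys_exist(path):
        path = path_for_date(datetime.now() - timedelta(days=1))
        if not keys_exist(path):
            raise RuntimeError('No timely data found, aborting')
    return path


# Example usage with dummy callables: today's prefix is empty, yesterday's is not.
print(resolve_daily_path(
    lambda dt: f"canvas-data/daily/{dt.strftime('%Y-%m-%d')}",
    lambda path: not path.endswith(datetime.now().strftime('%Y-%m-%d')),
))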
def migrate_transient_to_destination(self, keys, destination_bucket, destination_path, unload_to_etl=False):
    destination_url = 's3://' + destination_bucket + '/' + destination_path
    destination_schema = app.config['REDSHIFT_SCHEMA_LRS']
    for transient_key in keys:
        destination_key = transient_key.replace(self.transient_path, destination_path)
        if not s3.copy(self.transient_bucket, transient_key, destination_bucket, destination_key):
            app.logger.error(f'Copy from transient bucket to destination bucket {destination_bucket} failed.')
            return False
    if not self.verify_migration(destination_url, destination_schema):
        return False
    if unload_to_etl:
        if not self.unload_to_etl(destination_schema, destination_bucket):
            app.logger.error(f'Redshift statements unload from {destination_schema} to {destination_bucket} failed.')
            return False
    redshift.drop_external_schema(destination_schema)
    return True
def verify_and_unload_transient(self):
    transient_url = f's3://{self.transient_bucket}/{self.transient_path}'
    transient_schema = app.config['REDSHIFT_SCHEMA_LRS'] + '_transient'
    self.verify_migration(transient_url, transient_schema)
    self.delete_old_unloads()
    self.unload_to_etl(transient_schema, self.transient_bucket, timestamped=False)
    redshift.drop_external_schema(transient_schema)
def run(self):
    app.logger.info('Starting SIS Advising Notes schema creation job...')
    daily_path = get_s3_sis_sysadm_daily_path()
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    if not s3.get_keys_with_prefix(f'{daily_path}/advising-notes', bucket=bucket):
        daily_path = get_s3_sis_sysadm_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(f'{daily_path}/advising-notes', bucket=bucket):
            raise BackgroundJobError('No timely SIS advising notes data found, aborting')
        else:
            app.logger.info('Falling back to yesterday\'s SIS advising notes data')
    app.logger.info('Executing SQL...')
    external_schema = app.config['REDSHIFT_SCHEMA_SIS_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    self.create_historical_tables(external_schema)
    self.create_internal_schema(external_schema, daily_path)
    app.logger.info('Redshift schema created. Creating RDS indexes...')
    self.create_indexes()
    app.logger.info('RDS indexes created.')
    return 'SIS Advising Notes schema creation job completed.'
def create_schema(self):
    base_s3_key = app.config['LOCH_S3_E_I_DATA_PATH']
    external_schema = app.config['REDSHIFT_SCHEMA_E_I_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)

    # Flatten E&I-sourced JSON files into two schema-friendly JSON files.
    notes = []
    topics = []
    for key in s3.get_keys_with_prefix(base_s3_key):
        if key.endswith('.json'):
            notes_json = s3.get_object_json(key)
            if notes_json and 'notes' in notes_json:
                notes += notes_json['notes']
    for note in notes:
        topics += _extract_topics(note)

    if s3.upload_json(obj=notes, s3_key=f'{base_s3_key}/aggregated_notes/data.json') \
            and s3.upload_json(obj=topics, s3_key=f'{base_s3_key}/aggregated_topics/data.json'):
        # Create schema
        app.logger.info('Executing SQL...')
        resolved_ddl = resolve_sql_template('create_e_i_advising_notes_schema.template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('E&I Advising Notes schema creation job failed.')
    else:
        raise BackgroundJobError('Failed to upload aggregated E&I advising notes and topics.')
def run(self):
    app.logger.info('Start generating canvas caliper analytics')
    redshift_schema_caliper_analytics = app.config['REDSHIFT_SCHEMA_CALIPER']
    redshift_schema_lrs_external = app.config['REDSHIFT_SCHEMA_LRS']
    canvas_caliper_explode_table = 'canvas_caliper_explode'

    # Because the Caliper incrementals are provided by a Glue job running on a different schedule, the latest batch
    # may have been delivered before last midnight UTC.
    s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path()
    if not s3.get_keys_with_prefix(s3_caliper_daily_path):
        s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_caliper_daily_path):
            raise BackgroundJobError('No timely S3 Caliper extracts found')
        else:
            app.logger.info('Falling back to S3 Caliper extracts for yesterday')
    s3_caliper_daily_url = s3.build_s3_url(s3_caliper_daily_path)

    resolved_ddl_caliper_explode = resolve_sql_template(
        'create_lrs_canvas_explode_table.template.sql',
        canvas_caliper_explode_table=canvas_caliper_explode_table,
        loch_s3_caliper_explode_url=s3_caliper_daily_url,
    )
    redshift.drop_external_schema(redshift_schema_lrs_external)
    if redshift.execute_ddl_script(resolved_ddl_caliper_explode):
        app.logger.info('Caliper explode schema and table successfully created.')
    else:
        raise BackgroundJobError('Caliper explode schema and table creation failed.')

    # Sanity-check event times from the latest Caliper batch against previously transformed event times.
    def datetime_from_query(query):
        response = redshift.fetch(query)
        timestamp = response and response[0] and response[0].get('timestamp')
        if not timestamp:
            raise BackgroundJobError(f'Timestamp query failed to return data for comparison; aborting job: {query}')
        if isinstance(timestamp, str):
            timestamp = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
        return timestamp

    earliest_untransformed = datetime_from_query(
        f'SELECT MIN(timestamp) AS timestamp FROM {redshift_schema_lrs_external}.{canvas_caliper_explode_table}',
    )
    latest_transformed = datetime_from_query(
        f'SELECT MAX(timestamp) AS timestamp FROM {redshift_schema_caliper_analytics}.canvas_caliper_user_requests',
    )
    if not earliest_untransformed or not latest_transformed:
        return False

    timestamp_diff = (earliest_untransformed - latest_transformed).total_seconds()
    lower_bound_tolerance, upper_bound_tolerance = app.config['LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE']
    if timestamp_diff < lower_bound_tolerance or timestamp_diff > upper_bound_tolerance:
        raise BackgroundJobError(
            f'Unexpected difference between Caliper timestamps: latest transformed {latest_transformed}, '
            f'earliest untransformed {earliest_untransformed}',
        )

    resolved_ddl_caliper_analytics = resolve_sql_template('generate_caliper_analytics.template.sql')
    if redshift.execute_ddl_script(resolved_ddl_caliper_analytics):
        return 'Caliper analytics tables successfully created.'
    else:
        raise BackgroundJobError('Caliper analytics tables creation failed.')
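# A small self-contained illustration of the timestamp sanity check above, using hypothetical
# values. LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE is assumed (per the unpacking in
# the job) to be a (lower_bound, upper_bound) pair in seconds: the job aborts when the gap
# between the earliest untransformed event and the latest already-transformed event falls
# outside that window.
from datetime import datetime

latest_transformed = datetime(2023, 8, 1, 23, 45, 0)       # hypothetical MAX(timestamp) already transformed
earliest_untransformed = datetime(2023, 8, 2, 0, 5, 0)     # hypothetical MIN(timestamp) in the new batch
lower_bound_tolerance, upper_bound_tolerance = -300, 7200  # hypothetical config value, in seconds

timestamp_diff = (earliest_untransformed - latest_transformed).total_seconds()  # 1200.0
assert lower_bound_tolerance <= timestamp_diff <= upper_bound_tolerance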
def create_schema(self):
    app.logger.info('Executing SQL...')
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_sis_terms_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('SIS terms schema creation job failed.')
def migrate_transient_to_destination(self, keys, destination_bucket, destination_path):
    destination_url = 's3://' + destination_bucket + '/' + destination_path
    redshift_schema = app.config['REDSHIFT_SCHEMA_LRS']
    for transient_key in keys:
        destination_key = transient_key.replace(self.transient_path, destination_path)
        if not s3.copy(self.transient_bucket, transient_key, destination_bucket, destination_key):
            raise BackgroundJobError(f'Copy from transient bucket to destination bucket {destination_bucket} failed.')
    self.verify_migration(destination_url, redshift_schema)
    redshift.drop_external_schema(redshift_schema)
def run(self):
    app.logger.info('Starting EDW schema creation job...')
    app.logger.info('Executing SQL...')
    external_schema = app.config['REDSHIFT_SCHEMA_EDW']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_edw_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'EDW schema creation job completed.'
    else:
        raise BackgroundJobError('EDW schema creation job failed.')
def run(self):
    app.logger.info('Starting Canvas schema creation job...')
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_canvas_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('Canvas schema creation job completed.')
        return verify_external_schema(external_schema, resolved_ddl)
    else:
        app.logger.error('Canvas schema creation job failed.')
        return False
def create_schema(self):
    external_schema = app.config['REDSHIFT_SCHEMA_DATA_SCIENCE_ADVISING']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_data_science_advising_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('Data Science Advising schema creation job failed.')
def run(self):
    success = 0
    failure = 0
    berkeleyx_tenants = {
        'edx': {
            'database': 'berkeleyx_prod_ext',
            'instance': 'prod-analytics',
        },
        'edge': {
            'database': 'berkeleyx_prod_edge_ext',
            'instance': 'prod-edge-analytics',
        },
    }
    for tenant, value in berkeleyx_tenants.items():
        app.logger.info(f'Starting BerkeleyX schema creation job for {tenant}...')
        berkeleyx_data_path = 's3://{}/{}/{}'.format(
            app.config['LOCH_EDX_S3_BUCKET'],
            app.config['LOCH_EDX_S3_WEEKLY_DATA_PATH'],
            value['instance'],
        )
        berkeleyx_transaction_log_path = 's3://{}/{}/{}'.format(
            app.config['LOCH_EDX_S3_BUCKET'],
            app.config['LOCH_EDX_S3_TRANSACTION_LOG_PATH'],
            tenant,
        )
        external_schema = value['database'] + '_' + app.config['LOCH_EDX_NESSIE_ENV']
        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template(
            'create_berkeleyx_schema.template.sql',
            loch_s3_berkeleyx_data_path=berkeleyx_data_path,
            loch_s3_berkeleyx_transaction_log_path=berkeleyx_transaction_log_path,
            redshift_berkeleyx_ext_schema=external_schema,
        )
        if redshift.execute_ddl_script(resolved_ddl):
            app.logger.info(f'BerkeleyX schema {external_schema} creation completed.')
            success += 1
        else:
            app.logger.error(f'BerkeleyX schema creation {external_schema} failed.')
            failure += 1
    if failure > 0:
        raise BackgroundJobError('BerkeleyX schema creation jobs failed')
    else:
        app.logger.info('BerkeleyX schema creation jobs completed successfully')
        return True
def create_schema(self):
    external_schema = app.config['REDSHIFT_SCHEMA_ADVISOR']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_advisor_schema.template.sql',
        advisor_data_path=self.s3_path(),
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('Advisor schema creation job failed.')
def create_external_schema(self):
    if not self.feature_flag_edl:
        # External schema is necessary when pulling term definitions from S3.
        redshift.drop_external_schema(self.redshift_schema)
        sql_template = 'create_terms_schema.template.sql'
        app.logger.info(f'Executing {sql_template}...')
        resolved_ddl = resolve_sql_template(sql_template)
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(self.redshift_schema, resolved_ddl)
        else:
            raise BackgroundJobError('SIS terms schema creation job failed.')
def run(self):
    app.logger.info('Starting Gradescope schema creation job...')
    app.logger.info('Executing SQL...')
    # Add frequency-of-ingest and dynamic path creation logic once ingest mechanisms are confirmed.
    # For now, add a top-level hierarchy to catalog the bulk snapshot.
    external_schema = app.config['REDSHIFT_SCHEMA_GRADESCOPE']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_gradescope_schema_template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Gradescope schema creation job completed.'
    else:
        raise BackgroundJobError('Gradescope schema creation job failed.')
def run(self):
    app.logger.info('Starting SIS schema creation job...')
    if not self.update_manifests():
        app.logger.info('Error updating manifests, will not execute schema creation SQL')
        return False
    app.logger.info('Executing SQL...')
    external_schema = app.config['REDSHIFT_SCHEMA_SIS']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_sis_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('SIS schema creation job failed.')
    return 'SIS schema creation job completed.'
def run(self):
    app.logger.info('Starting OUA Slate schema creation job...')
    app.logger.info('Executing SQL...')
    self.migrate_oua_sftp_data()
    external_schema = app.config['REDSHIFT_SCHEMA_OUA']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_oua_schema_template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        self.create_rds_tables_and_indexes()
        app.logger.info('OUA Slate RDS indexes created.')
        return 'OUA schema creation job completed.'
    else:
        raise BackgroundJobError('OUA Slate schema creation job failed.')
def create_schema(self):
    external_schema = app.config['REDSHIFT_SCHEMA_ASC_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    asc_data_sftp_path = '/'.join([
        f"s3://{app.config['LOCH_S3_BUCKET']}",
        app.config['LOCH_S3_ASC_DATA_SFTP_PATH'],
    ])
    resolved_ddl = resolve_sql_template(
        'create_asc_advising_notes_schema.template.sql',
        asc_data_sftp_path=asc_data_sftp_path,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('ASC Advising Notes schema creation job failed.')
def create_schema(self):
    app.logger.info('Executing SQL...')
    redshift.drop_external_schema(self.external_schema)
    s3_sis_daily = get_s3_sis_sysadm_daily_path()
    if not s3.get_keys_with_prefix(s3_sis_daily):
        s3_sis_daily = _get_yesterdays_advisor_data()
    s3_path = '/'.join([f"s3://{app.config['LOCH_S3_BUCKET']}", s3_sis_daily, 'advisors'])
    sql_filename = 'edl_create_advisor_schema.template.sql' if self.feature_flag_edl else 'create_advisor_schema.template.sql'
    resolved_ddl = resolve_sql_template(sql_filename, advisor_data_path=s3_path)
    if not redshift.execute_ddl_script(resolved_ddl):
        raise BackgroundJobError(f'Redshift execute_ddl_script failed on {sql_filename}')
    verify_external_schema(self.external_schema, resolved_ddl)
    app.logger.info('Redshift schema created.')
def run(self):
    app.logger.info('Starting SIS schema creation job...')
    if not self.update_manifests():
        app.logger.info('Error updating manifests, will not execute schema creation SQL')
        return False
    app.logger.info('Executing SQL...')
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_sis_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl, is_zero_count_acceptable=feature_flag_edl())
    else:
        raise BackgroundJobError('SIS schema creation job failed.')
    return 'SIS schema creation job completed.'
def run(self):
    app.logger.info('Starting Canvas API schema creation job...')
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS_API']
    s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
    s3_canvas_api_data_url = s3_prefix + get_s3_canvas_api_path(transformed=True)
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_canvas_api_schema.template.sql',
        loch_s3_canvas_api_data_path=s3_canvas_api_data_url,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Canvas API schema creation job completed.'
    else:
        raise BackgroundJobError('Canvas API schema creation job failed.')
def run(self):
    app.logger.info('Starting YCBM schema creation job...')
    external_schema = app.config['REDSHIFT_SCHEMA_YCBM']
    redshift.drop_external_schema(external_schema)
    sis_source_schema = app.config['REDSHIFT_SCHEMA_EDL'] if app.config['FEATURE_FLAG_EDL_SIS_VIEWS'] else app.config['REDSHIFT_SCHEMA_SIS']
    resolved_ddl = resolve_sql_template(
        'create_ycbm_schema.template.sql',
        redshift_schema_sis=sis_source_schema,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'YCBM schema creation job completed.'
    else:
        raise BackgroundJobError('YCBM schema creation job failed.')
def run(self):
    app.logger.info('Starting CalNet schema creation job...')
    external_schema = app.config['REDSHIFT_SCHEMA_CALNET']
    redshift.drop_external_schema(external_schema)
    sid_snapshot_path = '/'.join([
        f"s3://{app.config['LOCH_S3_BUCKET']}",
        app.config['LOCH_S3_CALNET_DATA_PATH'],
        'sids',
    ])
    resolved_ddl = resolve_sql_template(
        'create_calnet_schema.template.sql',
        sid_snapshot_path=sid_snapshot_path,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'CalNet schema creation job completed.'
    else:
        raise BackgroundJobError('CalNet schema creation job failed.')
def run(self):
    app.logger.info('Starting Undergrads schema creation job...')
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_undergrads_schema.template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('Undergrads external schema created.')
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('Undergrads external schema creation failed.')
    undergrads_rows = redshift.fetch(f'SELECT * FROM {external_schema}.students ORDER by sid')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(undergrads_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    return 'Undergrads internal schema created.'
def run(self):
    app.logger.info('Starting OUA Slate schema creation job...')
    app.logger.info('Executing SQL...')
    s3_protected_bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    oua_slate_sftp_path = app.config['LOCH_S3_SLATE_DATA_SFTP_PATH'] + '/' + self.get_sftp_date_offset() + '/'
    oua_daily_dest_path = get_s3_oua_daily_path() + '/admissions/'

    # Get the list of keys under the SFTP prefix and migrate any CSV files to the daily OUA location.
    keys = s3.get_keys_with_prefix(oua_slate_sftp_path, full_objects=False, bucket=s3_protected_bucket)
    if len(keys) > 0:
        for source_key in keys:
            if source_key.endswith('.csv'):
                destination_key = source_key.replace(oua_slate_sftp_path, oua_daily_dest_path)
                if not s3.copy(s3_protected_bucket, source_key, s3_protected_bucket, destination_key):
                    raise BackgroundJobError(
                        f'Copy from SFTP location {source_key} to daily OUA destination {destination_key} failed.',
                    )
        external_schema = app.config['REDSHIFT_SCHEMA_OUA']
        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template('create_oua_schema_template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
            self.create_rds_tables_and_indexes()
            app.logger.info('OUA Slate RDS indexes created.')
            return 'OUA schema creation job completed.'
        else:
            raise BackgroundJobError('OUA Slate schema creation job failed.')
    else:
        return 'No OUA files found in SFTP location today. Skipping OUA data refresh.'
def run(self):
    app.logger.info('Starting COE schema creation job...')
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
    # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
    # ought to set up proper staging in conjunction with verification. It's also possible that a persistent
    # external schema isn't needed.
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('COE external schema created.')
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('COE external schema creation failed.')
    coe_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER by sid',
        schema=internal_schema_identifier,
    )

    profile_rows = []
    index = 1
    for sid, rows_for_student in groupby(coe_rows, operator.itemgetter('sid')):
        app.logger.info(f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})')
        index += 1
        row_for_student = list(rows_for_student)[0]
        coe_profile = {
            'advisorUid': row_for_student.get('advisor_ldap_uid'),
            'gender': row_for_student.get('gender'),
            'ethnicity': row_for_student.get('ethnicity'),
            'minority': row_for_student.get('minority'),
            'didPrep': row_for_student.get('did_prep'),
            'prepEligible': row_for_student.get('prep_eligible'),
            'didTprep': row_for_student.get('did_tprep'),
            'tprepEligible': row_for_student.get('tprep_eligible'),
            'sat1read': row_for_student.get('sat1read'),
            'sat1math': row_for_student.get('sat1math'),
            'sat2math': row_for_student.get('sat2math'),
            'inMet': row_for_student.get('in_met'),
            'gradTerm': row_for_student.get('grad_term'),
            'gradYear': row_for_student.get('grad_year'),
            'probation': row_for_student.get('probation'),
            'status': row_for_student.get('status'),
        }
        profile_rows.append(encoded_tsv_row([sid, json.dumps(coe_profile)]))

    s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_coe}.student_profiles
            FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')

    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(coe_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    return 'COE internal schema created.'
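# A minimal sketch of the itertools.groupby pattern used in the COE job above, with hypothetical
# rows. groupby only merges adjacent keys, so it relies on coe_rows arriving sorted by sid (hence
# the ORDER BY sid in the Redshift query); only the first row per student is used to build the
# profile.
import operator
from itertools import groupby

coe_rows = [
    {'sid': '11111111', 'advisor_ldap_uid': '42', 'gender': 'F'},
    {'sid': '11111111', 'advisor_ldap_uid': '43', 'gender': 'F'},
    {'sid': '22222222', 'advisor_ldap_uid': '44', 'gender': 'M'},
]
for sid, rows_for_student in groupby(coe_rows, operator.itemgetter('sid')):
    row_for_student = list(rows_for_student)[0]
    print(sid, row_for_student.get('advisor_ldap_uid'))
# 11111111 42
# 22222222 44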
def run(self, truncate_lrs=False):
    app.logger.info('Starting DMS replication task...')
    task_id = app.config['LRS_CANVAS_INCREMENTAL_REPLICATION_TASK_ID']
    self.transient_bucket = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET']
    self.transient_path = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_PATH']

    if not self.delete_old_incrementals():
        return False

    response = dms.start_replication_task(task_id)
    if not response:
        app.logger.error(f'Failed to start DMS replication task (response={response}).')
        return False

    while True:
        response = dms.get_replication_task(task_id)
        if response.get('Status') == 'stopped':
            if response.get('StopReason') == 'Stop Reason FULL_LOAD_ONLY_FINISHED':
                app.logger.info('DMS replication task completed')
                break
            else:
                app.logger.error(f'Replication task stopped for unexpected reason: {response}')
                return False
        sleep(10)

    lrs_response = lrs.fetch('select count(*) from statements')
    if lrs_response:
        self.lrs_statement_count = lrs_response[0][0]
    else:
        app.logger.error('Failed to retrieve LRS statements for comparison.')
        return False

    transient_keys = s3.get_keys_with_prefix(self.transient_path, bucket=self.transient_bucket)
    if not transient_keys:
        app.logger.error('Could not retrieve S3 keys from transient bucket.')
        return False

    transient_url = f's3://{self.transient_bucket}/{self.transient_path}'
    transient_schema = app.config['REDSHIFT_SCHEMA_LRS'] + '_transient'
    if not self.verify_migration(transient_url, transient_schema):
        return False
    redshift.drop_external_schema(transient_schema)

    timestamp_path = localize_datetime(datetime.now()).strftime('%Y/%m/%d/%H%M%S')
    destination_path = app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_PATH'] + '/' + timestamp_path
    for destination_bucket in app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']:
        if not self.migrate_transient_to_destination(
            transient_keys,
            destination_bucket,
            destination_path,
            unload_to_etl=True,
        ):
            return False

    if truncate_lrs:
        if lrs.execute('TRUNCATE statements'):
            app.logger.info('Truncated incremental LRS table.')
        else:
            app.logger.error('Failed to truncate incremental LRS table.')
            return False

    return (
        f'Migrated {self.lrs_statement_count} statements to S3 '
        f"(buckets={app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']}, path={destination_path})"
    )
def run(self):
    # Retrieve the latest schema definitions from the Canvas data API.
    response = canvas_data.get_canvas_data_schema()
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    redshift_iam_role = app.config['REDSHIFT_IAM_ROLE']
    canvas_schema = []

    # Parse and isolate table and column details.
    for key, value in response['schema'].items():
        for column in value['columns']:
            # Not every column has a description and length.
            description = None
            if 'description' in column:
                description = column['description']
            length = None
            if 'length' in column:
                length = column['length']
            canvas_schema.append([
                value['tableName'],
                column['name'],
                column['type'],
                description,
                length,
            ])

    # Create a dataframe.
    schema_df = pd.DataFrame(canvas_schema)
    schema_df.columns = [
        'table_name',
        'column_name',
        'column_type',
        'column_description',
        'column_length',
    ]

    # The schema definitions received from Canvas are Redshift compliant. We update
    # certain column types to match Glue and Spectrum data types.
    schema_df['glue_type'] = schema_df['column_type'].replace({
        'enum': 'varchar',
        'guid': 'varchar',
        'text': 'varchar(max)',
        'date': 'timestamp',
        'datetime': 'timestamp',
    })
    schema_df['transformed_column_name'] = schema_df['column_name'].replace({
        'default': '"default"',
        'percent': '"percent"',
    })

    # Create Hive-compliant storage descriptors.
    canvas_external_catalog_ddl = self.generate_external_catalog(external_schema, schema_df)

    # Clean up and recreate refreshed tables on Glue using Spectrum.
    redshift.drop_external_schema(external_schema)
    redshift.create_external_schema(external_schema, redshift_iam_role)

    if redshift.execute_ddl_script(canvas_external_catalog_ddl):
        app.logger.info('Canvas schema creation job completed.')
    else:
        app.logger.error('Canvas schema creation job failed.')
        raise BackgroundJobError('Canvas schema creation job failed.')

    self.verify_external_data_catalog()
    return 'Canvas external schema created and verified.'
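# A standalone sketch (with hypothetical sample rows) of the column-type normalization step
# above: DataFrame.replace maps Canvas/Redshift column types onto Glue/Spectrum-friendly types,
# and reserved column names are quoted for later use in DDL.
import pandas as pd

sample_df = pd.DataFrame(
    [
        ['requests', 'url', 'text'],
        ['requests', 'timestamp', 'datetime'],
        ['assignment_dim', 'default', 'boolean'],
    ],
    columns=['table_name', 'column_name', 'column_type'],
)
sample_df['glue_type'] = sample_df['column_type'].replace({
    'enum': 'varchar',
    'guid': 'varchar',
    'text': 'varchar(max)',
    'date': 'timestamp',
    'datetime': 'timestamp',
})
sample_df['transformed_column_name'] = sample_df['column_name'].replace({
    'default': '"default"',
    'percent': '"percent"',
})
print(sample_df[['column_name', 'column_type', 'glue_type', 'transformed_column_name']])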