Example #1
 def run(self):
     app.logger.info(f'Starting student schema creation job...')
     app.logger.info(f'Executing SQL...')
     resolved_ddl = resolve_sql_template(
         'create_student_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         app.logger.info(
             f"Schema '{app.config['REDSHIFT_SCHEMA_STUDENT']}' found or created."
         )
     else:
         app.logger.error(f'Student schema creation failed.')
         return False
     resolved_ddl_staging = resolve_sql_template(
         'create_student_schema.template.sql',
         redshift_schema_student=app.config['REDSHIFT_SCHEMA_STUDENT'] + '_staging',
     )
     if redshift.execute_ddl_script(resolved_ddl_staging):
         app.logger.info(
             f"Schema '{app.config['REDSHIFT_SCHEMA_STUDENT']}_staging' found or created."
         )
     else:
         app.logger.error(f'Student staging schema creation failed.')
         return False
     return True
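
Every example here follows the same two-step pattern: resolve_sql_template fills placeholders in a SQL template (from app config, with keyword arguments overriding individual values, as the staging-schema call above does), and redshift.execute_ddl_script runs the resolved DDL. Neither helper's implementation appears in these excerpts, so the following is only a minimal sketch of the resolution step, assuming {placeholder}-style tokens and config-backed defaults:

import re

def resolve_sql_template_sketch(template_string, config, **kwargs):
    """Hypothetical stand-in for resolve_sql_template: fill {placeholder} tokens from
    config, with keyword arguments taking precedence (as in the staging-schema call above)."""
    values = {key.lower(): str(value) for key, value in config.items()}
    values.update({key: str(value) for key, value in kwargs.items()})
    # Replace each {token} with its value; leave unrecognized tokens untouched.
    return re.sub(r'\{(\w+)\}', lambda match: values.get(match.group(1), match.group(0)), template_string)

ddl = resolve_sql_template_sketch(
    'CREATE SCHEMA IF NOT EXISTS {redshift_schema_student};',
    {'REDSHIFT_SCHEMA_STUDENT': 'student'},
    redshift_schema_student='student_staging',
)
# ddl == 'CREATE SCHEMA IF NOT EXISTS student_staging;'
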
Example #2
    def run(self):
        app.logger.info('Start generating canvas caliper analytics')
        redshift_schema_caliper_analytics = app.config['REDSHIFT_SCHEMA_CALIPER']
        redshift_schema_lrs_external = app.config['REDSHIFT_SCHEMA_LRS']
        canvas_caliper_explode_table = 'canvas_caliper_explode'

        # Because the Caliper incrementals are provided by a Glue job running on a different schedule, the latest batch
        # may have been delivered before last midnight UTC.
        s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path()
        if not s3.get_keys_with_prefix(s3_caliper_daily_path):
            s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path(datetime.now() - timedelta(days=1))
            if not s3.get_keys_with_prefix(s3_caliper_daily_path):
                raise BackgroundJobError('No timely S3 Caliper extracts found')
            else:
                app.logger.info('Falling back to S3 Caliper extracts for yesterday')
        s3_caliper_daily_url = s3.build_s3_url(s3_caliper_daily_path)

        resolved_ddl_caliper_explode = resolve_sql_template(
            'create_lrs_canvas_explode_table.template.sql',
            canvas_caliper_explode_table=canvas_caliper_explode_table,
            loch_s3_caliper_explode_url=s3_caliper_daily_url,
        )
        redshift.drop_external_schema(redshift_schema_lrs_external)
        if redshift.execute_ddl_script(resolved_ddl_caliper_explode):
            app.logger.info('Caliper explode schema and table successfully created.')
        else:
            raise BackgroundJobError('Caliper explode schema and table creation failed.')

        # Sanity-check event times from the latest Caliper batch against previously transformed event times.
        def datetime_from_query(query):
            response = redshift.fetch(query)
            timestamp = response and response[0] and response[0].get('timestamp')
            if not timestamp:
                raise BackgroundJobError(f'Timestamp query failed to return data for comparison; aborting job: {query}')
            if isinstance(timestamp, str):
                timestamp = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
            return timestamp

        earliest_untransformed = datetime_from_query(
            f'SELECT MIN(timestamp) AS timestamp FROM {redshift_schema_lrs_external}.{canvas_caliper_explode_table}',
        )
        latest_transformed = datetime_from_query(
            f'SELECT MAX(timestamp) AS timestamp FROM {redshift_schema_caliper_analytics}.canvas_caliper_user_requests',
        )
        if not earliest_untransformed or not latest_transformed:
            return False
        timestamp_diff = (earliest_untransformed - latest_transformed).total_seconds()
        lower_bound_tolerance, upper_bound_tolerance = app.config['LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE']
        if timestamp_diff < lower_bound_tolerance or timestamp_diff > upper_bound_tolerance:
            raise BackgroundJobError(
                f'Unexpected difference between Caliper timestamps: latest transformed {latest_transformed}, '
                f'earliest untransformed {earliest_untransformed}',
            )

        resolved_ddl_caliper_analytics = resolve_sql_template('generate_caliper_analytics.template.sql')
        if redshift.execute_ddl_script(resolved_ddl_caliper_analytics):
            return 'Caliper analytics tables successfully created.'
        else:
            raise BackgroundJobError('Caliper analytics tables creation failed.')
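
The tolerance check in Example #2 unpacks LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE as a (lower, upper) pair of second offsets and compares it against the gap between the earliest untransformed and the latest transformed event times. A small worked illustration with assumed bounds (the real configured values are not shown in the excerpt):

# Hypothetical bounds: tolerate a new batch starting up to 60 seconds before,
# or up to 6 hours after, the latest already-transformed event.
LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE = (-60, 21600)

lower_bound_tolerance, upper_bound_tolerance = LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE
timestamp_diff = 900.0  # earliest untransformed event is 15 minutes after the latest transformed one
assert lower_bound_tolerance <= timestamp_diff <= upper_bound_tolerance  # within tolerance, so the job proceeds
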
Example #3
    def test_execute_ddl_script(self, app, ensure_drop_schema):
        """Executes filled SQL template files one statement at a time."""
        # TODO Test CREATE EXTERNAL SCHEMA and CREATE EXTERNAL TABLE statements.
        resolved_ddl = resolve_sql_template('test_db.template.sql')
        redshift.execute_ddl_script(resolved_ddl)

        schema = psycopg2.sql.Identifier(app.config['REDSHIFT_SCHEMA_BOAC'])
        result = redshift.fetch('SELECT COUNT(*) FROM {schema}.students', schema=schema)
        assert len(result) == 1
        assert result[0]['count'] == 7
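
The fetch call in Example #3 passes a psycopg2.sql.Identifier instead of splicing the schema name into the query string, which suggests the helper composes the statement with psycopg2's sql module so identifiers are quoted safely. A minimal sketch of that composition step, under that assumption (the helper's real internals and connection handling are not shown here):

from psycopg2 import sql

def compose_statement(query, **identifiers):
    """Hypothetical: fill {name} placeholders with safely quoted SQL identifiers."""
    return sql.SQL(query).format(**identifiers)

schema = sql.Identifier('boac_analytics')  # assumed value of REDSHIFT_SCHEMA_BOAC
statement = compose_statement('SELECT COUNT(*) FROM {schema}.students', schema=schema)
# cursor.execute(statement) would run: SELECT COUNT(*) FROM "boac_analytics".students
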
Example #4
    def create_schema(self):
        base_s3_key = app.config['LOCH_S3_E_I_DATA_PATH']
        external_schema = app.config['REDSHIFT_SCHEMA_E_I_ADVISING_NOTES']
        redshift.drop_external_schema(external_schema)
        # Flatten E&I-sourced JSON files into two schema-friendly JSON files.
        notes = []
        topics = []
        for key in s3.get_keys_with_prefix(base_s3_key):
            if key.endswith('.json'):
                notes_json = s3.get_object_json(key)
                if notes_json and 'notes' in notes_json:
                    notes += notes_json['notes']
                    # Extract topics from this file's notes only; looping over the accumulated
                    # list would re-extract topics from notes processed in earlier iterations.
                    for note in notes_json['notes']:
                        topics += _extract_topics(note)

        if s3.upload_json(obj=notes, s3_key=f'{base_s3_key}/aggregated_notes/data.json') \
                and s3.upload_json(obj=topics, s3_key=f'{base_s3_key}/aggregated_topics/data.json'):
            # Create schema
            app.logger.info('Executing SQL...')
            resolved_ddl = resolve_sql_template('create_e_i_advising_notes_schema.template.sql')
            if redshift.execute_ddl_script(resolved_ddl):
                verify_external_schema(external_schema, resolved_ddl)
            else:
                raise BackgroundJobError('E&I Advising Notes schema creation job failed.')
        else:
            raise BackgroundJobError('Failed to upload aggregated E&I advising notes and topics.')
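
Example #4 relies on an _extract_topics helper that the excerpt does not include. From its call site it appears to take a single note dict and return a list of topic rows for the aggregated_topics file; a hypothetical sketch (the field names are assumptions, not the actual E&I note schema):

def _extract_topics(note):
    """Hypothetical: one row per topic, keyed by the note id, for the aggregated_topics JSON."""
    return [
        {'note_id': note.get('id'), 'topic': topic}
        for topic in (note.get('topics') or [])
    ]
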
Example #5
    def run(self):
        app.logger.info('Starting intermediate table generation job...')

        sis_source_schema = (
            app.config['REDSHIFT_SCHEMA_EDL']
            if app.config['FEATURE_FLAG_EDL_SIS_VIEWS']
            else app.config['REDSHIFT_SCHEMA_SIS']
        )

        resolved_ddl_redshift = resolve_sql_template(
            'create_intermediate_schema.template.sql',
            current_term_id=current_term_id(),
            redshift_schema_sis=sis_source_schema,
        )
        if redshift.execute_ddl_script(resolved_ddl_redshift):
            app.logger.info('Redshift tables generated.')
        else:
            raise BackgroundJobError('Intermediate table creation job failed.')

        resolved_ddl_rds = resolve_sql_template(
            'update_rds_indexes_sis.template.sql')
        if rds.execute(resolved_ddl_rds):
            app.logger.info('RDS indexes updated.')
        else:
            raise BackgroundJobError(
                'Failed to update RDS indexes for intermediate schema.')

        return 'Intermediate table generation job completed.'
Example #6
    def run(self):
        app.logger.info('Starting Canvas schema creation job...')

        canvas_path = get_s3_canvas_daily_path()
        if not s3.get_keys_with_prefix(canvas_path):
            canvas_path = get_s3_canvas_daily_path(datetime.now() -
                                                   timedelta(days=1))
            if not s3.get_keys_with_prefix(canvas_path):
                raise BackgroundJobError(
                    'No timely Canvas data found, aborting')
            else:
                app.logger.info('Falling back to yesterday\'s Canvas data')

        external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
        s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
        s3_canvas_data_url = s3_prefix + canvas_path
        s3_canvas_data_path_current_term = s3_prefix + berkeley.s3_canvas_data_path_current_term()

        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template(
            'create_canvas_schema.template.sql',
            loch_s3_canvas_data_path_today=s3_canvas_data_url,
            loch_s3_canvas_data_path_current_term=s3_canvas_data_path_current_term,
        )
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
            return 'Canvas schema creation job completed.'
        else:
            raise BackgroundJobError('Canvas schema creation job failed.')
Example #7
    def verify_post_transform_statement_count(self, url):
        schema = app.config['REDSHIFT_SCHEMA_LRS']
        resolved_ddl_transient = resolve_sql_template(
            'create_lrs_canvas_explode_table.template.sql',
            redshift_schema_lrs_external=schema,
            canvas_caliper_explode_table='caliper_statements_explode_transient',
            loch_s3_caliper_explode_url=url,
        )
        if redshift.execute_ddl_script(resolved_ddl_transient):
            app.logger.info(f"caliper_statements_explode_transient table created in schema '{schema}'.")
        else:
            raise BackgroundJobError(f"Failed to create caliper_statements_explode_transient table in schema '{schema}'.")

        exploded_statement_response = redshift.fetch(f'select count(*) from {schema}.caliper_statements_explode_transient')
        if exploded_statement_response:
            exploded_statement_count = exploded_statement_response[0].get('count')
        else:
            raise BackgroundJobError(f"Failed to verify caliper_statements_explode_transient table in schema '{schema}'.")

        if exploded_statement_count == self.pre_transform_statement_count:
            app.logger.info(f'Verified {exploded_statement_count} transformed statements migrated to {url}.')
        else:
            raise BackgroundJobError(
                f'Discrepancy between pre-transform statement count ({self.pre_transform_statement_count} statements)'
                f' and transformed statements at {url} ({exploded_statement_count} statements).')
Example #8
    def verify_migration(self, incremental_url, incremental_schema):
        redshift.drop_external_schema(incremental_schema)
        resolved_ddl_transient = resolve_sql_template(
            'create_lrs_statements_table.template.sql',
            redshift_schema_lrs_external=incremental_schema,
            loch_s3_lrs_statements_path=incremental_url,
        )
        if redshift.execute_ddl_script(resolved_ddl_transient):
            app.logger.info(
                f"LRS incremental schema '{incremental_schema}' created.")
        else:
            raise BackgroundJobError(
                f"LRS incremental schema '{incremental_schema}' creation failed."
            )

        redshift_response = redshift.fetch(
            f'select count(*) from {incremental_schema}.statements')
        if redshift_response:
            redshift_statement_count = redshift_response[0].get('count')
        else:
            raise BackgroundJobError(
                f"Failed to verify LRS incremental schema '{incremental_schema}'."
            )

        if redshift_statement_count == self.lrs_statement_count:
            app.logger.info(
                f'Verified {redshift_statement_count} rows migrated from LRS to {incremental_url}.'
            )
        else:
            raise BackgroundJobError(
                f'Discrepancy between LRS ({self.lrs_statement_count} statements)'
                f' and {incremental_url} ({redshift_statement_count} statements).'
            )
Example #9
    def verify_unloaded_count(self, url):
        url = path.split(url)[0]
        schema = app.config['REDSHIFT_SCHEMA_LRS']
        resolved_ddl_transient_unloaded = resolve_sql_template(
            'create_lrs_statements_unloaded_table.template.sql',
            redshift_schema_lrs_external=schema,
            loch_s3_lrs_statements_unloaded_path=url,
        )
        if redshift.execute_ddl_script(resolved_ddl_transient_unloaded):
            app.logger.info(
                f"statements_unloaded table created in schema '{schema}'.")
        else:
            raise BackgroundJobError(
                f"Failed to create statements_unloaded table in schema '{schema}'."
            )

        redshift_response = redshift.fetch(
            f'select count(*) from {schema}.statements_unloaded')
        if redshift_response:
            unloaded_statement_count = redshift_response[0].get('count')
        else:
            raise BackgroundJobError('Failed to get unloaded statement count.')

        if unloaded_statement_count == self.lrs_statement_count:
            app.logger.info(
                f'Verified {unloaded_statement_count} unloaded from LRS to {url}.'
            )
        else:
            raise BackgroundJobError(
                f'Discrepancy between LRS ({self.lrs_statement_count} statements)'
                f' and {url} ({unloaded_statement_count} statements).')
Example #10
    def create_schema(self):
        app.logger.info('Executing SQL...')
        template_sql = 'create_edl_schema.template.sql'
        resolved_ddl = resolve_sql_template(template_sql)
        if not redshift.execute_ddl_script(resolved_ddl):
            raise BackgroundJobError('EDL SIS schema creation job failed.')
        # Create staging schema
        resolved_ddl_staging = resolve_sql_template(
            template_sql,
            redshift_schema_edl=f'{self.internal_schema}_staging',
        )
        if redshift.execute_ddl_script(resolved_ddl_staging):
            app.logger.info(f"Schema '{self.internal_schema}_staging' found or created.")
        else:
            raise BackgroundJobError(f'{self.internal_schema}_staging schema creation failed.')

        app.logger.info('Redshift EDL schema created.')
Example #11
 def run(self):
     app.logger.info(f'Starting BOAC analytics job...')
     resolved_ddl = resolve_sql_template('create_boac_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         return 'BOAC analytics creation job completed.'
     else:
         app.logger.error(f'BOAC analytics creation job failed.')
         return False
Example #12
 def create_internal_schema(self, external_schema, daily_path):
     bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
     s3_data_url = f's3://{bucket}/{daily_path}/advising-notes'
     resolved_ddl = resolve_sql_template('create_sis_advising_notes_schema.template.sql', loch_s3_sis_notes_path_today=s3_data_url)
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
     else:
         raise BackgroundJobError('SIS Advising Notes schema creation job failed to load incremental data and create internal schema.')
Example #13
 def create_schema(self, external_schema):
     app.logger.info('Executing SQL...')
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template(
         'create_sis_terms_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
     else:
         raise BackgroundJobError('SIS terms schema creation job failed.')
Example #14
 def create_historical_tables(self, external_schema):
     resolved_ddl = resolve_sql_template(
         'create_sis_advising_notes_historical_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
     else:
         raise BackgroundJobError(
             'SIS Advising Notes schema creation job failed to load historical data.'
         )
Example #15
 def run(self):
     app.logger.info(f'Starting intermediate table generation job...')
     resolved_ddl = resolve_sql_template(
         'create_intermediate_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         return 'Intermediate table creation job completed.'
     else:
         app.logger.error(f'Intermediate table creation job failed.')
         return False
Example #16
 def run(self):
     app.logger.info('Starting ASC schema creation job...')
     app.logger.info('Executing SQL...')
     resolved_ddl = resolve_sql_template('create_asc_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         app.logger.info(f"Schema '{app.config['REDSHIFT_SCHEMA_ASC']}' found or created.")
     else:
         raise BackgroundJobError('ASC schema creation failed.')
     return True
Example #17
 def run(self):
     app.logger.info('Starting Redshift user privileges job...')
     app.logger.info('Executing SQL...')
     resolved_ddl = resolve_sql_template('restore_redshift_user_privileges.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         app.logger.info('Restored Redshift user privileges successfully.')
     else:
         raise BackgroundJobError('Redshift user privilege restoration failed.')
     return True
Example #18
 def run(self):
     app.logger.info(f'Starting Canvas schema creation job...')
     external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_canvas_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         app.logger.info(f'Canvas schema creation job completed.')
         return verify_external_schema(external_schema, resolved_ddl)
     else:
         app.logger.error(f'Canvas schema creation job failed.')
         return False
Example #19
 def run(self):
     app.logger.info('Starting EDW schema creation job...')
     app.logger.info('Executing SQL...')
     external_schema = app.config['REDSHIFT_SCHEMA_EDW']
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_edw_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
         return 'EDW schema creation job completed.'
     else:
         raise BackgroundJobError('EDW schema creation job failed.')
Example #20
    def create_schema(self):
        external_schema = app.config['REDSHIFT_SCHEMA_DATA_SCIENCE_ADVISING']
        redshift.drop_external_schema(external_schema)

        resolved_ddl = resolve_sql_template(
            'create_data_science_advising_schema.template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError(
                'Data Science Advising schema creation job failed.')
Example #21
    def run(self):
        success = 0
        failure = 0
        berkeleyx_tenants = {
            'edx': {
                'database': 'berkeleyx_prod_ext',
                'instance': 'prod-analytics',
            },
            'edge': {
                'database': 'berkeleyx_prod_edge_ext',
                'instance': 'prod-edge-analytics',
            },
        }

        for tenant, value in berkeleyx_tenants.items():
            app.logger.info(
                f'Starting Berkeleyx schema creation job for {tenant}...')
            berkeleyx_data_path = 's3://{}/{}/{}'.format(
                app.config['LOCH_EDX_S3_BUCKET'],
                app.config['LOCH_EDX_S3_WEEKLY_DATA_PATH'],
                value['instance'],
            )
            berkeleyx_transaction_log_path = 's3://{}/{}/{}'.format(
                app.config['LOCH_EDX_S3_BUCKET'],
                app.config['LOCH_EDX_S3_TRANSACTION_LOG_PATH'],
                tenant,
            )
            external_schema = value['database'] + '_' + app.config['LOCH_EDX_NESSIE_ENV']
            redshift.drop_external_schema(external_schema)
            resolved_ddl = resolve_sql_template(
                'create_berkeleyx_schema.template.sql',
                loch_s3_berkeleyx_data_path=berkeleyx_data_path,
                loch_s3_berkeleyx_transaction_log_path=berkeleyx_transaction_log_path,
                redshift_berkeleyx_ext_schema=external_schema,
            )

            if redshift.execute_ddl_script(resolved_ddl):
                app.logger.info(
                    f'BerkeleyX schema {external_schema} creation completed.')
                success += 1
            else:
                app.logger.error(
                    f'BerkeleyX schema {external_schema} creation failed.')
                failure += 1

        if failure > 0:
            raise BackgroundJobError('BerkeleyX schema creation jobs failed')
        else:
            app.logger.info(
                'BerkeleyX schema creation jobs completed successfully')
            return True
Example #22
 def create_external_schema(self):
     if not self.feature_flag_edl:
         # External schema is necessary when pulling term definitions from S3.
         redshift.drop_external_schema(self.redshift_schema)
         sql_template = 'create_terms_schema.template.sql'
         app.logger.info(f'Executing {sql_template}...')
         resolved_ddl = resolve_sql_template(sql_template)
         if redshift.execute_ddl_script(resolved_ddl):
             verify_external_schema(self.redshift_schema, resolved_ddl)
         else:
             raise BackgroundJobError(
                 'SIS terms schema creation job failed.')
Example #23
 def run(self):
     app.logger.info(f'Starting metadata schema creation job...')
     app.logger.info(f'Executing SQL...')
     resolved_ddl = resolve_sql_template(
         'create_metadata_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         app.logger.info(
             f"Schema '{app.config['REDSHIFT_SCHEMA_METADATA']}' found or created."
         )
     else:
         app.logger.error(f'Metadata schema creation failed.')
         return False
     return True
Example #24
 def run(self):
     app.logger.info('Starting Gradescope schema creation job...')
     app.logger.info('Executing SQL...')
     # TODO: Add ingest frequency and dynamic path creation logic once the ingest mechanisms are confirmed.
     # For now, add a top-level hierarchy to catalog the bulk snapshot.
     external_schema = app.config['REDSHIFT_SCHEMA_GRADESCOPE']
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_gradescope_schema_template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
         return 'Gradescope schema creation job completed.'
     else:
         raise BackgroundJobError('Gradescope schema creation job failed.')
Example #25
 def run(self):
     app.logger.info('Starting SIS schema creation job...')
     if not self.update_manifests():
         app.logger.info('Error updating manifests, will not execute schema creation SQL')
         return False
     app.logger.info('Executing SQL...')
     external_schema = app.config['REDSHIFT_SCHEMA_SIS']
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_sis_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
     else:
         raise BackgroundJobError('SIS schema creation job failed.')
     return 'SIS schema creation job completed.'
Example #26
 def run(self):
     app.logger.info('Starting OUA Slate schema creation job...')
     app.logger.info('Executing SQL...')
     self.migrate_oua_sftp_data()
     external_schema = app.config['REDSHIFT_SCHEMA_OUA']
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_oua_schema_template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
         self.create_rds_tables_and_indexes()
         app.logger.info('OUA Slate RDS indexes created.')
         return 'OUA schema creation job completed.'
     else:
         raise BackgroundJobError('OUA Slate schema creation job failed.')
Example #27
 def create_schema(self):
     external_schema = app.config['REDSHIFT_SCHEMA_ASC_ADVISING_NOTES']
     redshift.drop_external_schema(external_schema)
     asc_data_sftp_path = '/'.join([
         f"s3://{app.config['LOCH_S3_BUCKET']}",
         app.config['LOCH_S3_ASC_DATA_SFTP_PATH'],
     ])
     resolved_ddl = resolve_sql_template(
         'create_asc_advising_notes_schema.template.sql',
         asc_data_sftp_path=asc_data_sftp_path,
     )
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
     else:
         raise BackgroundJobError('ASC Advising Notes schema creation job failed.')
Example #28
    def run(self):
        app.logger.info('Starting YCBM schema creation job...')

        external_schema = app.config['REDSHIFT_SCHEMA_YCBM']
        redshift.drop_external_schema(external_schema)

        sis_source_schema = app.config['REDSHIFT_SCHEMA_EDL'] if app.config['FEATURE_FLAG_EDL_SIS_VIEWS'] else app.config['REDSHIFT_SCHEMA_SIS']
        resolved_ddl = resolve_sql_template(
            'create_ycbm_schema.template.sql',
            redshift_schema_sis=sis_source_schema,
        )

        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
            return 'YCBM schema creation job completed.'
        else:
            raise BackgroundJobError('YCBM schema creation job failed.')
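
Many of the examples above follow a successful execute_ddl_script call with verify_external_schema(external_schema, resolved_ddl). Its implementation is not part of these excerpts; one plausible reading, sketched here purely as an assumption and reusing the excerpts' redshift, app, and BackgroundJobError names, is that it pulls external table names out of the resolved DDL and confirms each one is queryable:

import re

def verify_external_schema_sketch(schema, resolved_ddl):
    """Hypothetical: confirm each external table created by the resolved DDL is present and queryable."""
    # The DDL has already been resolved, so table names appear literally,
    # e.g. 'CREATE EXTERNAL TABLE canvas_data.users (...'.
    tables = re.findall(r'CREATE EXTERNAL TABLE\s+\w+\.(\w+)', resolved_ddl, flags=re.IGNORECASE)
    for table in tables:
        result = redshift.fetch(f'SELECT COUNT(*) AS count FROM {schema}.{table}')
        if not result:
            raise BackgroundJobError(f"Verification failed for external table '{schema}.{table}'.")
        app.logger.info(f"Verified external table '{schema}.{table}' ({result[0]['count']} rows).")
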