def import_advisor_attributes(self):
    csid_results = redshift.fetch(
        resolve_sql_template_string(
            'SELECT DISTINCT advisor_sid FROM {redshift_schema_advisor_internal}.advisor_students',
        ),
    )
    csids = [r['advisor_sid'] for r in csid_results]
    all_attributes = calnet.client(app).search_csids(csids)
    if len(csids) != len(all_attributes):
        ldap_csids = [person['csid'] for person in all_attributes]
        missing = set(csids) - set(ldap_csids)
        app.logger.warning(f'Looked for {len(csids)} advisor CSIDs but only found {len(all_attributes)}: missing {missing}')
    advisor_rows = []
    total_count = len(all_attributes)
    for index, a in enumerate(all_attributes):
        sid = a['csid']
        app.logger.info(f'CalNet import: Fetch attributes of advisor {sid} ({index + 1} of {total_count})')
        first_name, last_name = calnet.split_sortable_name(a)
        data = [
            a['uid'],
            sid,
            first_name,
            last_name,
            a['title'],
            calnet.get_dept_code(a),
            a['email'],
            a['campus_email'],
        ]
        advisor_rows.append(encoded_tsv_row(data))
    s3_key = f'{get_s3_calnet_daily_path()}/advisors/advisors.tsv'
    app.logger.info(f'Will stash {len(advisor_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(advisor_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_advisor_internal}.advisor_attributes;
        COPY {redshift_schema_advisor_internal}.advisor_attributes
            FROM '{loch_s3_calnet_data_path}/advisors/advisors.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
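# Note: `encoded_tsv_row` is called throughout these jobs but is not defined in this excerpt.
# A minimal sketch, assuming it simply joins stringified values with tabs and UTF-8-encodes the
# result for s3.upload_tsv_rows; the real helper (in nessie.lib.util) may handle None values and
# embedded delimiters differently.
def encoded_tsv_row(values):
    return '\t'.join('' if v is None else str(v) for v in values).encode()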
def import_advisor_attributes(self):
    if self.feature_flag_edl:
        sql = resolve_sql_template_string("""
            SELECT DISTINCT advisor_id FROM {redshift_schema_edl_external}.student_advisor_data
            WHERE academic_career_cd = 'UGRD' AND advisor_id ~ '[0-9]+'
        """)
        advisor_ids = [row['advisor_id'] for row in redshift.fetch(sql)]
    else:
        sql = resolve_sql_template_string(
            'SELECT DISTINCT advisor_sid FROM {redshift_schema_advisor_internal}.advisor_students',
        )
        advisor_ids = [row['advisor_sid'] for row in redshift.fetch(sql)]
    return _import_calnet_attributes(advisor_ids)
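# Assumption: `_import_calnet_attributes` is the shared helper that the EDL-aware variant above
# delegates to; its body is not shown in this excerpt. A sketch of its likely shape, following the
# pre-EDL implementation of import_advisor_attributes (CalNet lookup, TSV rows, S3 stash). It
# assumes the same module-level names (app, calnet, s3, encoded_tsv_row, get_s3_calnet_daily_path).
def _import_calnet_attributes(advisor_ids):
    attributes = calnet.client(app).search_csids(advisor_ids)
    if len(advisor_ids) != len(attributes):
        missing = set(advisor_ids) - {a['csid'] for a in attributes}
        app.logger.warning(f'Looked for {len(advisor_ids)} advisor CSIDs but only found {len(attributes)}: missing {missing}')
    rows = []
    for a in attributes:
        first_name, last_name = calnet.split_sortable_name(a)
        rows.append(encoded_tsv_row([
            a['uid'], a['csid'], first_name, last_name, a['title'],
            calnet.get_dept_code(a), a['email'], a['campus_email'],
        ]))
    s3_key = f'{get_s3_calnet_daily_path()}/advisors/advisors.tsv'
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    # ...followed by the same TRUNCATE/COPY into {redshift_schema_advisor_internal}.advisor_attributes
    # as in the earlier implementation.
    return f'Advisor attributes imported: {len(rows)}'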
def run(self):
    app.logger.info('Starting BOA manually added advisees import job...')
    feed = get_manually_added_advisees()
    if feed.get('error'):
        raise BackgroundJobError(f"Error from BOA manually-added-advisees API: {feed.get('error')}")
    rows = [advisee['sid'].encode() for advisee in feed.get('feed')]
    s3_key = f'{get_s3_boa_api_daily_path()}/manually-added-advisees/manually-added-advisees.tsv'
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Copying data from S3 file to Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_advisee}.non_current_students;
        COPY {redshift_schema_advisee}.non_current_students
            FROM 's3://{s3_bucket}/{s3_key}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        s3_bucket=app.config['LOCH_S3_BUCKET'],
        s3_key=s3_key,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    status = f'Imported {len(rows)} non-current students.'
    app.logger.info(f'BOA manually added advisees import job completed: {status}')
    return status
def run(self, csids=None):
    if app.config['STUDENT_V1_API_PREFERRED']:
        return self.run_v1(csids)
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows, failure_count = self.load_concurrently(csids)
    if (len(rows) == 0) and (failure_count > 0):
        raise BackgroundJobError('Failed to import SIS student API feeds: aborting job.')
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_profiles'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.sis_api_profiles', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return f'SIS student API import job completed: {len(rows)} succeeded, {failure_count} failed.'
def run(self, sids=None):
    if not sids:
        sids = [row['sid'] for row in get_unfetched_non_advisees()]
    app.logger.info(f'Starting SIS student API import job for {len(sids)} non-advisees...')
    with tempfile.TemporaryFile() as feed_file:
        saved_sids, failure_count = self.load_concurrently(sids, feed_file)
        if saved_sids:
            student_schema.truncate_staging_table('sis_api_profiles_hist_enr')
            student_schema.write_file_to_staging('sis_api_profiles_hist_enr', feed_file, len(saved_sids))
    if saved_sids:
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema_student}.sis_api_profiles_hist_enr
                WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles_hist_enr);
            INSERT INTO {redshift_schema_student}.sis_api_profiles_hist_enr
                (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles_hist_enr);
            TRUNCATE {redshift_schema_student}_staging.sis_api_profiles_hist_enr;
            """,
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return f'SIS student API non-advisee import job completed: {len(saved_sids)} succeeded, {failure_count} failed.'
def student_tables(app):
    """Use Postgres to mock the Redshift student schemas on local test runs."""
    from nessie.externals import rds, redshift
    from nessie.lib.util import resolve_sql_template_string, resolve_sql_template
    rds.execute(resolve_sql_template('create_rds_indexes.template.sql'))
    fixture_path = f"{app.config['BASE_DIR']}/fixtures"
    with open(f'{fixture_path}/students.sql', 'r') as sql_file:
        student_sql = sql_file.read()
    params = {}
    for key in [
        'sis_api_drops_and_midterms_11667051_2178',
        'sis_degree_progress_11667051',
        'sis_student_api_11667051',
        'sis_student_api_2345678901',
    ]:
        with open(f'{fixture_path}/{key}.json', 'r') as f:
            feed = f.read()
            if key.startswith('sis_student_api'):
                feed = json.dumps(json.loads(feed)['apiResponse']['response']['any']['students'][0])
            params[key] = feed
    redshift.execute(resolve_sql_template_string(student_sql), params=params)
    yield
    for schema in ['asc_test', 'coe_test', 'student_test']:
        rds.execute(f'DROP SCHEMA {schema} CASCADE')
        redshift.execute(f'DROP SCHEMA {schema} CASCADE')
def update_merged_feed_status(term_id, successes, failures):
    term_id = term_id or 'all'
    redshift.execute(
        'DELETE FROM {schema}.merged_feed_status WHERE sid = ANY(%s) AND term_id = %s',
        schema=_schema(),
        params=((successes + failures), term_id),
    )
    now = datetime.utcnow().isoformat()
    success_records = ['\t'.join([sid, term_id, 'success', now]) for sid in successes]
    failure_records = ['\t'.join([sid, term_id, 'failure', now]) for sid in failures]
    rows = success_records + failure_records
    s3_key = f'{get_s3_sis_api_daily_path()}/merged_feed_status.tsv'
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error uploading merged feed status updates to S3.')
        return
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_metadata}.merged_feed_status
            FROM '{loch_s3_sis_api_data_path}/merged_feed_status.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t'
            TIMEFORMAT 'YYYY-MM-DDTHH:MI:SS';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error copying merged feed status updates to Redshift.')
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS student API for SID {csid} ({index} of {len(csids)})')
        feed = sis_student_api.get_student(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        else:
            failure_count += 1
            app.logger.error(f'SIS student API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_profiles'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_profiles', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return f'SIS student API import job completed: {success_count} succeeded, {failure_count} failed.'
def update_redshift(self, term_ids, transaction):
    if not transaction.execute(
        f'DELETE FROM {self.destination_schema}.sis_terms WHERE term_id = ANY(%s)',
        params=(term_ids,),
    ):
        return False
    template = """COPY {redshift_schema_sis_internal}.sis_terms
        FROM '{loch_s3_sis_api_data_path}/terms.tsv'
        IAM_ROLE '{redshift_iam_role}'
        DELIMITER '\\t';"""
    if not transaction.execute(resolve_sql_template_string(template)):
        return False
    return True
def unload_enrollment_terms(term_ids):
    query = resolve_sql_template_string(
        """
        UNLOAD ('SELECT *, GETDATE() AS analytics_generated_at
            FROM {schema}.student_enrollment_terms
            WHERE term_id=ANY(\'\'{{{term_ids}}}\'\')')
            TO '{loch_s3_boac_analytics_incremental_path}/student_enrollment_terms'
            IAM_ROLE '{redshift_iam_role}'
            ENCRYPTED
            DELIMITER AS '\\t'
            ALLOWOVERWRITE
            GZIP;
        """,
        schema=student_schema(),
        term_ids=','.join(term_ids),
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift unload: aborting job.')
def upload_to_staging(self, table):
    rows = self.rows[table]
    s3_key = f'{get_s3_sis_api_daily_path()}/staging_{table}.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/staging_{table}.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=self.staging_schema,
        table=table,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
def upload_to_staging(table, rows, term_id=None):
    if term_id:
        tsv_filename = f'staging_{table}_{term_id}.tsv'
    else:
        tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_sis_api_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/{tsv_filename}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=staging_schema(),
        table=table,
        tsv_filename=tsv_filename,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
def upload_file_to_staging(table, term_file, row_count, term_id):
    tsv_filename = f'staging_{table}_{term_id}.tsv' if term_id else f'staging_{table}.tsv'
    s3_key = f'{get_s3_sis_api_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {row_count} feeds in S3: {s3_key}')
    # Be kind; rewind
    term_file.seek(0)
    if not s3.upload_data(term_file, s3_key):
        raise BackgroundJobError(f'Failed upload {row_count} records to s3:{s3_key}. Aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/{tsv_filename}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=staging_schema(),
        table=table,
        tsv_filename=tsv_filename,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
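# Illustrative only: how a caller with an in-memory list of encoded TSV rows might hand them to
# upload_file_to_staging above via a temporary file (the non-advisee profile import uses the same
# tempfile pattern). The function name stage_rows_via_tempfile is hypothetical.
import tempfile

def stage_rows_via_tempfile(table, rows, term_id=None):
    with tempfile.TemporaryFile() as term_file:
        for row in rows:
            term_file.write(row + b'\n')
        # upload_file_to_staging rewinds the file before streaming it to S3.
        upload_file_to_staging(table, term_file, row_count=len(rows), term_id=term_id)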
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1
    # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
    # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
    # alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_degree_progress'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_degree_progress', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )
def run(self, csids=None):
    if not csids:
        all_sids = get_all_student_ids()
        if all_sids:
            csids = [row['sid'] for row in all_sids]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1
    # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
    # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
    # alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_degree_progress'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    query = resolve_sql_template_string(
        """
        CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
            DATABASE '{redshift_schema_student}_staging_ext_tmp'
            IAM_ROLE '{redshift_iam_role}'
            CREATE EXTERNAL DATABASE IF NOT EXISTS;
        CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress (
            sid VARCHAR,
            feed VARCHAR(MAX)
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\t'
        STORED AS TEXTFILE
        LOCATION '{loch_s3_sis_api_data_path}/degree_progress';
        DELETE FROM {redshift_schema_student}_staging.sis_api_degree_progress
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}_staging.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
        DROP TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress;
        DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute_ddl_script(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    redshift.execute('VACUUM; ANALYZE;')
    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )
def run(self, term_id=None):
    if not term_id:
        term_id = current_term_id()
    canvas_course_ids = [row['canvas_course_id'] for row in get_enrolled_canvas_sites_for_term(term_id)]
    app.logger.info(f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for course_id in canvas_course_ids:
        app.logger.info(
            f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})',
        )
        feed = canvas_api.get_course_enrollments(course_id)
        if feed:
            success_count += 1
            for enrollment in feed:
                user_id = str(enrollment.get('user_id'))
                last_activity_at = str(enrollment.get('last_activity_at') or '')
                rows.append('\t'.join([str(course_id), user_id, str(term_id), last_activity_at, json.dumps(enrollment)]))
        else:
            failure_count += 1
            app.logger.error(f'Canvas enrollments API import failed for course id {course_id}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        COPY {redshift_schema_student}_staging.canvas_api_enrollments
            FROM '{loch_s3_sis_api_data_path}/canvas_api_enrollments_{term_id}.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t'
            TIMEFORMAT 'YYYY-MM-DDTHH:MI:SSZ';
        DELETE FROM {redshift_schema_student}.canvas_api_enrollments
            WHERE term_id = '{term_id}'
            AND course_id IN
            (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{failure_count} failed.'
    )
def run(self):
    app.logger.info('Starting ASC profile generation job...')
    asc_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER by sid, UPPER(team_name)',
        schema=asc_schema_identifier,
    )
    profile_rows = []
    sids_for_inactive_deletion = []
    for sid, rows_for_student in groupby(asc_rows, operator.itemgetter('sid')):
        rows_for_student = list(rows_for_student)
        # Since BOAC believes (falsely) that isActiveAsc and statusAsc are attributes of a student, not
        # a team membership, a bit of brutal simplification is needed. Students who are active in at least
        # one sport have inactive team memberships dropped.
        any_active_athletics = reduce(operator.or_, [r['active'] for r in rows_for_student], False)
        if any_active_athletics:
            rows_for_student = [r for r in rows_for_student if r['active']]
            sids_for_inactive_deletion.append(sid)
        athletics_profile = {
            'athletics': [],
            'inIntensiveCohort': rows_for_student[0]['intensive'],
            'isActiveAsc': rows_for_student[0]['active'],
            'statusAsc': rows_for_student[0]['status_asc'],
        }
        for row in rows_for_student:
            athletics_profile['athletics'].append({
                'groupCode': row['group_code'],
                'groupName': row['group_name'],
                'name': row['group_name'],
                'teamCode': row['team_code'],
                'teamName': row['team_name'],
            })
        profile_rows.append(encoded_tsv_row([sid, json.dumps(athletics_profile)]))
    s3_key = f'{get_s3_asc_daily_path()}/athletics_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_asc}.student_profiles;
        COPY {redshift_schema_asc}.student_profiles
            FROM '{loch_s3_asc_data_path}/athletics_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(asc_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    if sids_for_inactive_deletion:
        redshift.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion,),
        )
        rds.execute(
            f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
            params=(sids_for_inactive_deletion,),
        )
    return 'ASC profile generation complete.'
def run(self):
    app.logger.info('Starting COE schema creation job...')
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
    # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
    # ought to set up proper staging in conjunction with verification. It's also possible that a persistent
    # external schema isn't needed.
    if redshift.execute_ddl_script(resolved_ddl):
        app.logger.info('COE external schema created.')
        verify_external_schema(external_schema, resolved_ddl)
    else:
        raise BackgroundJobError('COE external schema creation failed.')
    coe_rows = redshift.fetch(
        'SELECT * FROM {schema}.students ORDER by sid',
        schema=internal_schema_identifier,
    )
    profile_rows = []
    index = 1
    for sid, rows_for_student in groupby(coe_rows, operator.itemgetter('sid')):
        app.logger.info(f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})')
        index += 1
        row_for_student = list(rows_for_student)[0]
        coe_profile = {
            'advisorUid': row_for_student.get('advisor_ldap_uid'),
            'gender': row_for_student.get('gender'),
            'ethnicity': row_for_student.get('ethnicity'),
            'minority': row_for_student.get('minority'),
            'didPrep': row_for_student.get('did_prep'),
            'prepEligible': row_for_student.get('prep_eligible'),
            'didTprep': row_for_student.get('did_tprep'),
            'tprepEligible': row_for_student.get('tprep_eligible'),
            'sat1read': row_for_student.get('sat1read'),
            'sat1math': row_for_student.get('sat1math'),
            'sat2math': row_for_student.get('sat2math'),
            'inMet': row_for_student.get('in_met'),
            'gradTerm': row_for_student.get('grad_term'),
            'gradYear': row_for_student.get('grad_year'),
            'probation': row_for_student.get('probation'),
            'status': row_for_student.get('status'),
        }
        profile_rows.append(encoded_tsv_row([sid, json.dumps(coe_profile)]))
    s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
    app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(profile_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_coe}.student_profiles
            FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(coe_rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Error refreshing RDS indexes.')
    return 'COE internal schema created.'
def run(self, csids=None, term_id=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    if not term_id:
        term_id = current_term_id()
    app.logger.info(f'Starting SIS enrollments API import job for term {term_id}, {len(csids)} students...')
    rows = []
    success_count = 0
    no_enrollments_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS enrollments API for SID {csid}, term {term_id} ({index} of {len(csids)})')
        feed = sis_enrollments_api.get_drops_and_midterms(csid, term_id)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), str(term_id), json.dumps(feed)]))
        elif feed is False:
            app.logger.info(f'SID {csid} returned no enrollments for term {term_id}.')
            no_enrollments_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS enrollments API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/drops_and_midterms_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(
        f"DELETE FROM {self.destination_schema}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}'",
    ):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_drops_and_midterms', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_drops_and_midterms
            WHERE term_id = '{term_id}'
            AND sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.sis_api_drops_and_midterms
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{no_enrollments_count} returned no enrollments, {failure_count} failed.'
    )
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting term GPA import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_registrations_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching term GPAs for SID {csid}, ({index} of {len(csids)})')
        feed = sis_student_api.get_term_gpas(csid)
        if feed:
            success_count += 1
            for term_id, term_data in feed.items():
                rows.append(
                    '\t'.join([
                        str(csid),
                        str(term_id),
                        str(term_data.get('gpa') or '0'),
                        str(term_data.get('unitsTakenForGpa') or '0'),
                    ]),
                )
        elif feed == {}:
            app.logger.info(f'No registrations found for SID {csid}.')
            no_registrations_count += 1
        else:
            failure_count += 1
            app.logger.error(f'Term GPA import failed for SID {csid}.')
        index += 1
    if success_count == 0:
        app.logger.error('Failed to import term GPAs: aborting job.')
        return False
    s3_key = f'{get_s3_sis_api_daily_path()}/term_gpas.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.student_term_gpas'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.student_term_gpas', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string("""
        DELETE FROM {redshift_schema_student}.student_term_gpas
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.student_term_gpas);
        INSERT INTO {redshift_schema_student}.student_term_gpas
            (SELECT * FROM {redshift_schema_student}_staging.student_term_gpas);
        TRUNCATE TABLE {redshift_schema_student}_staging.student_term_gpas;
    """)
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error inserting staging entries into destination: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(csids, rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            app.logger.error('Failed to refresh RDS indexes.')
            return False
    return (
        f'Term GPA import completed: {success_count} succeeded, '
        f'{no_registrations_count} returned no registrations, {failure_count} failed.'
    )
def run(self):
    app.logger.info('ASC import: Fetch team and student athlete data from ASC API')
    api_results = get_asc_feed()
    if 'error' in api_results:
        raise BackgroundJobError('ASC import: Error from external API: {}'.format(api_results['error']))
    elif not api_results:
        raise BackgroundJobError('ASC import: API returned zero students')
    sync_date = api_results[0]['SyncDate']
    if sync_date != api_results[-1]['SyncDate']:
        raise BackgroundJobError(f'ASC import: SyncDate conflict in ASC API: {api_results[0]} vs. {api_results[-1]}')
    rows = []
    for r in api_results:
        if r['AcadYr'] == app.config['ASC_THIS_ACAD_YR'] and r['SportCode']:
            asc_code = r['SportCodeCore']
            if asc_code in SPORT_TRANSLATIONS:
                group_code = r['SportCode']
                data = [
                    r['SID'],
                    str(r.get('ActiveYN', 'No') == 'Yes'),
                    str(r.get('IntensiveYN', 'No') == 'Yes'),
                    r.get('SportStatus', ''),
                    group_code,
                    _unambiguous_group_name(r['Sport'], group_code),
                    SPORT_TRANSLATIONS[asc_code],
                    r['SportCore'],
                ]
                rows.append(encoded_tsv_row(data))
            else:
                sid = r['SID']
                app.logger.error(f'ASC import: Unmapped asc_code {asc_code} has ActiveYN for sid={sid}')
    s3_key = f'{get_s3_asc_daily_path()}/asc_api_raw_response_{sync_date}.tsv'
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Copy data in S3 file to Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_asc}.students;
        COPY {redshift_schema_asc}.students
            FROM 's3://{s3_bucket}/{s3_key}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        s3_bucket=app.config['LOCH_S3_BUCKET'],
        s3_key=s3_key,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    status = {
        'this_sync_date': sync_date,
        'api_results_count': len(api_results),
    }
    app.logger.info(f'ASC import: Successfully completed import job: {str(status)}')
    return status
def run(self, load_mode='new'):
    all_sids = [row['sid'] for row in get_all_student_ids()]
    previous_backfills = {row['sid'] for row in get_sids_with_registration_imports()}
    if load_mode == 'new':
        sids = list(set(all_sids).difference(previous_backfills))
    elif load_mode == 'batch':
        new_sids = list(set(all_sids).difference(previous_backfills))
        limit = app.config['CYCLICAL_API_IMPORT_BATCH_SIZE'] - len(new_sids)
        if limit > 0:
            oldest_backfills = [row['sid'] for row in get_active_sids_with_oldest_registration_imports(limit=limit)]
            sids = new_sids + oldest_backfills
        else:
            sids = new_sids
    elif load_mode == 'all':
        sids = all_sids
    app.logger.info(f'Starting registrations/demographics import job for {len(sids)} students...')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
        'api_demographics': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids)
    if load_mode != 'new' and (len(successes) == 0) and (len(failures) > 0):
        raise BackgroundJobError('Failed to import registration histories: aborting job.')
    for key in rows.keys():
        s3_key = f'{get_s3_sis_api_daily_path(use_edl_if_feature_flag=True)}/{key}.tsv'
        app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows[key], s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')
        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(f'TRUNCATE {student_schema()}_staging.student_{key}'):
            raise BackgroundJobError('Error truncating old staging rows: aborting job.')
        if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.student_{key}', s3_key):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {student_schema}.student_{table_key}
                WHERE sid IN (SELECT sid FROM {student_schema}_staging.student_{table_key});
            INSERT INTO {student_schema}.student_{table_key}
                (SELECT * FROM {student_schema}_staging.student_{table_key});
            TRUNCATE TABLE {student_schema}_staging.student_{table_key};
            """,
            table_key=key,
            student_schema=student_schema(),
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(sids, rows['term_gpas'], transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Failed to refresh RDS indexes.')
    update_registration_import_status(successes, failures)
    return f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
def run(self, term_id=None):
    if not term_id:
        term_id = current_term_id()
    canvas_course_ids = [row['canvas_course_id'] for row in get_enrolled_canvas_sites_for_term(term_id)]
    app.logger.info(f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for course_id in canvas_course_ids:
        app.logger.info(
            f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})',
        )
        feed = canvas_api.get_course_enrollments(course_id)
        if feed:
            success_count += 1
            for enrollment in feed:
                user_id = enrollment.get('user_id')
                last_activity_at = enrollment.get('last_activity_at') or ''
                rows.append(encoded_tsv_row([course_id, user_id, term_id, last_activity_at, json.dumps(enrollment)]))
        else:
            failure_count += 1
            app.logger.error(f'Canvas enrollments API import failed for course id {course_id}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments/canvas_api_enrollments_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
            DATABASE '{redshift_schema_student}_staging_ext_tmp'
            IAM_ROLE '{redshift_iam_role}'
            CREATE EXTERNAL DATABASE IF NOT EXISTS;
        CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments (
            course_id VARCHAR,
            user_id VARCHAR,
            term_id VARCHAR,
            last_activity_at TIMESTAMP,
            feed VARCHAR
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\t'
        STORED AS TEXTFILE
        LOCATION '{loch_s3_sis_api_data_path}/canvas_api_enrollments';
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        INSERT INTO {redshift_schema_student}_staging.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments);
        DROP TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments;
        DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;
        DELETE FROM {redshift_schema_student}.canvas_api_enrollments
            WHERE term_id = '{term_id}'
            AND course_id IN
            (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.canvas_api_enrollments
            (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute_ddl_script(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return (
        f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{failure_count} failed.'
    )
def run(self, load_mode='batch'):
    new_sids = [row['sid'] for row in get_non_advisees_without_registration_imports()]
    # The size of the non-advisee population makes it unlikely that a one-shot load of all these slow feeds will
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        if max_batch >= len(new_sids):
            sids = new_sids
        else:
            sids = new_sids[0:max_batch]
    app.logger.info(f'Starting registrations import job for {len(sids)} non-advisees...')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.load_concurrently(rows, sids)
    if len(successes) > 0:
        for key in rows.keys():
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError('Error on S3 upload: aborting job.')
            app.logger.info('Will copy S3 feeds into Redshift...')
            if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.hist_enr_{key}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')
            if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.hist_enr_{key}', s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {redshift_schema_student}.hist_enr_{table_key}
                    WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                INSERT INTO {redshift_schema_student}.hist_enr_{table_key}
                    (SELECT * FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {redshift_schema_student}_staging.hist_enr_{table_key};
                """,
                table_key=key,
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    return f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
def run(self, load_mode='batch'):
    new_sids = [row['sid'] for row in get_non_advisees_without_registration_imports()]
    # Owing to the size of the non-advisee population, a one-shot load of all these slow feeds may not
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    #
    # (With the logic change in NS-1155 to pre-screen SIDs for student affiliation in CalNet, the cutoff is less
    # likely to be triggered.)
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        if max_batch >= len(new_sids):
            sids = new_sids
        else:
            sids = new_sids[0:max_batch]
    app.logger.info(f'Starting import of historical registration data for {len(sids)} students...')
    redshift.execute('VACUUM; ANALYZE;')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids, include_demographics=False)
    for key in rows.keys():
        if len(rows[key]) > 0:
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(f'Upload {key} data to s3:{s3_key}. The file represents {len(rows[key])} students.')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError(f'Error during S3 upload: {s3_key}. Aborting job.')
            staging_table = f'{student_schema()}_staging.hist_enr_{key}'
            if not redshift.execute(f'TRUNCATE {staging_table}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')
            app.logger.info(f'Populate {staging_table} (Redshift table) with s3:{s3_key}')
            if not redshift.copy_tsv_from_s3(staging_table, s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')
            app.logger.info(f'Insert student data into {student_schema()}.hist_enr_{key}')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {student_schema}.hist_enr_{table_key}
                    WHERE sid IN (SELECT sid FROM {student_schema}_staging.hist_enr_{table_key});
                INSERT INTO {student_schema}.hist_enr_{table_key}
                    (SELECT * FROM {student_schema}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {student_schema}_staging.hist_enr_{table_key};
                """,
                table_key=key,
                student_schema=student_schema(),
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    redshift.execute('VACUUM; ANALYZE;')
    return f'Finished import of historical registration data: {len(successes)} successes and {len(failures)} failures.'