Example #1
def _put_instructor_data_to_s3(uids):
    app.logger.info(f'Starting CalNet import job for {len(uids)} instructors...')
    all_attributes = calnet.client(app).search_uids(uids)
    if len(uids) != len(all_attributes):
        ldap_uids = [person['uid'] for person in all_attributes]
        missing = set(uids) - set(ldap_uids)
        app.logger.warning(f'Looked for {len(uids)} instructor UIDs but only found {len(all_attributes)}: missing {missing}')

    serialized_data = ''
    for index, a in enumerate(all_attributes):
        uid = a['uid']
        affiliations = a['affiliations']
        first_name, last_name = calnet.split_sortable_name(a)
        serialized_data += json.dumps({
            'affiliations': ','.join(affiliations) if isinstance(affiliations, list) else affiliations,
            'campus_email': a['campus_email'],
            'dept_code': calnet.get_dept_code(a),
            'email': a['email'],
            'first_name': first_name,
            'last_name': last_name,
            'ldap_uid': uid,
            'csid': a['csid'],
            'title': a['title'],
        }) + '\n'
    s3.upload_data(serialized_data, f'{get_s3_calnet_daily_path()}/instructors/instructors.json')
    app.logger.info(f'Uploaded data for {len(all_attributes)} instructors')
Example #2
def _put_booking_data_to_s3(date):
    datestamp = date.strftime('%Y-%m-%d')
    app.logger.info(f'Starting YCBM bookings import for {datestamp}...')

    bookings = ycbm_api.get_bookings_for_date(date)
    if not bookings:
        app.logger.info(f'No bookings found for {datestamp}')
        return

    imported_at = utc_now().strftime('%Y-%m-%dT%H:%M:%SZ')
    serialized_data = ''
    for b in bookings:
        b['importedAt'] = imported_at
        # Make JsonSerDe schema creation easier in Redshift: transform arrays to dicts, and output one JSON record per line in text file in S3.
        answers_dict = {}
        for a in b.get('answers', []):
            if 'code' in a and 'string' in a:
                answers_dict[a['code'].lower()] = a['string']
        b['answers'] = answers_dict
        serialized_data += json.dumps(b) + '\n'
    # Upload one copy to the daily path, which we keep for a few days in S3 in case something goes wrong and we need to
    # recover an earlier run.
    s3.upload_data(
        serialized_data,
        f'{get_s3_ycbm_daily_path()}/bookings/{datestamp}/bookings.json')
    # Upload one copy to the archive path which we expect to keep as our permanent record.
    s3.upload_data(
        serialized_data,
        f"{app.config['LOCH_S3_YCBM_DATA_PATH']}/archive/{hashed_datestamp(date)}/bookings/bookings.json"
    )
    app.logger.info(
        f'Uploaded data for {len(bookings)} bookings on {datestamp}')
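
The loop above flattens each booking's answers array into a plain dict keyed by lower-cased answer code, so every serialized record stays one level deep for the Redshift JsonSerDe. Below is a standalone sketch of that transform; the sample booking payload is invented for illustration.

def flatten_answers(booking):
    # Collapse the answers array into a flat dict keyed by lower-cased code,
    # mirroring the loop in the example above.
    answers = {}
    for answer in booking.get('answers', []):
        if 'code' in answer and 'string' in answer:
            answers[answer['code'].lower()] = answer['string']
    booking['answers'] = answers
    return booking


sample_booking = {
    'id': 'abc-123',
    'answers': [
        {'code': 'EMAIL', 'string': 'oski@berkeley.edu'},
        {'code': 'FNAME', 'string': 'Oski'},
    ],
}
assert flatten_answers(sample_booking)['answers'] == {
    'email': 'oski@berkeley.edu',
    'fname': 'Oski',
}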
Example #3
def _put_advisee_data_to_s3(sids):
    app.logger.info(f'Starting CalNet import job for {len(sids)} advisees...')
    all_attributes = calnet.client(app).search_csids(sids)
    if len(sids) != len(all_attributes):
        ldap_sids = [person['csid'] for person in all_attributes]
        missing = set(sids) - set(ldap_sids)
        app.logger.warning(f'Looked for {len(sids)} advisee SIDs but only found {len(all_attributes)}: missing {missing}')

    serialized_data = ''
    for index, a in enumerate(all_attributes):
        sid = a['csid']
        affiliations = a['affiliations']
        first_name, last_name = calnet.split_sortable_name(a)
        # JsonSerDe in Redshift schema creation requires one and only one JSON record per line in text file in S3.
        serialized_data += json.dumps({
            'affiliations': ','.join(affiliations) if isinstance(affiliations, list) else affiliations,
            'campus_email': a['campus_email'],
            'email': a['email'],
            'first_name': first_name,
            'last_name': last_name,
            'ldap_uid': a['uid'],
            'sid': sid,
        }) + '\n'
    s3.upload_data(serialized_data, f'{get_s3_calnet_daily_path()}/advisees/advisees.json')
    app.logger.info(f'Uploaded data for {len(all_attributes)} advisees')
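
As the inline comment notes, the Redshift JsonSerDe expects exactly one JSON record per line, which is why these examples concatenate json.dumps(...) + '\n' rather than dumping a single JSON array. Here is a minimal, self-contained sketch of that newline-delimited serialization; field names and values are illustrative only.

import json


def to_json_lines(records):
    # One compact JSON object per line, each terminated by a newline.
    return ''.join(json.dumps(record) + '\n' for record in records)


sample = [
    {'ldap_uid': '123456', 'sid': '11667051', 'first_name': 'Oski', 'last_name': 'Bear'},
    {'ldap_uid': '654321', 'sid': '22223333', 'first_name': 'Alma', 'last_name': 'Mater'},
]
print(to_json_lines(sample), end='')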
Example #4
    def update_manifests(self):
        app.logger.info('Updating manifests...')

        # Because the SIS S3 copy is managed by a different application running on a different schedule,
        # it may have been made before midnight by Nessie-time.
        s3_sis_daily = get_s3_sis_daily_path()
        if not s3.get_keys_with_prefix(s3_sis_daily):
            s3_sis_daily = get_s3_sis_daily_path(datetime.now() - timedelta(days=1))
            if not s3.get_keys_with_prefix(s3_sis_daily):
                raise BackgroundJobError('No timely SIS S3 data found')
            else:
                app.logger.info('Falling back to SIS S3 daily data for yesterday')

        courses_daily = s3.get_keys_with_prefix(s3_sis_daily + '/courses', full_objects=True)
        courses_historical = s3.get_keys_with_prefix(app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/courses', full_objects=True)
        enrollments_daily = s3.get_keys_with_prefix(s3_sis_daily + '/enrollments', full_objects=True)
        enrollments_historical = s3.get_keys_with_prefix(app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/enrollments', full_objects=True)

        def deduplicate(prefix, s3list):
            filename_map = {}
            for s3obj in s3list:
                m = re.match(r'.+/(.+\.gz)', s3obj['Key'])
                if m:
                    filename_map[m[1]] = s3obj
            for term_id in reverse_term_ids(include_future_terms=True):
                filename = f'{prefix}-{term_id}.gz'
                if filename not in filename_map:
                    raise BackgroundJobError(f'Expected filename {filename} not found in S3, aborting')
            return list(filename_map.values())

        all_courses = deduplicate('courses', courses_daily + courses_historical)
        all_enrollments = deduplicate('enrollments', enrollments_daily + enrollments_historical)

        def to_manifest_entry(_object):
            return {
                'url': f"s3://{app.config['LOCH_S3_BUCKET']}/{_object['Key']}",
                'meta': {'content_length': _object['Size']},
            }

        def to_manifest(objects):
            return {
                'entries': [to_manifest_entry(o) for o in objects],
            }

        courses_manifest = json.dumps(to_manifest(all_courses))
        enrollments_manifest = json.dumps(to_manifest(all_enrollments))
        courses_result = s3.upload_data(courses_manifest, app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/courses.json')
        enrollments_result = s3.upload_data(enrollments_manifest, app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/enrollments.json')
        return courses_result and enrollments_result
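
For reference, the manifests written above have the structure produced by to_manifest and to_manifest_entry: an 'entries' list whose items carry the S3 URL and object size. The bucket and key below are invented purely to illustrate that shape.

# Illustrative manifest payload; values are made up, structure mirrors the code above.
example_manifest = {
    'entries': [
        {
            'url': 's3://example-loch-bucket/sis-data/daily-2021-08-01/courses/courses-2218.gz',
            'meta': {'content_length': 1024},
        },
    ],
}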
Example #5
def update_merged_feed_status(term_id, successes, failures):
    term_id = term_id or 'all'
    redshift.execute(
        'DELETE FROM {schema}.merged_feed_status WHERE sid = ANY(%s) AND term_id = %s',
        schema=_schema(),
        params=((successes + failures), term_id),
    )
    now = datetime.utcnow().isoformat()
    success_records = ['\t'.join([sid, term_id, 'success', now]) for sid in successes]
    failure_records = ['\t'.join([sid, term_id, 'failure', now]) for sid in failures]
    rows = success_records + failure_records
    s3_key = f'{get_s3_sis_api_daily_path()}/merged_feed_status.tsv'
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error uploading merged feed status updates to S3.')
        return
    query = resolve_sql_template_string(
        """
        COPY {redshift_schema_metadata}.merged_feed_status
            FROM '{loch_s3_sis_api_data_path}/merged_feed_status.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t'
            TIMEFORMAT 'YYYY-MM-DDTHH:MI:SS';
        """
    )
    if not redshift.execute(query):
        app.logger.error('Error copying merged feed status updates to Redshift.')
Example #6
    def run(self, csids=None):
        if not csids:
            csids = [row['sid'] for row in get_all_student_ids()]
        app.logger.info(
            f'Starting SIS student API import job for {len(csids)} students...'
        )

        rows = []
        success_count = 0
        failure_count = 0
        index = 1
        for csid in csids:
            app.logger.info(
                f'Fetching SIS student API for SID {csid} ({index} of {len(csids)})'
            )
            feed = sis_student_api.get_student(csid)
            if feed:
                success_count += 1
                rows.append('\t'.join([str(csid), json.dumps(feed)]))
            else:
                failure_count += 1
                app.logger.error(
                    f'SIS student API import failed for CSID {csid}.')
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_data('\n'.join(rows), s3_key):
            app.logger.error('Error on S3 upload: aborting job.')
            return False

        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(
                f'TRUNCATE {self.destination_schema}_staging.sis_api_profiles'
        ):
            app.logger.error(
                'Error truncating old staging rows: aborting job.')
            return False
        if not redshift.copy_tsv_from_s3(
                f'{self.destination_schema}_staging.sis_api_profiles', s3_key):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema_student}.sis_api_profiles WHERE sid IN
                (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
            INSERT INTO {redshift_schema_student}.sis_api_profiles
                (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
            TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
            """, )
        if not redshift.execute(staging_to_destination_query):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False

        return f'SIS student API import job completed: {success_count} succeeded, {failure_count} failed.'
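
The staging-to-destination step above uses a delete-then-insert pattern: destination rows whose sid appears in staging are deleted, the fresh staging rows are inserted, and the staging table is cleared for the next run. A generic restatement of that SQL follows, with 'student' and 'student_staging' as placeholder schema names.

staging_upsert_sql = """
DELETE FROM student.sis_api_profiles
    WHERE sid IN (SELECT sid FROM student_staging.sis_api_profiles);
INSERT INTO student.sis_api_profiles
    (SELECT * FROM student_staging.sis_api_profiles);
TRUNCATE student_staging.sis_api_profiles;
"""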
Example #7
    def run(self, csids=None):
        if not csids:
            csids = _get_advisee_sids_without_photos()
        id_mappings = get_advisee_ids(csids=csids)
        id_mappings_by_sid = {r['sid']: r['ldap_uid'] for r in id_mappings}

        app.logger.info(
            f'Starting student photo import job for {len(csids)} students...')

        successes = []
        failures = []
        photo_not_found = []
        index = 0

        for csid in csids:
            index += 1
            app.logger.info(
                f'Fetching photo for SID {csid}, ({index} of {len(csids)})')
            uid = id_mappings_by_sid.get(csid)
            if not uid:
                app.logger.error(f'No UID found for SID {csid}.')
                failures.append(csid)
                continue

            photo = get_cal1card_photo(uid)
            if photo:
                s3_photo_key = f"{app.config['LOCH_S3_CAL1CARD_PHOTOS_PATH']}/{uid}.jpg"
                if s3.upload_data(photo,
                                  s3_photo_key,
                                  bucket=app.config['LOCH_S3_PUBLIC_BUCKET']):
                    successes.append(csid)
                else:
                    app.logger.error(f'Photo upload failed for SID {csid}.')
                    failures.append(csid)
            elif photo is False:
                app.logger.info(f'No photo returned for SID {csid}.')
                photo_not_found.append(csid)
            elif photo is None:
                app.logger.error(f'Photo import failed for SID {csid}.')
                failures.append(csid)

            sleep(app.config['CAL1CARD_PHOTO_API_THROTTLE'])

        if not successes and not photo_not_found and failures:
            raise BackgroundJobError('Failed to import student photos.')
        else:
            update_photo_import_status(successes, failures, photo_not_found)
            status = 'Student photo import completed: '
            if len(successes):
                status += f'{len(successes)} succeeded, '
            if len(photo_not_found):
                status += f'{len(photo_not_found)} had no photo available, '
            status += f'{len(failures)} failed.'
            return status
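
Note the three-way result handling above: get_cal1card_photo is treated as returning photo bytes on success, False when the student has no photo, and None on error. A tiny standalone sketch of that dispatch follows; the stub values are illustrative.

def classify_photo_result(photo):
    # Mirrors the branching in the loop above.
    if photo:
        return 'upload'
    elif photo is False:
        return 'not_found'
    elif photo is None:
        return 'error'


assert classify_photo_result(b'jpeg bytes') == 'upload'
assert classify_photo_result(False) == 'not_found'
assert classify_photo_result(None) == 'error'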
Example #8
def _put_calnet_data_to_s3(sids):
    all_attributes = calnet.client(app).search_csids(sids)
    if len(sids) != len(all_attributes):
        ldap_sids = [person['csid'] for person in all_attributes]
        missing = set(sids) - set(ldap_sids)
        app.logger.warning(
            f'Looked for {len(sids)} SIDs but only found {len(all_attributes)}: missing {missing}'
        )

    serialized_data = ''
    total_count = len(all_attributes)
    for index, a in enumerate(all_attributes):
        sid = a['csid']
        app.logger.info(
            f'CalNet import: Fetch attributes of student {sid} ({index + 1} of {total_count})'
        )
        affiliations = a['affiliations']
        first_name, last_name = _split_sortable_name(a)
        # JsonSerDe in Redshift schema creation requires one and only one JSON record per line in text file in S3.
        serialized_data += json.dumps({
            'affiliations': ','.join(affiliations) if isinstance(affiliations, list) else affiliations,
            'campus_email': a['campus_email'],
            'email': a['email'],
            'first_name': first_name,
            'last_name': last_name,
            'ldap_uid': a['uid'],
            'sid': sid,
        }) + '\n'
    s3.upload_data(serialized_data,
                   f'{get_s3_calnet_daily_path()}/persons.json')
Example #9
    def upload_to_staging(self, table):
        rows = self.rows[table]
        s3_key = f'{get_s3_sis_api_daily_path()}/staging_{table}.tsv'
        app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
        if not s3.upload_data('\n'.join(rows), s3_key):
            app.logger.error('Error on S3 upload: aborting job.')
            return False

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            COPY {staging_schema}.{table}
                FROM '{loch_s3_sis_api_data_path}/staging_{table}.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            """,
            staging_schema=self.staging_schema,
            table=table,
        )
        if not redshift.execute(query):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False
Example #10
def upload_file_to_staging(table, term_file, row_count, term_id):
    tsv_filename = f'staging_{table}_{term_id}.tsv' if term_id else f'staging_{table}.tsv'
    s3_key = f'{get_s3_sis_api_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {row_count} feeds in S3: {s3_key}')
    # Be kind; rewind
    term_file.seek(0)
    if not s3.upload_data(term_file, s3_key):
        raise BackgroundJobError(
            f'Failed upload {row_count} records to s3:{s3_key}. Aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/{tsv_filename}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=staging_schema(),
        table=table,
        tsv_filename=tsv_filename,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
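
The term_file.seek(0) above matters because the caller has just finished writing the file-like object: without rewinding, the upload would start from the current offset and send nothing. A quick illustration with an in-memory buffer standing in for term_file:

import io

buffer = io.StringIO()
buffer.write('row one\nrow two\n')
print(repr(buffer.read()))   # '' because the cursor sits at the end after writing
buffer.seek(0)               # be kind; rewind
print(repr(buffer.read()))   # 'row one\nrow two\n'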
Example #11
    def run(self, term_ids=None):
        if not term_ids:
            term_ids = reverse_term_ids()
        app.logger.info(
            f'Starting SIS terms API import job for {len(term_ids)} terms...')

        rows = []
        success_count = 0
        failure_count = 0
        index = 1
        for term_id in term_ids:
            app.logger.info(
                f'Fetching SIS terms API for term id {term_id} ({index} of {len(term_ids)})'
            )
            feed = sis_terms_api.get_term(term_id)
            if feed:
                success_count += 1
                for academic_career_term in feed:
                    for session in academic_career_term.get('sessions', []):
                        rows.append(
                            '\t'.join([
                                academic_career_term.get('id', ''),
                                academic_career_term.get('name', ''),
                                academic_career_term.get('academicCareer',
                                                         {}).get('code', ''),
                                academic_career_term.get('beginDate', ''),
                                academic_career_term.get('endDate', ''),
                                session.get('id', ''),
                                session.get('name', ''),
                                session.get('beginDate', ''),
                                session.get('endDate', ''),
                            ]))
            else:
                failure_count += 1
                app.logger.error(
                    f'SIS terms API import failed for term id {term_id}.')
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/terms.tsv'
        app.logger.info(
            f'Will stash {len(rows)} rows from {success_count} feeds in S3: {s3_key}'
        )
        if not s3.upload_data('\n'.join(rows), s3_key):
            app.logger.error('Error on S3 upload: aborting job.')
            return False

        app.logger.info('Will copy S3 feeds into Redshift...')
        with redshift.transaction() as transaction:
            if self.update_redshift(term_ids, transaction):
                transaction.commit()
                app.logger.info('Updated Redshift.')
            else:
                transaction.rollback()
                app.logger.error('Failed to update Redshift.')
                return False

        with rds.transaction() as transaction:
            if self.update_rds(rows, term_ids, transaction):
                transaction.commit()
                app.logger.info('Updated RDS.')
            else:
                transaction.rollback()
                app.logger.error('Failed to update RDS.')
                return False

        return f'SIS terms API import job completed: {success_count} succeeded, {failure_count} failed.'
Example #12
    def run(self):
        app.logger.info('Starting ASC profile generation job...')
        asc_rows = redshift.fetch(
            'SELECT * FROM {schema}.students ORDER by sid, UPPER(team_name)',
            schema=asc_schema_identifier,
        )

        profile_rows = []
        sids_for_inactive_deletion = []

        for sid, rows_for_student in groupby(asc_rows, operator.itemgetter('sid')):
            rows_for_student = list(rows_for_student)
            # Since BOAC believes (falsely) that isActiveAsc and statusAsc are attributes of a student, not
            # a team membership, a bit of brutal simplification is needed. Students who are active in at least
            # one sport have inactive team memberships dropped.
            any_active_athletics = reduce(operator.or_, [r['active'] for r in rows_for_student], False)
            if any_active_athletics:
                rows_for_student = [r for r in rows_for_student if r['active']]
                sids_for_inactive_deletion.append(sid)
            athletics_profile = {
                'athletics': [],
                'inIntensiveCohort': rows_for_student[0]['intensive'],
                'isActiveAsc': rows_for_student[0]['active'],
                'statusAsc': rows_for_student[0]['status_asc'],
            }
            for row in rows_for_student:
                athletics_profile['athletics'].append({
                    'groupCode': row['group_code'],
                    'groupName': row['group_name'],
                    'name': row['group_name'],
                    'teamCode': row['team_code'],
                    'teamName': row['team_name'],
                })

            profile_rows.append('\t'.join([str(sid), json.dumps(athletics_profile)]))

        s3_key = f'{get_s3_asc_daily_path()}/athletics_profiles.tsv'
        app.logger.info(f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
        if not s3.upload_data('\n'.join(profile_rows), s3_key):
            app.logger.error('Error on S3 upload: aborting job.')
            return False

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            TRUNCATE {redshift_schema_asc}.student_profiles;
            COPY {redshift_schema_asc}.student_profiles
                FROM '{loch_s3_asc_data_path}/athletics_profiles.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            VACUUM;
            ANALYZE;
            """,
        )
        if not redshift.execute(query):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(asc_rows, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                app.logger.error('Error refreshing RDS indexes.')
                return False

        if sids_for_inactive_deletion:
            redshift.execute(
                f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
                params=(sids_for_inactive_deletion,),
            )
            rds.execute(
                f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
                params=(sids_for_inactive_deletion,),
            )

        return 'ASC profile generation complete.'
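
A reminder on the groupby usage above: itertools.groupby only groups adjacent rows, which is why the source query orders by sid. A small standalone illustration with invented rows:

import operator
from itertools import groupby

rows = [
    {'sid': '111', 'team_name': 'Football', 'active': True},
    {'sid': '111', 'team_name': 'Track', 'active': False},
    {'sid': '222', 'team_name': 'Swimming', 'active': True},
]
for sid, rows_for_student in groupby(rows, operator.itemgetter('sid')):
    print(sid, [r['team_name'] for r in rows_for_student])
# 111 ['Football', 'Track']
# 222 ['Swimming']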
Example #13
    def run(self, csids=None):
        if not csids:
            csids = [row['sid'] for row in get_all_student_ids()]
        app.logger.info(
            f'Starting SIS degree progress API import job for {len(csids)} students...'
        )

        rows = []
        success_count = 0
        no_information_count = 0
        failure_count = 0
        index = 1

        # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
        # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
        # alternative way to filter out non-UGRD students?
        for csid in csids:
            app.logger.info(
                f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})'
            )
            feed = sis_degree_progress_api.parsed_degree_progress(csid)
            if feed:
                success_count += 1
                rows.append('\t'.join([str(csid), json.dumps(feed)]))
            elif feed == {}:
                app.logger.info(
                    f'No degree progress information found for SID {csid}.')
                no_information_count += 1
            else:
                failure_count += 1
                app.logger.error(
                    f'SIS get_degree_progress failed for SID {csid}.')
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_data('\n'.join(rows), s3_key):
            app.logger.error('Error on S3 upload: aborting job.')
            return False

        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(
                f'TRUNCATE {self.destination_schema}_staging.sis_api_degree_progress'
        ):
            app.logger.error(
                'Error truncating old staging rows: aborting job.')
            return False
        if not redshift.copy_tsv_from_s3(
                f'{self.destination_schema}_staging.sis_api_degree_progress',
                s3_key):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema_student}.sis_api_degree_progress
                WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
            INSERT INTO {redshift_schema_student}.sis_api_degree_progress
                (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
            TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
            """, )
        if not redshift.execute(staging_to_destination_query):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False

        return (
            f'SIS degree progress API import job completed: {success_count} succeeded, '
            f'{no_information_count} returned no information, {failure_count} failed.'
        )
Example #14
    def run(self):
        app.logger.info('Starting COE schema creation job...')
        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
        # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
        # ought to set up proper staging in conjunction with verification. It's also possible that a persistent
        # external schema isn't needed.
        if redshift.execute_ddl_script(resolved_ddl):
            app.logger.info('COE external schema created.')
            if not verify_external_schema(external_schema, resolved_ddl):
                return False
        else:
            app.logger.error('COE external schema creation failed.')
            return False
        coe_rows = redshift.fetch(
            'SELECT * FROM {schema}.students ORDER by sid',
            schema=internal_schema_identifier,
        )

        profile_rows = []
        index = 1
        for sid, rows_for_student in groupby(coe_rows,
                                             operator.itemgetter('sid')):
            app.logger.info(
                f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})'
            )
            index += 1
            row_for_student = list(rows_for_student)[0]
            coe_profile = {
                'advisorUid': row_for_student.get('advisor_ldap_uid'),
                'gender': row_for_student.get('gender'),
                'ethnicity': row_for_student.get('ethnicity'),
                'minority': row_for_student.get('minority'),
                'didPrep': row_for_student.get('did_prep'),
                'prepEligible': row_for_student.get('prep_eligible'),
                'didTprep': row_for_student.get('did_tprep'),
                'tprepEligible': row_for_student.get('tprep_eligible'),
            }
            profile_rows.append('\t'.join([str(sid), json.dumps(coe_profile)]))

        s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
        app.logger.info(
            f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
        if not s3.upload_data('\n'.join(profile_rows), s3_key):
            app.logger.error('Error on S3 upload: aborting job.')
            return False

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            COPY {redshift_schema_coe}.student_profiles
                FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            VACUUM;
            ANALYZE;
            """, )
        if not redshift.execute(query):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(coe_rows, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                app.logger.error('Error refreshing RDS indexes.')
                return False

        return 'COE internal schema created.'
Example #15
    def run(self, csids=None, term_id=None):
        if not csids:
            csids = [row['sid'] for row in get_all_student_ids()]
        if not term_id:
            term_id = current_term_id()
        app.logger.info(
            f'Starting SIS enrollments API import job for term {term_id}, {len(csids)} students...'
        )

        rows = []
        success_count = 0
        no_enrollments_count = 0
        failure_count = 0
        index = 1
        for csid in csids:
            app.logger.info(
                f'Fetching SIS enrollments API for SID {csid}, term {term_id} ({index} of {len(csids)})'
            )
            feed = sis_enrollments_api.get_drops_and_midterms(csid, term_id)
            if feed:
                success_count += 1
                rows.append('\t'.join(
                    [str(csid), str(term_id),
                     json.dumps(feed)]))
            elif feed is False:
                app.logger.info(
                    f'SID {csid} returned no enrollments for term {term_id}.')
                no_enrollments_count += 1
            else:
                failure_count += 1
                app.logger.error(
                    f'SIS enrollments API import failed for CSID {csid}.')
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/drops_and_midterms_{term_id}.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_data('\n'.join(rows), s3_key):
            app.logger.error('Error on S3 upload: aborting job.')
            return False

        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(
                f"DELETE FROM {self.destination_schema}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}'"
        ):
            app.logger.error(
                'Error truncating old staging rows: aborting job.')
            return False
        if not redshift.copy_tsv_from_s3(
                f'{self.destination_schema}_staging.sis_api_drops_and_midterms',
                s3_key):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema_student}.sis_api_drops_and_midterms
                WHERE term_id = '{term_id}'
                AND sid IN
                (SELECT sid FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
            INSERT INTO {redshift_schema_student}.sis_api_drops_and_midterms
                (SELECT * FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
            DELETE FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms
                WHERE term_id = '{term_id}';
            """,
            term_id=term_id,
        )
        if not redshift.execute(staging_to_destination_query):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False

        return (
            f'SIS enrollments API import completed for term {term_id}: {success_count} succeeded, '
            f'{no_enrollments_count} returned no enrollments, {failure_count} failed.'
        )
Example #16
    def run(self, csids=None):
        if not csids:
            csids = [row['sid'] for row in get_all_student_ids()]

        app.logger.info(
            f'Starting term GPA import job for {len(csids)} students...')

        rows = []
        success_count = 0
        no_registrations_count = 0
        failure_count = 0
        index = 1
        for csid in csids:
            app.logger.info(
                f'Fetching term GPAs for SID {csid}, ({index} of {len(csids)})'
            )
            feed = sis_student_api.get_term_gpas(csid)
            if feed:
                success_count += 1
                for term_id, term_data in feed.items():
                    rows.append('\t'.join([
                        str(csid),
                        str(term_id),
                        str(term_data.get('gpa') or '0'),
                        str(term_data.get('unitsTakenForGpa') or '0')
                    ]))
            elif feed == {}:
                app.logger.info(f'No registrations found for SID {csid}.')
                no_registrations_count += 1
            else:
                failure_count += 1
                app.logger.error(f'Term GPA import failed for SID {csid}.')
            index += 1

        if success_count == 0:
            app.logger.error('Failed to import term GPAs: aborting job.')
            return False

        s3_key = f'{get_s3_sis_api_daily_path()}/term_gpas.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_data('\n'.join(rows), s3_key):
            app.logger.error('Error on S3 upload: aborting job.')
            return False

        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(
                f'TRUNCATE {self.destination_schema}_staging.student_term_gpas'
        ):
            app.logger.error(
                'Error truncating old staging rows: aborting job.')
            return False
        if not redshift.copy_tsv_from_s3(
                f'{self.destination_schema}_staging.student_term_gpas',
                s3_key):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False
        staging_to_destination_query = resolve_sql_template_string("""
            DELETE FROM {redshift_schema_student}.student_term_gpas
                WHERE sid IN
                (SELECT sid FROM {redshift_schema_student}_staging.student_term_gpas);
            INSERT INTO {redshift_schema_student}.student_term_gpas
                (SELECT * FROM {redshift_schema_student}_staging.student_term_gpas);
            TRUNCATE TABLE {redshift_schema_student}_staging.student_term_gpas;
            """)
        if not redshift.execute(staging_to_destination_query):
            app.logger.error(
                'Error inserting staging entries into destination: aborting job.'
            )
            return False

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(csids, rows, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                app.logger.error('Failed to refresh RDS indexes.')
                return False

        return (
            f'Term GPA import completed: {success_count} succeeded, '
            f'{no_registrations_count} returned no registrations, {failure_count} failed.'
        )
Example #17
    def _upload_data_to_s3(self, daily_path, historical_path):
        s3.upload_data('some futuristic course data', f'{daily_path}/courses/courses-2182.gz')
        s3.upload_data('some futuristic enrollment data', f'{daily_path}/enrollments/enrollments-2182.gz')
        s3.upload_data('some new course data', f'{daily_path}/courses/courses-2178.gz')
        s3.upload_data('some new enrollment data', f'{daily_path}/enrollments/enrollments-2178.gz')
        s3.upload_data('some old course data', f'{historical_path}/courses/courses-2175.gz')
        s3.upload_data('some old enrollment data', f'{historical_path}/enrollments/enrollments-2175.gz')
        s3.upload_data('some older course data', f'{historical_path}/courses/courses-2172.gz')
        s3.upload_data('some older enrollment data', f'{historical_path}/enrollments/enrollments-2172.gz')
        s3.upload_data('some perfectly antique course data', f'{historical_path}/courses/courses-2168.gz')
        s3.upload_data('some perfectly antique enrollment data', f'{historical_path}/enrollments/enrollments-2168.gz')
        s3.upload_data('some ancient course data', f'{historical_path}/courses/courses-2165.gz')
        s3.upload_data('some ancient enrollment data', f'{historical_path}/enrollments/enrollments-2165.gz')
Example #18
    def run(self):
        app.logger.info('ASC import: Fetch team and student athlete data from ASC API')
        api_results = get_asc_feed()
        if 'error' in api_results:
            app.logger.error('ASC import: Error from external API: {}'.format(
                api_results['error']))
            status = False
        elif not api_results:
            app.logger.error('ASC import: API returned zero students')
            status = False
        else:
            sync_date = api_results[0]['SyncDate']
            if sync_date != api_results[-1]['SyncDate']:
                app.logger.error(
                    f'ASC import: SyncDate conflict in ASC API: {api_results[0]} vs. {api_results[-1]}'
                )
                status = False
            else:
                rows = []
                for r in api_results:
                    if r['AcadYr'] == app.config['ASC_THIS_ACAD_YR'] and r['SportCode']:
                        asc_code = r['SportCodeCore']
                        if asc_code in SPORT_TRANSLATIONS:
                            group_code = r['SportCode']
                            data = [
                                r['SID'],
                                str(r.get('ActiveYN', 'No') == 'Yes'),
                                str(r.get('IntensiveYN', 'No') == 'Yes'),
                                r.get('SportStatus', ''),
                                group_code,
                                _unambiguous_group_name(
                                    r['Sport'], group_code),
                                SPORT_TRANSLATIONS[asc_code],
                                r['SportCore'],
                            ]
                            rows.append('\t'.join(data))
                        else:
                            sid = r['SID']
                            app.logger.error(
                                f'ASC import: Unmapped asc_code {asc_code} has ActiveYN for sid={sid}'
                            )

                s3_key = f'{get_s3_asc_daily_path()}/asc_api_raw_response_{sync_date}.tsv'
                if not s3.upload_data('\n'.join(rows), s3_key):
                    app.logger.error('Error on S3 upload: aborting job.')
                    return False

                status = {
                    'this_sync_date': sync_date,
                    'api_results_count': len(api_results),
                }
                app.logger.info('Copy data in S3 file to Redshift...')
                query = resolve_sql_template_string(
                    """
                    TRUNCATE {redshift_schema_asc}.students;
                    COPY {redshift_schema_asc}.students
                        FROM 's3://{s3_bucket}/{s3_key}'
                        IAM_ROLE '{redshift_iam_role}'
                        DELIMITER '\\t';
                    VACUUM;
                    ANALYZE;
                    """,
                    s3_bucket=app.config['LOCH_S3_BUCKET'],
                    s3_key=s3_key,
                )
                if not redshift.execute(query):
                    app.logger.error('Error on Redshift copy: aborting job.')
                    return False
                app.logger.info(
                    f'ASC import: Successfully completed import job: {str(status)}'
                )
        return status
Example #19
    def run(self, term_id=None):
        if not term_id:
            term_id = current_term_id()
        canvas_course_ids = [
            row['canvas_course_id']
            for row in get_enrolled_canvas_sites_for_term(term_id)
        ]

        app.logger.info(
            f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...'
        )

        rows = []
        success_count = 0
        failure_count = 0
        index = 1
        for course_id in canvas_course_ids:
            app.logger.info(
                f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})'
            )
            feed = canvas_api.get_course_enrollments(course_id)
            if feed:
                success_count += 1
                for enrollment in feed:
                    user_id = str(enrollment.get('user_id'))
                    last_activity_at = str(
                        enrollment.get('last_activity_at') or '')
                    rows.append('\t'.join([
                        str(course_id), user_id,
                        str(term_id), last_activity_at,
                        json.dumps(enrollment)
                    ]))
            else:
                failure_count += 1
                app.logger.error(
                    f'Canvas enrollments API import failed for course id {course_id}.'
                )
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments_{term_id}.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_data('\n'.join(rows), s3_key):
            app.logger.error('Error on S3 upload: aborting job.')
            return False

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
            COPY {redshift_schema_student}_staging.canvas_api_enrollments
                FROM '{loch_s3_sis_api_data_path}/canvas_api_enrollments_{term_id}.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t'
                TIMEFORMAT 'YYYY-MM-DDTHH:MI:SSZ';
            DELETE FROM {redshift_schema_student}.canvas_api_enrollments
                WHERE term_id = '{term_id}'
                AND course_id IN
                (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
            INSERT INTO {redshift_schema_student}.canvas_api_enrollments
                (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
            DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments
                WHERE term_id = '{term_id}';
            """,
            term_id=term_id,
        )
        if not redshift.execute(query):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False

        return (
            f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
            f'{failure_count} failed.')
Example #20
    def test_fallback_update_manifests(self, app):
        """Uses yesterday's news if today's is unavailable."""
        with mock_s3(app):
            yesterday = datetime.now() - timedelta(days=1)
            daily_path = get_s3_sis_daily_path(yesterday)
            historical_path = app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical'
            manifest_path = app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests'

            s3.upload_data('some new course data',
                           f'{daily_path}/courses/courses-aaa.gz')
            s3.upload_data('some more new course data',
                           f'{daily_path}/courses/courses-bbb.gz')
            s3.upload_data('some new enrollment data',
                           f'{daily_path}/enrollments/enrollments-ccc.gz')
            s3.upload_data('some old course data',
                           f'{historical_path}/courses/courses-ddd.gz')
            s3.upload_data(
                'some old enrollment data',
                f'{historical_path}/enrollments/enrollments-eee.gz')
            s3.upload_data(
                'some perfectly antique enrollment data',
                f'{historical_path}/enrollments/enrollments-fff.gz')

            assert CreateSisSchema().update_manifests()

            courses_manifest = json.loads(
                s3.get_object_text(manifest_path + '/courses.json'))
            assert len(courses_manifest['entries']) == 3
            assert courses_manifest['entries'][0]['url'] == (
                f's3://{app.config["LOCH_S3_BUCKET"]}/{daily_path}/courses/courses-aaa.gz'
            )
            assert courses_manifest['entries'][0]['meta']['content_length'] == 20

            enrollments_manifest = json.loads(
                s3.get_object_text(manifest_path + '/enrollments.json'))
            assert len(enrollments_manifest['entries']) == 3
            assert (
                enrollments_manifest['entries'][2]['url'] ==
                f's3://{app.config["LOCH_S3_BUCKET"]}/{historical_path}/enrollments/enrollments-fff.gz'
            )
            assert enrollments_manifest['entries'][2]['meta']['content_length'] == 38
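
The content_length assertions line up with the byte lengths of the strings uploaded earlier in the test, which is where the 20 and 38 come from:

assert len('some new course data') == 20
assert len('some perfectly antique enrollment data') == 38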