def _process_demographics_feeds(app_arg, chunk):
    """Build a TSV tempfile of demographics feeds for one chunk of students.

    ``chunk`` maps SID -> raw demographics rows. The rows for each student are
    collapsed into a single JSON feed (gender, ethnicities, nationalities,
    underrepresented flag, visa) and written as a ``[sid, feed]`` TSV line.

    Returns the open TemporaryFile holding all written lines; the caller is
    responsible for consuming/closing it.
    """
    with app_arg.app_context():
        app_arg.logger.debug(f'{current_thread().name} will process demographics feeds chunk ({len(chunk)} records)')
        feeds = TemporaryFile()
        for sid, rows in chunk.items():
            gender = None
            visa = None
            nationalities = set()
            ethnic_map = {}
            for r in rows:
                # TODO: Prefer gender identity once available (NS-1073)
                gender = r['gender']
                if r['visa_type']:
                    visa = {'status': r['visa_status'], 'type': r['visa_type']}
                if r['citizenship_country']:
                    nationalities.add(r['citizenship_country'])
                if r['ethnic_group']:
                    if r['ethnic_group'] not in ethnic_map:
                        ethnic_map[r['ethnic_group']] = set()
                    ethnic_map[r['ethnic_group']].add(r['ethnicity'])
            feed = {
                # Use .get() rather than [] so an unmapped gender code yields
                # None instead of a KeyError that would abort the whole chunk
                # (consistent with the threaded build_target_feeds variant).
                'gender': GENDER_CODE_MAP.get(gender),
                'ethnicities': _simplified_ethnicities(ethnic_map),
                'nationalities': sorted(nationalities),
                'underrepresented': not UNDERREPRESENTED_GROUPS.isdisjoint(ethnic_map.keys()),
                'visa': visa,
            }
            write_to_tsv_file(feeds, [sid, json.dumps(feed)])
        app_arg.logger.debug(f'{current_thread().name} wrote all feeds, returning TSV tempfile')
        return feeds
def add_demographics_rows(sid, feed, feed_files, feed_counts):
    """Explode one student's demographics feed into per-table TSV rows.

    Writes ethnicity, demographics and (when present) visa rows for the given
    SID, incrementing ``feed_counts`` per row written, and returns the parsed
    feed (falsy when nothing could be parsed). Under the EDL feature flag the
    feed is already parsed; otherwise it is parsed via the SIS API parser.
    """
    use_edl = app.config['FEATURE_FLAG_EDL_DEMOGRAPHICS']
    parsed = feed if use_edl else parse_sis_demographics_api(feed)
    if not parsed:
        return parsed
    if use_edl:
        ethnicity_values = filter_ethnicities(parsed.get('ethnicities', []))
    else:
        ethnicity_values = parsed.pop('filtered_ethnicities', [])
    for ethnicity in ethnicity_values:
        feed_counts['ethnicities'] += write_to_tsv_file(feed_files['ethnicities'], [sid, ethnicity])
    demographics_row = [sid, parsed.get('gender'), parsed.get('underrepresented', False)]
    feed_counts['demographics'] += write_to_tsv_file(feed_files['demographics'], demographics_row)
    visa = parsed.get('visa')
    if visa:
        visa_row = [sid, visa.get('status'), visa.get('type')]
        feed_counts['visas'] += write_to_tsv_file(feed_files['visas'], visa_row)
    return parsed
def generate_academic_plans_feeds(self):
    """Stage academic-plans feeds: one JSON feed per SID, uploaded as TSV."""
    app.logger.info('Staging academic plans feeds...')
    rows = redshift.fetch(f'SELECT * FROM {self.internal_schema}.student_academic_plan_index ORDER by sid')
    with TemporaryFile() as feeds:
        # Rows arrive ordered by sid, so a single groupby pass suffices.
        for sid, student_rows in groupby(rows, itemgetter('sid')):
            feed = self.generate_academic_plans_feed(list(student_rows))
            write_to_tsv_file(feeds, [sid, json.dumps(feed)])
        self._upload_file_to_staging('student_academic_plans', feeds)
def build_target_feeds(self, app_arg, source_file):
    """Build merged student-profile feeds from pickled feed components.

    Reads ``(sid, feed_components, index)`` tuples from ``source_file``,
    merges each student's profile, holds, academic status, plans and degrees
    into one feed, and writes ``[sid, feed]`` TSV lines to a TemporaryFile,
    which is returned (empty when the source yielded nothing).
    """
    with app_arg.app_context():
        app_arg.logger.debug(
            f'{current_thread().name} will process profile feeds chunk')
        target_file = TemporaryFile()
        index = None
        for sid, feed_components, index in self.get_pickled_feeds(source_file):
            # We may see results from multiple academic careers. We prefer a UGRD career if present; otherwise we look
            # for a non-Law career with the most recent entering term.
            plans = feed_components.get('plans', [])
            career_code = None
            career_admit_term = ''
            # Iterate the already-bound `plans` list (the original re-evaluated
            # the same feed_components lookup here).
            for plan_row in plans:
                if plan_row['academic_career_cd'] == 'UGRD':
                    career_code = 'UGRD'
                    break
                elif plan_row['academic_career_cd'] in {'UCBX', 'GRAD'} \
                        and plan_row['current_admit_term'] > career_admit_term:
                    career_code = plan_row['academic_career_cd']
                    career_admit_term = plan_row['current_admit_term']
            feed = {
                'identifiers': [
                    {
                        'id': sid,
                        'type': 'student-id',
                    },
                ],
            }
            self._merge_profile(feed, feed_components.get('profile'))
            self._merge_holds(feed, feed_components.get('holds'))
            self._merge_academic_status(feed, feed_components.get('profile_terms'), career_code)
            self._merge_plans(feed, plans, career_code)
            self._merge_degrees(feed, feed_components.get('degrees'))
            write_to_tsv_file(target_file, [sid, json.dumps(feed)])
        if index is None:
            app_arg.logger.warn(
                f'{current_thread().name} wrote no profile feeds, returning empty tempfile'
            )
        else:
            app_arg.logger.debug(
                f'{current_thread().name} wrote {index + 1} profile feeds, returning TSV tempfile'
            )
        return target_file
def generate_demographics_feeds(self):
    """Stage per-student demographics feeds built from paged query results."""
    app.logger.info('Staging demographics feeds...')
    # Page through the demographics query to bound memory, then concatenate
    # the pages into a single frame for per-SID grouping.
    df_chunks = []
    limit = 10000
    offset = 0
    while True:
        rows = get_demographics(limit=limit, offset=offset)
        df_chunks.append(pd.DataFrame(rows))
        if len(rows) < limit:
            break
        offset += limit
    df = pd.concat(df_chunks, ignore_index=True)
    with TemporaryFile() as feeds:
        sids_with_multiple_visas = []
        for sid, student in df.groupby('sid'):
            # TODO: Prefer gender identity once available (NS-1073)
            gender = student['gender'].drop_duplicates().dropna()
            if gender.count() > 1:
                app.logger.warn(
                    f'Found more than one gender for SID {sid}; selecting only the first.'
                )
            ethnic_map = student.groupby(['ethnic_group'])['ethnicity'].agg(set).to_dict()
            ethnicities = self.simplified_ethnicities(ethnic_map)
            nationalities = student['citizenship_country'].dropna().unique().tolist()
            # 'records' orient: the 'r' alias was deprecated and removed in
            # modern pandas.
            visa = student[['visa_type', 'visa_status']].drop_duplicates().to_dict('records')
            if len(visa) > 1:
                sids_with_multiple_visas.append(sid)
            # Guard against students whose gender values are all null (dropna
            # leaves an empty series), and use .get() so an unmapped code
            # yields None instead of a KeyError.
            gender_code = gender.iat[0] if gender.count() else None
            feed = {
                'gender': GENDER_CODE_MAP.get(gender_code),
                'ethnicities': ethnicities,
                'nationalities': nationalities,
                'underrepresented': not UNDERREPRESENTED_GROUPS.isdisjoint(ethnic_map.keys()),
                # drop_duplicates always yields at least one row, so [0] is safe.
                'visa': visa[0],
            }
            write_to_tsv_file(feeds, [sid, json.dumps(feed)])
        if len(sids_with_multiple_visas):
            app.logger.warn(
                f"SIDs with two or more visas: {', '.join(sids_with_multiple_visas)}"
            )
        self._upload_file_to_staging('student_demographics', feeds)
def build_target_feeds(self, app_arg, source_file):
    """Build demographics feeds from pickled row groups.

    Reads ``(sid, rows, index)`` tuples from ``source_file``, collapses each
    student's rows into a single JSON feed, and writes ``[sid, feed]`` TSV
    lines to a TemporaryFile, which is returned (empty when no feeds were
    produced).
    """
    with app_arg.app_context():
        app_arg.logger.debug(
            f'{current_thread().name} will process demographics feeds chunk'
        )
        target_file = TemporaryFile()
        index = None
        for sid, rows, index in self.get_pickled_feeds(source_file):
            gender = None
            visa = None
            nationalities = set()
            ethnic_map = {}
            for row in rows:
                gender = row['gender']
                if row['visa_type']:
                    visa = {'status': row['visa_status'], 'type': row['visa_type']}
                if row['citizenship_country']:
                    nationalities.add(row['citizenship_country'])
                if row['ethnic_group']:
                    # Accumulate ethnicity values per ethnic group.
                    ethnic_map.setdefault(row['ethnic_group'], set()).add(row['ethnicity'])
            feed = {
                'gender': GENDER_CODE_MAP.get(gender, None),
                'ethnicities': self._simplified_ethnicities(ethnic_map),
                'nationalities': sorted(nationalities),
                'underrepresented': not UNDERREPRESENTED_GROUPS.isdisjoint(ethnic_map.keys()),
                'visa': visa,
            }
            write_to_tsv_file(target_file, [sid, json.dumps(feed)])
        if index is None:
            app_arg.logger.warn(
                f'{current_thread().name} wrote no demographics feeds, returning empty tempfile'
            )
        else:
            app_arg.logger.debug(
                f'{current_thread().name} wrote {index + 1} demographics feeds, returning TSV tempfile'
            )
        return target_file
def generate_degree_progress_feeds(self):
    """Stage degree-progress feeds: one JSON feed per SID, uploaded as TSV."""
    app.logger.info('Staging degree progress feeds...')
    table = 'student_degree_progress'
    rows = redshift.fetch(f'SELECT * FROM {self.internal_schema}.{table}_index ORDER by sid')
    with TemporaryFile() as feeds:
        # Rows are ordered by sid, so one groupby pass covers all students.
        for sid, grouped in groupby(rows, itemgetter('sid')):
            student_rows = list(grouped)
            report_date = student_rows[0].get('report_date')
            requirements = {}
            for row in student_rows:
                requirements[row.get('requirement')] = {
                    'name': row.get('requirement_desc'),
                    'status': row.get('status'),
                }
            feed = {
                # NOTE(review): assumes report_date is a non-null date object — confirm upstream.
                'reportDate': report_date.strftime('%Y-%m-%d'),
                'requirements': requirements,
            }
            write_to_tsv_file(feeds, [sid, json.dumps(feed)])
        self._upload_file_to_staging(table, feeds)
def add_demographics_rows(sid, feed, feed_files, feed_counts):
    """Parse a SIS demographics API feed and write its TSV rows.

    Writes ethnicity, demographics and (when present) visa rows for the given
    SID, incrementing ``feed_counts`` per row written. Returns the parsed feed
    (falsy when parsing produced nothing).
    """
    parsed = parse_sis_demographics_api(feed)
    if not parsed:
        return parsed
    for ethnicity in parsed.pop('filtered_ethnicities', []):
        feed_counts['ethnicities'] += write_to_tsv_file(feed_files['ethnicities'], [sid, ethnicity])
    demographics_row = [sid, parsed.get('gender'), parsed.get('underrepresented', False)]
    feed_counts['demographics'] += write_to_tsv_file(feed_files['demographics'], demographics_row)
    visa = parsed.get('visa')
    if visa:
        visa_row = [sid, visa.get('status'), visa.get('type')]
        feed_counts['visas'] += write_to_tsv_file(feed_files['visas'], visa_row)
    return parsed
def build_target_feeds(self, app_arg, source_file):
    """Build registration feeds from pickled row groups.

    For each ``(sid, rows, index)`` tuple in ``source_file``, the most recent
    registration (if any) is turned into a feed and written as a ``[sid,
    feed]`` TSV line. Returns the TemporaryFile of written lines.
    """
    with app_arg.app_context():
        app_arg.logger.debug(
            f'{current_thread().name} will process registration feeds chunk'
        )
        target_file = TemporaryFile()
        index = None
        for sid, rows, index in self.get_pickled_feeds(source_file):
            registration = self._find_last_registration(rows)
            # Students with no registration rows simply get no feed line.
            if registration:
                write_to_tsv_file(
                    target_file,
                    [sid, json.dumps(self._generate_feed(registration))],
                )
        if index is None:
            app_arg.logger.warn(
                f'{current_thread().name} wrote no registration feeds, returning empty tempfile'
            )
        else:
            app_arg.logger.debug(
                f'{current_thread().name} wrote {index + 1} registration feeds, returning TSV tempfile'
            )
        return target_file
def generate_student_profile_feed(self, feed_elements, advisors, feed_files, feed_counts):
    """Merge one student's profile components and write all profile TSV rows.

    Builds the merged profile (identity, SIS profile, demographics, advisors),
    writes the profile row plus index/majors/holds/intended-majors/minors rows,
    and returns the merged profile dict. Returns None (writing nothing) when
    the student has no LDAP UID.
    """
    sid = feed_elements['sid']
    uid = feed_elements['ldap_uid']
    if not uid:
        return
    sis_profile = parse_merged_sis_profile(feed_elements)
    demographics = feed_elements.get('demographics_feed') and json.loads(feed_elements.get('demographics_feed'))
    if demographics:
        demographics = add_demographics_rows(sid, demographics, feed_files, feed_counts)
    advisor_feed = []
    for a in advisors:
        advisor_feed.append({
            'uid': a['advisor_uid'],
            'sid': a['advisor_sid'],
            'firstName': a['advisor_first_name'],
            'lastName': a['advisor_last_name'],
            'email': (a['advisor_campus_email'] or a['advisor_email']),
            'role': a['advisor_role'],
            'title': a['advisor_title'],
            'program': a['program'],
            'plan': a['plan'],
        })
    merged_profile = {
        'sid': sid,
        'uid': uid,
        'firstName': feed_elements.get('first_name'),
        'lastName': feed_elements.get('last_name'),
        # Join only truthy name parts: ' '.join([...]) would raise TypeError
        # when a name is None (the `or ''` fallbacks below show that missing
        # names are expected).
        'name': ' '.join(filter(None, [feed_elements.get('first_name'), feed_elements.get('last_name')])),
        'canvasUserId': feed_elements.get('canvas_user_id'),
        'canvasUserName': feed_elements.get('canvas_user_name'),
        'sisProfile': sis_profile,
        'demographics': demographics,
        'advisors': advisor_feed,
    }
    feed_counts['student_profiles'] += write_to_tsv_file(
        feed_files['student_profiles'], [sid, json.dumps(merged_profile)])
    if sis_profile:
        first_name = merged_profile['firstName'] or ''
        last_name = merged_profile['lastName'] or ''
        level = str(sis_profile.get('level', {}).get('code') or '')
        gpa = str(sis_profile.get('cumulativeGPA') or '')
        units = str(sis_profile.get('cumulativeUnits') or '')
        transfer = str(sis_profile.get('transfer') or False)
        expected_grad_term = str(sis_profile.get('expectedGraduationTerm', {}).get('id') or '')
        terms_in_attendance = str(sis_profile.get('termsInAttendance', {}) or '')
        feed_counts['student_profile_index'] += write_to_tsv_file(
            feed_files['student_profile_index'],
            [
                sid, uid, first_name, last_name, level, gpa, units, transfer,
                expected_grad_term, terms_in_attendance,
            ],
        )
        for plan in sis_profile.get('plans', []):
            if plan.get('status') == 'Active':
                feed_counts['student_majors'] += write_to_tsv_file(
                    feed_files['student_majors'],
                    [sid, plan.get('program', None), plan.get('description', None)],
                )
        for hold in sis_profile.get('holds', []):
            feed_counts['student_holds'] += write_to_tsv_file(
                feed_files['student_holds'], [sid, json.dumps(hold)])
        for intended_major in sis_profile.get('intendedMajors', []):
            feed_counts['intended_majors'] += write_to_tsv_file(
                feed_files['intended_majors'],
                [sid, intended_major.get('description', None)])
        for plan in sis_profile.get('plansMinor', []):
            if plan.get('status') == 'Active':
                feed_counts['minors'] += write_to_tsv_file(
                    feed_files['minors'], [sid, plan.get('description', None)])
    return merged_profile