def ap(noble_aps):
    '''Build the cleaned AP test table from the raw Noble AP export.

    Args:
        noble_aps: DataFrame providing the columns ``student_number``,
            ``test_date``, ``numscore`` and ``testname``. It is only read,
            never modified.

    Returns:
        A new DataFrame with the columns ``noble_powerschool_id``, ``date``
        (parsed with ``pd.to_datetime``), ``score`` and ``subject`` (the
        test name run through ``convert_free_text`` with the AP subject
        dictionary).
    '''
    # initialize new df
    ap_tests = pd.DataFrame()
    ap_tests['noble_powerschool_id'] = noble_aps['student_number']
    ap_tests['date'] = pd.to_datetime(noble_aps['test_date'])
    ap_tests['score'] = noble_aps['numscore']

    # load ap dictionary from manually coded csv
    subject_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/APsubjecttranslation.csv')

    # Fill missing names BEFORE stripping the 'AP ' prefix: a missing value
    # is a float NaN and has no .replace(), so stripping first would raise.
    # Working on a local Series also leaves the caller's frame untouched.
    test_names = noble_aps['testname'].fillna('missing').apply(
        lambda name: name.replace('AP ', ''))

    # do conversions from freetext to coded options
    ap_tests['subject'] = test_names.apply(
        lambda name: convert_free_text(subject_dict, name))
    return ap_tests
def ap(noble_aps):
    '''Build the cleaned AP test table from the raw Noble AP export.

    Args:
        noble_aps: DataFrame providing the columns ``student_number``,
            ``test_date``, ``numscore`` and ``testname``. It is only read,
            never modified.

    Returns:
        A new DataFrame with the columns ``noble_powerschool_id``, ``date``
        (parsed with ``pd.to_datetime``), ``score`` and ``subject`` (the
        test name run through ``convert_free_text`` with the AP subject
        dictionary).
    '''
    # initialize new df
    ap_tests = pd.DataFrame()
    ap_tests['noble_powerschool_id'] = noble_aps['student_number']
    ap_tests['date'] = pd.to_datetime(noble_aps['test_date'])
    ap_tests['score'] = noble_aps['numscore']

    # load ap dictionary from manually coded csv
    subject_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/APsubjecttranslation.csv')

    # Fill missing names BEFORE stripping the 'AP ' prefix: a missing value
    # is a float NaN and has no .replace(), so stripping first would raise.
    # Working on a local Series also leaves the caller's frame untouched.
    test_names = noble_aps['testname'].fillna('missing').apply(
        lambda name: name.replace('AP ', ''))

    # do conversions from freetext to coded options
    ap_tests['subject'] = test_names.apply(
        lambda name: convert_free_text(subject_dict, name))
    return ap_tests
def enrollments(enrollment):
    '''Clean the raw Noble enrollment records into a new dataframe.

    Args:
        enrollment: DataFrame providing the columns ``Student__c``,
            ``College__c``, ``Start_Date__c``, ``End_Date__c``,
            ``Date_Last_Verified__c``, ``Status__c``, ``Data_Source__c``,
            ``Degree_Type__c``, ``Degree_Text__c``, ``Major_Text__c`` and
            ``Withdrawal_code__c``. It is only read, never modified.

    Returns:
        A new DataFrame with the student/college ids, the three cleaned
        date columns, ``status``, ``data_source``, ``living_on_campus``
        (all NaN), ``degree_type``, ``degree_subject``, ``major`` and one
        ``withdrawal_reason_*`` column per tracked withdrawal code.
    '''

    def boolean_withdrawal(dummy_code, input_code):
        '''Return NaN for the 'missing' placeholder, otherwise whether
        dummy_code is one of the ';'-separated codes in input_code.'''
        if input_code == 'missing':
            return np.nan
        return dummy_code in input_code.split(';')

    # initialize clean dataframe
    clean_enrollment = pd.DataFrame()

    # add student and college ids
    clean_enrollment['noble_student_sf_id'] = enrollment['Student__c']
    clean_enrollment['noble_college_sf_id'] = enrollment['College__c']

    # Parse each date column into a local Series (so the caller's frame is
    # not overwritten), then pass every value through wrong_dates_to_null.
    date_columns = (
        ('start_date', 'Start_Date__c'),
        ('end_date', 'End_Date__c'),
        ('date_last_verified', 'Date_Last_Verified__c'),
    )
    for clean_name, raw_name in date_columns:
        parsed = pd.to_datetime(enrollment[raw_name])
        clean_enrollment[clean_name] = parsed.apply(wrong_dates_to_null)

    # add status and data source as is
    clean_enrollment['status'] = enrollment['Status__c']
    clean_enrollment['data_source'] = enrollment['Data_Source__c']

    # Living on campus
    clean_enrollment['living_on_campus'] = np.nan

    # Degree type, without apostrophes. Rows dropped by dropna() come back
    # as NaN because the assignment aligns on the index.
    clean_enrollment['degree_type'] = enrollment['Degree_Type__c'].dropna(
    ).apply(lambda x: str(x).replace("'", ""))

    # load degree and major dictionaries from manually coded csvs
    degree_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/degreetranslation.csv')
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')

    # do conversions from freetext to coded options
    clean_enrollment['degree_subject'] = enrollment['Degree_Text__c'].fillna(
        'missing').apply(lambda x: convert_free_text(degree_dict, x))
    clean_enrollment['major'] = enrollment['Major_Text__c'].fillna(
        'missing').apply(lambda x: convert_free_text(major_dict, x))

    # convert withdrawal reasons to several boolean categories
    withdrawal_codes = [
        'Financial', 'Academic', 'Motivational', 'Family', 'Health', 'Social',
        'Racial Conflict'
    ]
    # The filled column is the same for every code, so compute it once.
    raw_codes = enrollment['Withdrawal_code__c'].fillna('missing')
    for dummy_code in withdrawal_codes:
        column = 'withdrawal_reason_' + "_".join(dummy_code.lower().split())
        clean_enrollment[column] = raw_codes.apply(
            lambda x, code=dummy_code: boolean_withdrawal(code, x))
    clean_enrollment.rename(
        columns={
            'withdrawal_reason_racial_conflict': 'withdrawal_reason_racial'
        },
        inplace=True)

    # Return clean df
    return clean_enrollment
def enrollments_table(kipp_nj_enrollments):
    '''Cleans KIPP NJ enrollment data to match our database schema.

    Args:
        kipp_nj_enrollments: DataFrame providing the columns ``status``,
            ``start_date``, ``end_date``, ``date_last_verified``,
            ``degree_type``, ``degree_subject``, ``major``,
            ``transfer_reason__c``, ``college_ncesid``,
            ``college_salesforce_id`` and ``student_salesforce_id``.
            It is only read, never modified.

    Returns:
        A new DataFrame without the 'Other' / 'Did Not Enroll' / 'Deferred'
        statuses and without 'High School Diploma' / 'GED' degree types,
        with recoded status, degree and major columns, the
        ``withdrawal_reason_*`` columns, and with ``student_salesforce_id``
        and ``college_ncesid`` renamed to ``kipp_nj_sf_id`` and ``ipedsid``.
    '''
    # remove the did not enroll, other, and deferred enrollment types.
    # .copy() makes the result an independent frame, so the column
    # assignments below are not chained assignments on a slice.
    excluded_statuses = ['Other', 'Did Not Enroll', 'Deferred']
    clean_enrollments = kipp_nj_enrollments[
        ~kipp_nj_enrollments['status'].isin(excluded_statuses)].copy()

    # rename Withdrawn to withdrew, matriculated to matriculating
    clean_enrollments['status'] = clean_enrollments['status'].apply(
        lambda x: map_value_from_dict(status_fixed_mapping, x))

    # made dates into dates
    for date_column in ('start_date', 'end_date', 'date_last_verified'):
        clean_enrollments[date_column] = pd.to_datetime(
            clean_enrollments[date_column])

    # deal with degree type
    # remove high school diploma and GED, only interested in college
    # enrollments
    clean_enrollments = clean_enrollments[
        ~clean_enrollments['degree_type'].isin(
            ['High School Diploma', 'GED'])].copy()
    clean_enrollments['degree_type'] = clean_enrollments['degree_type'].apply(
        lambda x: map_value_from_dict(degree_fixed_mapping, x))

    # clean up degree subject
    clean_enrollments['degree_subject'] = clean_enrollments[
        'degree_type'].combine(
            clean_enrollments['degree_subject'], func=code_degree_subject)

    # clean up major
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')
    # do conversions from freetext to coded options
    clean_enrollments['major'] = clean_enrollments['major'].fillna(
        'missing').apply(lambda x: convert_free_text(major_dict, x))

    # map transfer reasons to the reasons we keep track of
    withdrawal_reasons = pd.get_dummies(
        clean_enrollments['transfer_reason__c']).astype(bool)
    # NOTE(review): the dummy columns are relabelled by position, so this
    # assumes transfer_reason__c holds exactly seven distinct values whose
    # sorted order matches the list below; any other number of values makes
    # this assignment raise. Confirm against the source data.
    withdrawal_reasons.columns = [
        'withdrawal_reason_academic', 'withdrawal_reason_career',
        'withdrawal_reason_financial', 'withdrawal_reason_other',
        'withdrawal_reason_placement', 'withdrawal_reason_relocation',
        'withdrawal_reason_social'
    ]
    withdrawal_reasons = withdrawal_reasons.drop(
        [
            'withdrawal_reason_relocation', 'withdrawal_reason_placement',
            'withdrawal_reason_other', 'withdrawal_reason_career'
        ],
        axis=1)

    # join the columns back into the original
    clean_enrollments = clean_enrollments.join(withdrawal_reasons)
    clean_enrollments['withdrawal_reason_motivational'] = np.nan
    clean_enrollments['withdrawal_reason_family'] = np.nan
    clean_enrollments['withdrawal_reason_health'] = np.nan
    clean_enrollments['withdrawal_reason_racial'] = np.nan

    # Drop invalid IPEDS id
    # Series.convert_objects no longer exists in current pandas; prefer
    # pd.to_numeric and fall back only on versions that predate it.
    if hasattr(pd, 'to_numeric'):
        ncesid = pd.to_numeric(
            clean_enrollments['college_ncesid'], errors='coerce')
    else:
        ncesid = clean_enrollments['college_ncesid'].convert_objects(
            convert_numeric=True)
    # Blank out ids above 999999; NaN compares False and so stays NaN.
    ncesid = ncesid.where(~(ncesid > 999999))
    clean_enrollments['college_ncesid'] = ncesid.apply(int_with_NaN_tostr)

    clean_enrollments.drop(
        ['transfer_reason__c', 'college_salesforce_id'], axis=1, inplace=True)
    clean_enrollments.rename(
        columns={
            'student_salesforce_id': 'kipp_nj_sf_id',
            'college_ncesid': 'ipedsid'
        },
        inplace=True)
    return clean_enrollments
def enrollments(enrollment):
    '''Clean the raw Noble enrollment records into a new dataframe.

    Args:
        enrollment: DataFrame providing the columns ``Student__c``,
            ``College__c``, ``Start_Date__c``, ``End_Date__c``,
            ``Date_Last_Verified__c``, ``Status__c``, ``Data_Source__c``,
            ``Degree_Type__c``, ``Degree_Text__c``, ``Major_Text__c`` and
            ``Withdrawal_code__c``. It is only read, never modified.

    Returns:
        A new DataFrame with the student/college ids, the three cleaned
        date columns, ``status``, ``data_source``, ``living_on_campus``
        (all NaN), ``degree_type``, ``degree_subject``, ``major`` and one
        ``withdrawal_reason_*`` column per tracked withdrawal code.
    '''

    def boolean_withdrawal(dummy_code, input_code):
        '''Return NaN for the 'missing' placeholder, otherwise whether
        dummy_code is one of the ';'-separated codes in input_code.'''
        if input_code == 'missing':
            return np.nan
        return dummy_code in input_code.split(';')

    # initialize clean dataframe
    clean_enrollment = pd.DataFrame()

    # add student and college ids
    clean_enrollment['noble_student_sf_id'] = enrollment['Student__c']
    clean_enrollment['noble_college_sf_id'] = enrollment['College__c']

    # Parse each date column into a local Series (so the caller's frame is
    # not overwritten), then pass every value through wrong_dates_to_null.
    date_columns = (
        ('start_date', 'Start_Date__c'),
        ('end_date', 'End_Date__c'),
        ('date_last_verified', 'Date_Last_Verified__c'),
    )
    for clean_name, raw_name in date_columns:
        parsed = pd.to_datetime(enrollment[raw_name])
        clean_enrollment[clean_name] = parsed.apply(wrong_dates_to_null)

    # add status and data source as is
    clean_enrollment['status'] = enrollment['Status__c']
    clean_enrollment['data_source'] = enrollment['Data_Source__c']

    # Living on campus
    clean_enrollment['living_on_campus'] = np.nan

    # Degree type, without apostrophes. Rows dropped by dropna() come back
    # as NaN because the assignment aligns on the index.
    clean_enrollment['degree_type'] = enrollment['Degree_Type__c'].dropna(
    ).apply(lambda x: str(x).replace("'", ""))

    # load degree and major dictionaries from manually coded csvs
    degree_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/degreetranslation.csv')
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')

    # do conversions from freetext to coded options
    clean_enrollment['degree_subject'] = enrollment['Degree_Text__c'].fillna(
        'missing').apply(lambda x: convert_free_text(degree_dict, x))
    clean_enrollment['major'] = enrollment['Major_Text__c'].fillna(
        'missing').apply(lambda x: convert_free_text(major_dict, x))

    # convert withdrawal reasons to several boolean categories
    withdrawal_codes = [
        'Financial', 'Academic', 'Motivational', 'Family', 'Health', 'Social',
        'Racial Conflict'
    ]
    # The filled column is the same for every code, so compute it once.
    raw_codes = enrollment['Withdrawal_code__c'].fillna('missing')
    for dummy_code in withdrawal_codes:
        column = 'withdrawal_reason_' + "_".join(dummy_code.lower().split())
        clean_enrollment[column] = raw_codes.apply(
            lambda x, code=dummy_code: boolean_withdrawal(code, x))
    clean_enrollment.rename(
        columns={
            'withdrawal_reason_racial_conflict': 'withdrawal_reason_racial'
        },
        inplace=True)

    # Return clean df
    return clean_enrollment
def enrollments_table(kipp_nj_enrollments):
    '''Cleans KIPP NJ enrollment data to match our database schema.

    Args:
        kipp_nj_enrollments: DataFrame providing the columns ``status``,
            ``start_date``, ``end_date``, ``date_last_verified``,
            ``degree_type``, ``degree_subject``, ``major``,
            ``transfer_reason__c``, ``college_ncesid``,
            ``college_salesforce_id`` and ``student_salesforce_id``.
            It is only read, never modified.

    Returns:
        A new DataFrame without the 'Other' / 'Did Not Enroll' / 'Deferred'
        statuses and without 'High School Diploma' / 'GED' degree types,
        with recoded status, degree and major columns, the
        ``withdrawal_reason_*`` columns, and with ``student_salesforce_id``
        and ``college_ncesid`` renamed to ``kipp_nj_sf_id`` and ``ipedsid``.
    '''
    # remove the did not enroll, other, and deferred enrollment types.
    # .copy() makes the result an independent frame, so the column
    # assignments below are not chained assignments on a slice.
    excluded_statuses = ['Other', 'Did Not Enroll', 'Deferred']
    clean_enrollments = kipp_nj_enrollments[
        ~kipp_nj_enrollments['status'].isin(excluded_statuses)].copy()

    # rename Withdrawn to withdrew, matriculated to matriculating
    clean_enrollments['status'] = clean_enrollments['status'].apply(
        lambda x: map_value_from_dict(status_fixed_mapping, x))

    # made dates into dates
    for date_column in ('start_date', 'end_date', 'date_last_verified'):
        clean_enrollments[date_column] = pd.to_datetime(
            clean_enrollments[date_column])

    # deal with degree type
    # remove high school diploma and GED, only interested in college
    # enrollments
    clean_enrollments = clean_enrollments[
        ~clean_enrollments['degree_type'].isin(
            ['High School Diploma', 'GED'])].copy()
    clean_enrollments['degree_type'] = clean_enrollments['degree_type'].apply(
        lambda x: map_value_from_dict(degree_fixed_mapping, x))

    # clean up degree subject
    clean_enrollments['degree_subject'] = clean_enrollments[
        'degree_type'].combine(
            clean_enrollments['degree_subject'], func=code_degree_subject)

    # clean up major
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')
    # do conversions from freetext to coded options
    clean_enrollments['major'] = clean_enrollments['major'].fillna(
        'missing').apply(lambda x: convert_free_text(major_dict, x))

    # map transfer reasons to the reasons we keep track of
    withdrawal_reasons = pd.get_dummies(
        clean_enrollments['transfer_reason__c']).astype(bool)
    # NOTE(review): the dummy columns are relabelled by position, so this
    # assumes transfer_reason__c holds exactly seven distinct values whose
    # sorted order matches the list below; any other number of values makes
    # this assignment raise. Confirm against the source data.
    withdrawal_reasons.columns = [
        'withdrawal_reason_academic', 'withdrawal_reason_career',
        'withdrawal_reason_financial', 'withdrawal_reason_other',
        'withdrawal_reason_placement', 'withdrawal_reason_relocation',
        'withdrawal_reason_social'
    ]
    withdrawal_reasons = withdrawal_reasons.drop(
        [
            'withdrawal_reason_relocation', 'withdrawal_reason_placement',
            'withdrawal_reason_other', 'withdrawal_reason_career'
        ],
        axis=1)

    # join the columns back into the original
    clean_enrollments = clean_enrollments.join(withdrawal_reasons)
    clean_enrollments['withdrawal_reason_motivational'] = np.nan
    clean_enrollments['withdrawal_reason_family'] = np.nan
    clean_enrollments['withdrawal_reason_health'] = np.nan
    clean_enrollments['withdrawal_reason_racial'] = np.nan

    # Drop invalid IPEDS id
    # Series.convert_objects no longer exists in current pandas; prefer
    # pd.to_numeric and fall back only on versions that predate it.
    if hasattr(pd, 'to_numeric'):
        ncesid = pd.to_numeric(
            clean_enrollments['college_ncesid'], errors='coerce')
    else:
        ncesid = clean_enrollments['college_ncesid'].convert_objects(
            convert_numeric=True)
    # Blank out ids above 999999; NaN compares False and so stays NaN.
    ncesid = ncesid.where(~(ncesid > 999999))
    clean_enrollments['college_ncesid'] = ncesid.apply(int_with_NaN_tostr)

    clean_enrollments.drop(
        ['transfer_reason__c', 'college_salesforce_id'], axis=1, inplace=True)
    clean_enrollments.rename(
        columns={
            'student_salesforce_id': 'kipp_nj_sf_id',
            'college_ncesid': 'ipedsid'
        },
        inplace=True)
    return clean_enrollments