def students_table(kipp_nj_students):
    """Clean KIPP NJ student data to match our database schema.

    Drops names, renames id columns, normalizes DOB/ethnicity/income/zip,
    and de-duplicates on the PowerSchool id.

    Args:
        kipp_nj_students: raw KIPP NJ student DataFrame.

    Returns:
        DataFrame with one row per student, columns in schema order.
    """
    # we don't load names into our DB
    clean_students = kipp_nj_students.drop('name', axis=1)
    # rename columns correctly
    clean_students.rename(columns={
        'contact_id': 'kipp_nj_sf_id',
        'powerschool_id': 'kipp_nj_powerschool_id'
    }, inplace=True)
    # make DOB a date
    clean_students.date_of_birth = pd.to_datetime(clean_students.date_of_birth)
    # remap ethnicities
    clean_students.ethnicity = clean_students.ethnicity.apply(
        lambda x: map_value_from_dict(ethnicity_fixed_mapping, x))
    # name of partner organization
    clean_students['network'] = 'KIPP_NJ'
    # get rid of commas in income values (confuses psycopg2)
    clean_students.family_income_bracket = (
        clean_students.family_income_bracket.str.replace(',', ''))
    # change number in household to int.
    # BUG FIX: the original called .dropna() on the WHOLE frame, which threw
    # away number_in_household values for any row with a NaN in an unrelated
    # column; drop NaNs on this column only. Reassignment realigns on the
    # index, so rows that were NaN stay NaN and int_with_NaN_tostr handles them.
    clean_students.number_in_household = (
        clean_students.number_in_household.dropna().astype(int))
    clean_students.number_in_household = (
        clean_students.number_in_household.apply(int_with_NaN_tostr))
    # deal with missing leading 0 on zip codes
    clean_students.zip = (
        clean_students.zip.dropna().astype(int).astype(str).apply(fix_zip))
    # drop the student with duplicated salesforce ids
    clean_students = clean_students.drop_duplicates(
        subset=['kipp_nj_powerschool_id'])
    # reorder columns
    clean_students = clean_students[[
        'kipp_nj_sf_id', 'cps_id', 'kipp_nj_powerschool_id', 'network',
        'date_of_birth', 'ethnicity', 'is_female', 'ever_special_ed',
        'ever_free_lunch', 'family_income_bracket', 'number_in_household',
        'is_first_gen', 'zip', 'fafsa_efc'
    ]]
    # return clean df
    return clean_students
def students(noble_students):
    """Clean Noble student data to match our database schema.

    Args:
        noble_students: raw Noble alumni DataFrame (Salesforce export).

    Returns:
        DataFrame with one row per student, columns matching our schema.
    """
    # initialize clean dataframe
    clean_students = pd.DataFrame()
    clean_students['noble_sf_id'] = noble_students.Id
    clean_students['cps_id'] = noble_students.Network_Student_ID__c
    clean_students['network'] = 'Noble'
    # add DOB, coercing implausible dates to null.
    # BUG FIX: the original wrote the parsed dates back into noble_students,
    # mutating the caller's DataFrame as a side effect; parse into a local
    # Series instead.
    birthdates = pd.to_datetime(noble_students['Birthdate'],
                                format='%m/%d/%Y')
    clean_students['date_of_birth'] = birthdates.apply(wrong_birthdates_to_null)
    # add gender as a boolean
    clean_students['is_female'] = noble_students.Gender__c.fillna('').apply(
        code_gender)
    # add fixed ethnicity
    clean_students['ethnicity'] = noble_students['Ethnicity__c'].apply(
        lambda x: map_value_from_dict(ethnicity_fixed_mapping, x))
    # add ever_special need and free lunch as booleans
    clean_students['ever_special_ed'] = noble_students['Special_Education__c']
    clean_students['ever_free_lunch'] = noble_students['Low_Income__c']
    clean_students['is_first_gen'] = noble_students[
        'First_Generation_College_Student__c']
    # add EFC
    clean_students['fafsa_efc'] = noble_students.EFC_from_FAFSA__c
    # add all the empty columns for now (Noble doesn't provide these)
    missing_cols = ['family_income_bracket', 'number_in_household', 'zip']
    fill_empty_cols(clean_students, missing_cols)
    return clean_students
def students(noble_students):
    """Clean Noble student data to match our database schema.

    Args:
        noble_students: raw Noble alumni DataFrame (Salesforce export).

    Returns:
        DataFrame with one row per student, columns matching our schema.
    """
    # initialize clean dataframe
    clean_students = pd.DataFrame()
    clean_students['noble_sf_id'] = noble_students.Id
    clean_students['cps_id'] = noble_students.Network_Student_ID__c
    clean_students['network'] = 'Noble'
    # add DOB, coercing implausible dates to null.
    # BUG FIX: the original wrote the parsed dates back into noble_students,
    # mutating the caller's DataFrame as a side effect; parse into a local
    # Series instead.
    birthdates = pd.to_datetime(noble_students['Birthdate'],
                                format='%m/%d/%Y')
    clean_students['date_of_birth'] = birthdates.apply(wrong_birthdates_to_null)
    # add gender as a boolean
    clean_students['is_female'] = noble_students.Gender__c.fillna('').apply(
        code_gender)
    # add fixed ethnicity
    clean_students['ethnicity'] = noble_students['Ethnicity__c'].apply(
        lambda x: map_value_from_dict(ethnicity_fixed_mapping, x))
    # add ever_special need and free lunch as booleans
    clean_students['ever_special_ed'] = noble_students['Special_Education__c']
    clean_students['ever_free_lunch'] = noble_students['Low_Income__c']
    clean_students['is_first_gen'] = noble_students[
        'First_Generation_College_Student__c']
    # add EFC
    clean_students['fafsa_efc'] = noble_students.EFC_from_FAFSA__c
    # add all the empty columns for now (Noble doesn't provide these)
    missing_cols = ['family_income_bracket', 'number_in_household', 'zip']
    fill_empty_cols(clean_students, missing_cols)
    return clean_students
def hs_enrollment_table(kipp_nj_enrollments):
    """Build the high-school enrollment table from KIPP NJ enrollment data.

    Keeps only 'High School Diploma' enrollments, normalizes statuses and
    dates, derives the high-school class year, and renames id/status columns
    to our schema.

    Args:
        kipp_nj_enrollments: raw KIPP NJ enrollment DataFrame.

    Returns:
        DataFrame of high-school enrollments.
    """
    # get only high school enrollments
    high_school_enr = kipp_nj_enrollments[
        kipp_nj_enrollments.degree_type == 'High School Diploma']
    # clean up statuses.
    # BUG FIX: .copy() so the column assignments below modify a real frame
    # rather than a chained-indexing view of kipp_nj_enrollments
    # (SettingWithCopyWarning / potentially lost writes).
    high_school_enr = high_school_enr[~high_school_enr.status.isin(
        ['Other', 'Did Not Enroll', 'Deferred'])].copy()
    high_school_enr.status = high_school_enr.status.apply(
        lambda x: map_value_from_dict(status_fixed_mapping, x))
    # make dates into dates
    high_school_enr.start_date = pd.to_datetime(high_school_enr.start_date)
    high_school_enr.end_date = pd.to_datetime(high_school_enr.end_date)
    # to get high school class, get end years of graduated students
    high_school_enr['high_school_class'] = high_school_enr.status.combine(
        high_school_enr.end_date, func=extract_grad_year)
    high_school_enr.rename(columns={
        'student_salesforce_id': 'kipp_nj_sf_id',
        'college_salesforce_id': 'kipp_nj_sf_school_id',
        'status': 'exit_type'
    }, inplace=True)
    # drop columns irrelevant to high-school enrollments
    high_school_enr = high_school_enr.drop([
        'college_ncesid', 'data_source', 'date_last_verified',
        'degree_subject', 'degree_type', 'living_on_campus', 'major',
        'transfer_reason__c'
    ], axis=1)
    return high_school_enr
def contacts_table(kipp_nj_contacts):
    """Clean KIPP NJ contact-event data to match our database schema.

    (Note: the original docstring said "enrollment data"; this function
    cleans contact events.)

    Args:
        kipp_nj_contacts: raw KIPP NJ contact-event DataFrame.

    Returns:
        DataFrame of contact events, columns in schema order.
    """
    # BUG FIX: work on a copy so the caller's DataFrame is not mutated by the
    # in-place rename / column assignments below.
    kipp_nj_contacts = kipp_nj_contacts.copy()
    # correct the id column
    kipp_nj_contacts.rename(columns={'student_salesforce_id': 'kipp_nj_sf_id'},
                            inplace=True)
    # map the unix timestamps to dates
    kipp_nj_contacts.contact_date = pd.to_datetime(
        kipp_nj_contacts.contact_date, unit='s')
    # remove the time component
    kipp_nj_contacts.contact_date = kipp_nj_contacts.contact_date.apply(
        lambda x: x.date())
    # map the contact mediums
    kipp_nj_contacts.contact_medium = kipp_nj_contacts.contact_medium.apply(
        lambda x: map_value_from_dict(medium_fixed_mapping, x))
    # kipp nj doesn't keep track of mass email outreach, so was_outreach is
    # always false (bracket assignment so the column is set even if absent)
    kipp_nj_contacts['was_outreach'] = False
    # was_successful is coded as true or nan, replace nans with False
    kipp_nj_contacts.was_successful = kipp_nj_contacts.was_successful.fillna(
        False)
    # reorder columns
    clean_contacts = kipp_nj_contacts[[
        'kipp_nj_sf_id', 'contact_date', 'counselor_id', 'contact_medium',
        'initiated_by_student', 'was_outreach', 'was_successful'
    ]]
    return clean_contacts
def enrollments_table(kipp_nj_enrollments):
    """Clean KIPP NJ enrollment data to match our database schema.

    Keeps only college enrollments (drops HS diploma / GED rows and the
    'Other' / 'Did Not Enroll' / 'Deferred' statuses), normalizes dates,
    degree types/subjects and majors, expands transfer reasons into boolean
    withdrawal-reason columns, and renames id columns to our schema.

    Args:
        kipp_nj_enrollments: raw KIPP NJ enrollment DataFrame.

    Returns:
        DataFrame of college enrollments.
    """
    # remove the did not enroll, other, and deferred enrollment types.
    # .copy() so later column assignments hit a real frame, not a view.
    clean_enrollments = kipp_nj_enrollments[~kipp_nj_enrollments.status.isin(
        ['Other', 'Did Not Enroll', 'Deferred'])].copy()
    # rename Withdrawn to withdrew, matriculated to matriculating
    clean_enrollments.status = clean_enrollments.status.apply(
        lambda x: map_value_from_dict(status_fixed_mapping, x))
    # make dates into dates
    clean_enrollments.start_date = pd.to_datetime(clean_enrollments.start_date)
    clean_enrollments.end_date = pd.to_datetime(clean_enrollments.end_date)
    clean_enrollments.date_last_verified = pd.to_datetime(
        clean_enrollments.date_last_verified)
    # deal with degree type: remove high school diploma and GED, only
    # interested in college enrollments
    clean_enrollments = clean_enrollments[~clean_enrollments.degree_type.isin(
        ['High School Diploma', 'GED'])].copy()
    clean_enrollments.degree_type = clean_enrollments.degree_type.apply(
        lambda x: map_value_from_dict(degree_fixed_mapping, x))
    # clean up degree subject
    clean_enrollments.degree_subject = clean_enrollments.degree_type.combine(
        clean_enrollments.degree_subject, func=code_degree_subject)
    # clean up major: do conversions from freetext to coded options
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')
    clean_enrollments['major'] = clean_enrollments.major.fillna(
        'missing').apply(lambda x: convert_free_text(major_dict, x))
    # map transfer reasons to the reasons we keep track of.
    # NOTE(review): this assumes transfer_reason__c always yields exactly
    # these seven dummy columns in this alphabetical order -- verify against
    # the data before relying on it.
    withdrawal_reasons = pd.get_dummies(
        clean_enrollments.transfer_reason__c).astype(bool)
    withdrawal_reasons.columns = [
        'withdrawal_reason_academic', 'withdrawal_reason_career',
        'withdrawal_reason_financial', 'withdrawal_reason_other',
        'withdrawal_reason_placement', 'withdrawal_reason_relocation',
        'withdrawal_reason_social'
    ]
    withdrawal_reasons.drop([
        'withdrawal_reason_relocation', 'withdrawal_reason_placement',
        'withdrawal_reason_other', 'withdrawal_reason_career'
    ], axis=1, inplace=True)
    # join the columns back into the original
    clean_enrollments = clean_enrollments.join(withdrawal_reasons)
    # reasons KIPP NJ does not track
    clean_enrollments['withdrawal_reason_motivational'] = np.nan
    clean_enrollments['withdrawal_reason_family'] = np.nan
    clean_enrollments['withdrawal_reason_health'] = np.nan
    clean_enrollments['withdrawal_reason_racial'] = np.nan
    # Drop invalid IPEDS ids (more than 6 digits).
    # BUG FIX: DataFrame.convert_objects was deprecated in pandas 0.17 and
    # removed in 0.23; pd.to_numeric(errors='coerce') is the replacement.
    clean_enrollments['college_ncesid'] = pd.to_numeric(
        clean_enrollments['college_ncesid'], errors='coerce')
    clean_enrollments.loc[clean_enrollments['college_ncesid'] > 999999,
                          'college_ncesid'] = np.nan
    clean_enrollments['college_ncesid'] = clean_enrollments[
        'college_ncesid'].apply(int_with_NaN_tostr)
    clean_enrollments.drop(['transfer_reason__c', 'college_salesforce_id'],
                           axis=1, inplace=True)
    clean_enrollments.rename(columns={
        'student_salesforce_id': 'kipp_nj_sf_id',
        'college_ncesid': 'ipedsid'
    }, inplace=True)
    return clean_enrollments
def hs_enrollment_table(kipp_nj_enrollments):
    """Build the high-school enrollment table from KIPP NJ enrollment data.

    Keeps only 'High School Diploma' enrollments, normalizes statuses and
    dates, derives the high-school class year, and renames id/status columns
    to our schema.

    Args:
        kipp_nj_enrollments: raw KIPP NJ enrollment DataFrame.

    Returns:
        DataFrame of high-school enrollments.
    """
    # get only high school enrollments
    high_school_enr = kipp_nj_enrollments[
        kipp_nj_enrollments.degree_type == 'High School Diploma']
    # clean up statuses.
    # BUG FIX: .copy() so the column assignments below modify a real frame
    # rather than a chained-indexing view of kipp_nj_enrollments
    # (SettingWithCopyWarning / potentially lost writes).
    high_school_enr = high_school_enr[~high_school_enr.status.isin(
        ['Other', 'Did Not Enroll', 'Deferred'])].copy()
    high_school_enr.status = high_school_enr.status.apply(
        lambda x: map_value_from_dict(status_fixed_mapping, x))
    # make dates into dates
    high_school_enr.start_date = pd.to_datetime(high_school_enr.start_date)
    high_school_enr.end_date = pd.to_datetime(high_school_enr.end_date)
    # to get high school class, get end years of graduated students
    high_school_enr['high_school_class'] = high_school_enr.status.combine(
        high_school_enr.end_date, func=extract_grad_year)
    high_school_enr.rename(columns={
        'student_salesforce_id': 'kipp_nj_sf_id',
        'college_salesforce_id': 'kipp_nj_sf_school_id',
        'status': 'exit_type'
    }, inplace=True)
    # drop columns irrelevant to high-school enrollments
    high_school_enr = high_school_enr.drop([
        'college_ncesid', 'data_source', 'date_last_verified',
        'degree_subject', 'degree_type', 'living_on_campus', 'major',
        'transfer_reason__c'
    ], axis=1)
    return high_school_enr
def contacts_table(kipp_nj_contacts):
    """Clean KIPP NJ contact-event data to match our database schema.

    (Note: the original docstring said "enrollment data"; this function
    cleans contact events.)

    Args:
        kipp_nj_contacts: raw KIPP NJ contact-event DataFrame.

    Returns:
        DataFrame of contact events, columns in schema order.
    """
    # BUG FIX: work on a copy so the caller's DataFrame is not mutated by the
    # in-place rename / column assignments below.
    kipp_nj_contacts = kipp_nj_contacts.copy()
    # correct the id column
    kipp_nj_contacts.rename(columns={'student_salesforce_id': 'kipp_nj_sf_id'},
                            inplace=True)
    # map the unix timestamps to dates
    kipp_nj_contacts.contact_date = pd.to_datetime(
        kipp_nj_contacts.contact_date, unit='s')
    # remove the time component
    kipp_nj_contacts.contact_date = kipp_nj_contacts.contact_date.apply(
        lambda x: x.date())
    # map the contact mediums
    kipp_nj_contacts.contact_medium = kipp_nj_contacts.contact_medium.apply(
        lambda x: map_value_from_dict(medium_fixed_mapping, x))
    # kipp nj doesn't keep track of mass email outreach, so was_outreach is
    # always false (bracket assignment so the column is set even if absent)
    kipp_nj_contacts['was_outreach'] = False
    # was_successful is coded as true or nan, replace nans with False
    kipp_nj_contacts.was_successful = kipp_nj_contacts.was_successful.fillna(
        False)
    # reorder columns
    clean_contacts = kipp_nj_contacts[[
        'kipp_nj_sf_id', 'contact_date', 'counselor_id', 'contact_medium',
        'initiated_by_student', 'was_outreach', 'was_successful'
    ]]
    return clean_contacts
def students_table(kipp_nj_students):
    """Clean KIPP NJ student data to match our database schema.

    Drops names, renames id columns, normalizes DOB/ethnicity/income/zip,
    and de-duplicates on the PowerSchool id.

    Args:
        kipp_nj_students: raw KIPP NJ student DataFrame.

    Returns:
        DataFrame with one row per student, columns in schema order.
    """
    # we don't load names into our DB
    clean_students = kipp_nj_students.drop('name', axis=1)
    # rename columns correctly
    clean_students.rename(columns={
        'contact_id': 'kipp_nj_sf_id',
        'powerschool_id': 'kipp_nj_powerschool_id'
    }, inplace=True)
    # make DOB a date
    clean_students.date_of_birth = pd.to_datetime(clean_students.date_of_birth)
    # remap ethnicities
    clean_students.ethnicity = clean_students.ethnicity.apply(
        lambda x: map_value_from_dict(ethnicity_fixed_mapping, x))
    # name of partner organization
    clean_students['network'] = 'KIPP_NJ'
    # get rid of commas in income values (confuses psycopg2)
    clean_students.family_income_bracket = (
        clean_students.family_income_bracket.str.replace(',', ''))
    # change number in household to int.
    # BUG FIX: the original called .dropna() on the WHOLE frame, which threw
    # away number_in_household values for any row with a NaN in an unrelated
    # column; drop NaNs on this column only. Reassignment realigns on the
    # index, so rows that were NaN stay NaN and int_with_NaN_tostr handles them.
    clean_students.number_in_household = (
        clean_students.number_in_household.dropna().astype(int))
    clean_students.number_in_household = (
        clean_students.number_in_household.apply(int_with_NaN_tostr))
    # deal with missing leading 0 on zip codes
    clean_students.zip = (
        clean_students.zip.dropna().astype(int).astype(str).apply(fix_zip))
    # drop the student with duplicated salesforce ids
    clean_students = clean_students.drop_duplicates(
        subset=['kipp_nj_powerschool_id'])
    # reorder columns
    clean_students = clean_students[[
        'kipp_nj_sf_id', 'cps_id', 'kipp_nj_powerschool_id', 'network',
        'date_of_birth', 'ethnicity', 'is_female', 'ever_special_ed',
        'ever_free_lunch', 'family_income_bracket', 'number_in_household',
        'is_first_gen', 'zip', 'fafsa_efc'
    ]]
    # return clean df
    return clean_students
def contacts(contacts, alumni):
    """Clean Noble contact-event data to match our database schema.

    Besides the basic cleaning, re-attributes batch-uploaded contact events
    (created by two known uploader ids) to the counselor most likely
    responsible, inferred per-school and per-student.

    Args:
        contacts: raw Noble contact events (Salesforce export).
        alumni: Noble alumni records, used to map students to high schools.

    Returns:
        DataFrame of cleaned contact events, columns in schema order.
    """
    # BUG FIX: operate on a copy so the caller's `contacts` frame is not
    # mutated when we parse dates below.
    contacts = contacts.copy()
    # initialize clean df
    clean_contacts = pd.DataFrame()
    # add student id, initiated by student
    clean_contacts['noble_sf_id'] = contacts.Contact__c
    clean_contacts['initiated_by_student'] = contacts.Initiated_by_alum__c
    # convert contact date to datetime, setting weird dates to null
    contacts.Date_of_Contact__c = pd.to_datetime(contacts.Date_of_Contact__c)
    clean_contacts['contact_date'] = contacts.Date_of_Contact__c.apply(
        wrong_dates_to_null)
    # clean medium to fit table constraints
    clean_contacts['contact_medium'] = contacts.Mode_of_Communication__c.apply(
        lambda x: map_value_from_dict(medium_fixed_mapping, x))
    # convert communication status to successful boolean.
    # note Noble doesn't keep track of outreach mass email, so was_outreach
    # is always False
    clean_contacts['was_outreach'] = False
    clean_contacts['was_successful'] = contacts.Comm_Status__c.apply(
        code_contact_status)
    # clean up counselor IDs: many contact events are batch uploaded by two
    # people, so fix these CreatedByIds to be the relevant counselor.
    # merge contacts with alums to match students to the schools they went to
    merged_contacts = pd.merge(contacts, alumni, how='left',
                               left_on='Contact__c', right_on='Id')
    # ids we want to overwrite are for Matt and a data manager, and some
    # other random IDs
    irrelevant_ids = ['005E0000000GphFIAS', '005E00000048sScIAI']
    # recode these ids as missing
    merged_contacts['CreatedById'] = merged_contacts['CreatedById'].apply(
        lambda x: 'missing' if x in irrelevant_ids else x)
    # get the counselors for every school that aren't the irrelevant ones
    school_counselors = merged_contacts[
        merged_contacts['CreatedById'] != 'missing'].groupby(
            ['High School', 'CreatedById']).size()
    # get the most common counselor for every school
    max_mask = school_counselors.groupby(level=0).agg('idxmax')
    school_counselors = school_counselors.loc[max_mask]
    school_counselors = school_counselors.reset_index()
    # get the most common counselors for every student
    student_counselors = merged_contacts[
        merged_contacts['CreatedById'] != 'missing'].groupby(
            ['Contact__c', 'CreatedById']).size()
    student_max_mask = student_counselors.groupby(level=0).agg('idxmax')
    student_counselors = student_counselors.loc[student_max_mask]
    student_counselors = student_counselors.reset_index()
    # figure out whether there are students who have only ever been loaded
    # by those missing IDs
    students_with_unique_ids_missing = merged_contacts.groupby(
        'Contact__c').filter(lambda x: x.CreatedById.nunique() == 1)
    students_with_unique_ids_missing = students_with_unique_ids_missing[
        students_with_unique_ids_missing['CreatedById'] == 'missing']
    students_with_unique_ids_missing = students_with_unique_ids_missing.groupby(
        ['Contact__c', 'CreatedById']).size().reset_index()['Contact__c']
    # save the new ids
    clean_contacts['counselor_id'] = merged_contacts.apply(
        lambda x: overwrite_ids(x, school_counselors, student_counselors,
                                students_with_unique_ids_missing), axis=1)
    # reorder columns
    clean_contacts = clean_contacts[[
        'noble_sf_id', 'contact_date', 'counselor_id', 'contact_medium',
        'initiated_by_student', 'was_outreach', 'was_successful'
    ]]
    return clean_contacts
def colleges(colleges, extra_college_features):
    """Clean Noble college records and merge in extra IPEDS-based features.

    Args:
        colleges: raw Salesforce college account records.
        extra_college_features: per-college features keyed by IPEDS UNITID.

    Returns:
        DataFrame of cleaned college records, one row per college.
    """
    # Store CollegeID_Noble
    df_collegesql = pd.DataFrame(colleges['College ID'])
    df_collegesql.columns = ['noble_sf_college_id']
    df_collegesql['ipedid'] = colleges['NCESid__c']
    df_collegesql['isprivate'] = colleges['College_Type__c'].fillna('').apply(
        code_isprivate)
    df_collegesql['isforprofit'] = colleges['College_Type__c'].fillna(
        '').apply(code_isforprofit)
    df_collegesql['is4year'] = colleges['College_Type__c'].fillna('').apply(
        code_is4year)
    # first five digits of the shipping zip code.
    # BUG FIX: use a raw string for the regex -- '\D' in a normal string
    # literal is an invalid escape sequence (DeprecationWarning now, a
    # SyntaxError in future Python versions).
    df_collegesql['zip'] = colleges.ShippingPostalCode.str.extract(
        r'^([0-9]{5})(?:\D|$)')
    # Store Name (strip commas, which confuse the loader)
    df_collegesql['name'] = colleges.Name.str.replace(',', '')
    # Store isrural (not available in this data)
    df_collegesql['isrural'] = np.nan
    # Store allmale & allfemale (not available in this data)
    df_collegesql['allmale'] = np.nan
    df_collegesql['allfemale'] = np.nan
    # Store graduation rates and transfer rates
    df_collegesql['graduationrate_6yr'] = colleges[
        '6_yr_completion_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['graduationrate_minority_6yr'] = colleges[
        '6_yr_minority_completion_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['transferrate_6yr'] = colleges[
        '6_yr_transfer_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['transferrate_minority_6yr'] = colleges[
        '6_yr_minority_transfer_rate__c'].apply(int_with_NaN_tostr)
    # Store historicallyblack
    df_collegesql['historicallyblack'] = colleges['HBCU__c']
    # add in extra college features: first make new dataframe (will need to
    # merge later)
    extra_college = pd.DataFrame(extra_college_features['UNITID'])
    extra_college['state'] = extra_college_features['STABBR']
    extra_college['longitude'] = extra_college_features['Longitude']
    extra_college['latitude'] = extra_college_features['Latitude']
    extra_college['dist_from_chicago'] = extra_college_features[
        'DistFromChicago']
    # set missings and 2 year colleges to null
    extra_college['barrons_rating'] = (
        extra_college_features.SimpleBarrons.apply(
            lambda x: map_value_from_dict(selectivity_mapping, x)))
    # convert percentages to numbers
    extra_college['perc_accepted'] = extra_college_features[
        '%Apply_Accepted'].str.replace('%', '')
    extra_college['perc_accepted_enroll'] = extra_college_features[
        '%Accepted_Enroll'].str.replace('%', '')
    extra_college['perc_male'] = extra_college_features[
        '% male'].str.replace('%', '')
    extra_college['perc_female'] = extra_college_features[
        '% female'].str.replace('%', '')
    extra_college['perc_african_american'] = extra_college_features[
        '% AA'].str.replace('%', '')
    extra_college['perc_hispanic'] = extra_college_features[
        '% Hispanic'].str.replace('%', '')
    extra_college['percentinstate'] = extra_college_features['PercentInState']
    extra_college['percentoutofstate'] = extra_college_features[
        'PercentOutOfState']
    extra_college['percentpellgrant'] = extra_college_features[
        'PercentPellGrant']
    extra_college[[
        'avgnetprice', 'netprice0_30', 'netprice30_48', 'netprice48_75'
    ]] = extra_college_features[[
        'AvgNetPrice', 'NetPrice0-30', 'NetPrice30-48', 'NetPrice48-75'
    ]]
    extra_college['locale'] = extra_college_features['Locale']
    extra_college['size_range'] = extra_college_features[
        'Size Range'].str.replace(',', '')
    # set not reported and not applicable to null
    extra_college['size_range'] = extra_college['size_range'].apply(
        lambda x: map_value_from_dict(size_range_mapping, x))
    # then join on ids
    clean_colleges = pd.merge(df_collegesql, extra_college, how='left',
                              left_on='ipedid', right_on='UNITID')
    clean_colleges = clean_colleges.drop(['UNITID'], axis=1)
    # fix nulls for upload to sql
    clean_colleges.ipedid = clean_colleges.ipedid.apply(int_with_NaN_tostr)
    # drop two randomly duplicated colleges found through manual inspection
    clean_colleges = clean_colleges[~clean_colleges.noble_sf_college_id.isin(
        ['001E000000Sg2wPIAR', '001E000000Sg2wQIAR'])]
    return clean_colleges
def contacts(contacts, alumni):
    """Clean Noble contact-event data to match our database schema.

    Besides the basic cleaning, re-attributes batch-uploaded contact events
    (created by two known uploader ids) to the counselor most likely
    responsible, inferred per-school and per-student.

    Args:
        contacts: raw Noble contact events (Salesforce export).
        alumni: Noble alumni records, used to map students to high schools.

    Returns:
        DataFrame of cleaned contact events, columns in schema order.
    """
    # BUG FIX: operate on a copy so the caller's `contacts` frame is not
    # mutated when we parse dates below.
    contacts = contacts.copy()
    # initialize clean df
    clean_contacts = pd.DataFrame()
    # add student id, initiated by student
    clean_contacts['noble_sf_id'] = contacts.Contact__c
    clean_contacts['initiated_by_student'] = contacts.Initiated_by_alum__c
    # convert contact date to datetime, setting weird dates to null
    contacts.Date_of_Contact__c = pd.to_datetime(contacts.Date_of_Contact__c)
    clean_contacts['contact_date'] = contacts.Date_of_Contact__c.apply(
        wrong_dates_to_null)
    # clean medium to fit table constraints
    clean_contacts['contact_medium'] = contacts.Mode_of_Communication__c.apply(
        lambda x: map_value_from_dict(medium_fixed_mapping, x))
    # convert communication status to successful boolean.
    # note Noble doesn't keep track of outreach mass email, so was_outreach
    # is always False
    clean_contacts['was_outreach'] = False
    clean_contacts['was_successful'] = contacts.Comm_Status__c.apply(
        code_contact_status)
    # clean up counselor IDs: many contact events are batch uploaded by two
    # people, so fix these CreatedByIds to be the relevant counselor.
    # merge contacts with alums to match students to the schools they went to
    merged_contacts = pd.merge(contacts, alumni, how='left',
                               left_on='Contact__c', right_on='Id')
    # ids we want to overwrite are for Matt and a data manager, and some
    # other random IDs
    irrelevant_ids = ['005E0000000GphFIAS', '005E00000048sScIAI']
    # recode these ids as missing
    merged_contacts['CreatedById'] = merged_contacts['CreatedById'].apply(
        lambda x: 'missing' if x in irrelevant_ids else x)
    # get the counselors for every school that aren't the irrelevant ones
    school_counselors = merged_contacts[
        merged_contacts['CreatedById'] != 'missing'].groupby(
            ['High School', 'CreatedById']).size()
    # get the most common counselor for every school
    max_mask = school_counselors.groupby(level=0).agg('idxmax')
    school_counselors = school_counselors.loc[max_mask]
    school_counselors = school_counselors.reset_index()
    # get the most common counselors for every student
    student_counselors = merged_contacts[
        merged_contacts['CreatedById'] != 'missing'].groupby(
            ['Contact__c', 'CreatedById']).size()
    student_max_mask = student_counselors.groupby(level=0).agg('idxmax')
    student_counselors = student_counselors.loc[student_max_mask]
    student_counselors = student_counselors.reset_index()
    # figure out whether there are students who have only ever been loaded
    # by those missing IDs
    students_with_unique_ids_missing = merged_contacts.groupby(
        'Contact__c').filter(lambda x: x.CreatedById.nunique() == 1)
    students_with_unique_ids_missing = students_with_unique_ids_missing[
        students_with_unique_ids_missing['CreatedById'] == 'missing']
    students_with_unique_ids_missing = students_with_unique_ids_missing.groupby(
        ['Contact__c', 'CreatedById']).size().reset_index()['Contact__c']
    # save the new ids
    clean_contacts['counselor_id'] = merged_contacts.apply(
        lambda x: overwrite_ids(x, school_counselors, student_counselors,
                                students_with_unique_ids_missing), axis=1)
    # reorder columns
    clean_contacts = clean_contacts[[
        'noble_sf_id', 'contact_date', 'counselor_id', 'contact_medium',
        'initiated_by_student', 'was_outreach', 'was_successful'
    ]]
    return clean_contacts
def colleges(colleges, extra_college_features):
    """Clean Noble college records and merge in extra IPEDS-based features.

    Args:
        colleges: raw Salesforce college account records.
        extra_college_features: per-college features keyed by IPEDS UNITID.

    Returns:
        DataFrame of cleaned college records, one row per college.
    """
    # Store CollegeID_Noble
    df_collegesql = pd.DataFrame(colleges['College ID'])
    df_collegesql.columns = ['noble_sf_college_id']
    df_collegesql['ipedid'] = colleges['NCESid__c']
    df_collegesql['isprivate'] = colleges['College_Type__c'].fillna('').apply(
        code_isprivate)
    df_collegesql['isforprofit'] = colleges['College_Type__c'].fillna(
        '').apply(code_isforprofit)
    df_collegesql['is4year'] = colleges['College_Type__c'].fillna('').apply(
        code_is4year)
    # first five digits of the shipping zip code.
    # BUG FIX: use a raw string for the regex -- '\D' in a normal string
    # literal is an invalid escape sequence (DeprecationWarning now, a
    # SyntaxError in future Python versions).
    df_collegesql['zip'] = colleges.ShippingPostalCode.str.extract(
        r'^([0-9]{5})(?:\D|$)')
    # Store Name (strip commas, which confuse the loader)
    df_collegesql['name'] = colleges.Name.str.replace(',', '')
    # Store isrural (not available in this data)
    df_collegesql['isrural'] = np.nan
    # Store allmale & allfemale (not available in this data)
    df_collegesql['allmale'] = np.nan
    df_collegesql['allfemale'] = np.nan
    # Store graduation rates and transfer rates
    df_collegesql['graduationrate_6yr'] = colleges[
        '6_yr_completion_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['graduationrate_minority_6yr'] = colleges[
        '6_yr_minority_completion_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['transferrate_6yr'] = colleges[
        '6_yr_transfer_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['transferrate_minority_6yr'] = colleges[
        '6_yr_minority_transfer_rate__c'].apply(int_with_NaN_tostr)
    # Store historicallyblack
    df_collegesql['historicallyblack'] = colleges['HBCU__c']
    # add in extra college features: first make new dataframe (will need to
    # merge later)
    extra_college = pd.DataFrame(extra_college_features['UNITID'])
    extra_college['state'] = extra_college_features['STABBR']
    extra_college['longitude'] = extra_college_features['Longitude']
    extra_college['latitude'] = extra_college_features['Latitude']
    extra_college['dist_from_chicago'] = extra_college_features[
        'DistFromChicago']
    # set missings and 2 year colleges to null
    extra_college['barrons_rating'] = (
        extra_college_features.SimpleBarrons.apply(
            lambda x: map_value_from_dict(selectivity_mapping, x)))
    # convert percentages to numbers
    extra_college['perc_accepted'] = extra_college_features[
        '%Apply_Accepted'].str.replace('%', '')
    extra_college['perc_accepted_enroll'] = extra_college_features[
        '%Accepted_Enroll'].str.replace('%', '')
    extra_college['perc_male'] = extra_college_features[
        '% male'].str.replace('%', '')
    extra_college['perc_female'] = extra_college_features[
        '% female'].str.replace('%', '')
    extra_college['perc_african_american'] = extra_college_features[
        '% AA'].str.replace('%', '')
    extra_college['perc_hispanic'] = extra_college_features[
        '% Hispanic'].str.replace('%', '')
    extra_college['percentinstate'] = extra_college_features['PercentInState']
    extra_college['percentoutofstate'] = extra_college_features[
        'PercentOutOfState']
    extra_college['percentpellgrant'] = extra_college_features[
        'PercentPellGrant']
    extra_college[[
        'avgnetprice', 'netprice0_30', 'netprice30_48', 'netprice48_75'
    ]] = extra_college_features[[
        'AvgNetPrice', 'NetPrice0-30', 'NetPrice30-48', 'NetPrice48-75'
    ]]
    extra_college['locale'] = extra_college_features['Locale']
    extra_college['size_range'] = extra_college_features[
        'Size Range'].str.replace(',', '')
    # set not reported and not applicable to null
    extra_college['size_range'] = extra_college['size_range'].apply(
        lambda x: map_value_from_dict(size_range_mapping, x))
    # then join on ids
    clean_colleges = pd.merge(df_collegesql, extra_college, how='left',
                              left_on='ipedid', right_on='UNITID')
    clean_colleges = clean_colleges.drop(['UNITID'], axis=1)
    # fix nulls for upload to sql
    clean_colleges.ipedid = clean_colleges.ipedid.apply(int_with_NaN_tostr)
    # drop two randomly duplicated colleges found through manual inspection
    clean_colleges = clean_colleges[~clean_colleges.noble_sf_college_id.isin(
        ['001E000000Sg2wPIAR', '001E000000Sg2wQIAR'])]
    return clean_colleges
def enrollments_table(kipp_nj_enrollments):
    """Clean KIPP NJ enrollment data to match our database schema.

    Keeps only college enrollments (drops HS diploma / GED rows and the
    'Other' / 'Did Not Enroll' / 'Deferred' statuses), normalizes dates,
    degree types/subjects and majors, expands transfer reasons into boolean
    withdrawal-reason columns, and renames id columns to our schema.

    Args:
        kipp_nj_enrollments: raw KIPP NJ enrollment DataFrame.

    Returns:
        DataFrame of college enrollments.
    """
    # remove the did not enroll, other, and deferred enrollment types.
    # .copy() so later column assignments hit a real frame, not a view.
    clean_enrollments = kipp_nj_enrollments[~kipp_nj_enrollments.status.isin(
        ['Other', 'Did Not Enroll', 'Deferred'])].copy()
    # rename Withdrawn to withdrew, matriculated to matriculating
    clean_enrollments.status = clean_enrollments.status.apply(
        lambda x: map_value_from_dict(status_fixed_mapping, x))
    # make dates into dates
    clean_enrollments.start_date = pd.to_datetime(clean_enrollments.start_date)
    clean_enrollments.end_date = pd.to_datetime(clean_enrollments.end_date)
    clean_enrollments.date_last_verified = pd.to_datetime(
        clean_enrollments.date_last_verified)
    # deal with degree type: remove high school diploma and GED, only
    # interested in college enrollments
    clean_enrollments = clean_enrollments[~clean_enrollments.degree_type.isin(
        ['High School Diploma', 'GED'])].copy()
    clean_enrollments.degree_type = clean_enrollments.degree_type.apply(
        lambda x: map_value_from_dict(degree_fixed_mapping, x))
    # clean up degree subject
    clean_enrollments.degree_subject = clean_enrollments.degree_type.combine(
        clean_enrollments.degree_subject, func=code_degree_subject)
    # clean up major: do conversions from freetext to coded options
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')
    clean_enrollments['major'] = clean_enrollments.major.fillna(
        'missing').apply(lambda x: convert_free_text(major_dict, x))
    # map transfer reasons to the reasons we keep track of.
    # NOTE(review): this assumes transfer_reason__c always yields exactly
    # these seven dummy columns in this alphabetical order -- verify against
    # the data before relying on it.
    withdrawal_reasons = pd.get_dummies(
        clean_enrollments.transfer_reason__c).astype(bool)
    withdrawal_reasons.columns = [
        'withdrawal_reason_academic', 'withdrawal_reason_career',
        'withdrawal_reason_financial', 'withdrawal_reason_other',
        'withdrawal_reason_placement', 'withdrawal_reason_relocation',
        'withdrawal_reason_social'
    ]
    withdrawal_reasons.drop([
        'withdrawal_reason_relocation', 'withdrawal_reason_placement',
        'withdrawal_reason_other', 'withdrawal_reason_career'
    ], axis=1, inplace=True)
    # join the columns back into the original
    clean_enrollments = clean_enrollments.join(withdrawal_reasons)
    # reasons KIPP NJ does not track
    clean_enrollments['withdrawal_reason_motivational'] = np.nan
    clean_enrollments['withdrawal_reason_family'] = np.nan
    clean_enrollments['withdrawal_reason_health'] = np.nan
    clean_enrollments['withdrawal_reason_racial'] = np.nan
    # Drop invalid IPEDS ids (more than 6 digits).
    # BUG FIX: DataFrame.convert_objects was deprecated in pandas 0.17 and
    # removed in 0.23; pd.to_numeric(errors='coerce') is the replacement.
    clean_enrollments['college_ncesid'] = pd.to_numeric(
        clean_enrollments['college_ncesid'], errors='coerce')
    clean_enrollments.loc[clean_enrollments['college_ncesid'] > 999999,
                          'college_ncesid'] = np.nan
    clean_enrollments['college_ncesid'] = clean_enrollments[
        'college_ncesid'].apply(int_with_NaN_tostr)
    clean_enrollments.drop(['transfer_reason__c', 'college_salesforce_id'],
                           axis=1, inplace=True)
    clean_enrollments.rename(columns={
        'student_salesforce_id': 'kipp_nj_sf_id',
        'college_ncesid': 'ipedsid'
    }, inplace=True)
    return clean_enrollments