def students_table(kipp_nj_students):
    '''Cleans KIPP NJ student data to match our database schema.

    Takes the raw KIPP NJ student dataframe and returns a new dataframe
    whose columns match our students table.
    '''
    # we don't load names into our DB
    clean_students = kipp_nj_students.drop('name', axis=1)
    # rename columns to match our schema
    clean_students.rename(columns={'contact_id': 'kipp_nj_sf_id',
                                   'powerschool_id': 'kipp_nj_powerschool_id'},
                          inplace=True)
    # make DOB a date
    clean_students.date_of_birth = pd.to_datetime(clean_students.date_of_birth)
    # remap ethnicities onto our coded values
    clean_students.ethnicity = clean_students.ethnicity.apply(
        lambda x: map_value_from_dict(ethnicity_fixed_mapping, x))
    # name of partner organization
    clean_students['network'] = 'KIPP_NJ'
    # get rid of commas in income values (confuses psycopg2)
    clean_students.family_income_bracket = \
        clean_students.family_income_bracket.str.replace(',', '')
    # change number in household to int.
    # BUG FIX: drop NaNs from this column only. The original used
    # clean_students.dropna(), which drops every row with a NaN in ANY
    # column and so silently discarded valid household counts; the
    # column-level dropna matches how zip is handled below.
    clean_students.number_in_household = \
        clean_students.number_in_household.dropna().astype(int)
    clean_students.number_in_household = \
        clean_students.number_in_household.apply(int_with_NaN_tostr)
    # deal with missing leading 0 on zip codes
    clean_students.zip = \
        clean_students.zip.dropna().astype(int).astype(str).apply(fix_zip)
    # drop the student with duplicated salesforce ids
    clean_students = clean_students.drop_duplicates(
        subset=['kipp_nj_powerschool_id'])
    # reorder columns
    clean_students = clean_students[[
        'kipp_nj_sf_id', 'cps_id', 'kipp_nj_powerschool_id', 'network',
        'date_of_birth', 'ethnicity', 'is_female', 'ever_special_ed',
        'ever_free_lunch', 'family_income_bracket', 'number_in_household',
        'is_first_gen', 'zip', 'fafsa_efc']]
    # return clean df
    return clean_students
def students(noble_students):
    """Build the clean students dataframe from Noble's raw student export."""
    out = pd.DataFrame()
    # ids and partner-network label
    out['noble_sf_id'] = noble_students['Id']
    out['cps_id'] = noble_students['Network_Student_ID__c']
    out['network'] = 'Noble'
    # parse birthdates in place, then null out the impossible ones
    noble_students['Birthdate'] = pd.to_datetime(noble_students['Birthdate'],
                                                 format='%m/%d/%Y')
    out['date_of_birth'] = noble_students['Birthdate'].apply(
        wrong_birthdates_to_null)
    # gender as a boolean
    out['is_female'] = noble_students['Gender__c'].fillna('').apply(code_gender)
    # ethnicity remapped onto our coded values
    out['ethnicity'] = noble_students['Ethnicity__c'].apply(
        lambda eth: map_value_from_dict(ethnicity_fixed_mapping, eth))
    # special-ed / free-lunch / first-gen flags pass through as-is
    out['ever_special_ed'] = noble_students['Special_Education__c']
    out['ever_free_lunch'] = noble_students['Low_Income__c']
    out['is_first_gen'] = noble_students['First_Generation_College_Student__c']
    # expected family contribution from the FAFSA
    out['fafsa_efc'] = noble_students['EFC_from_FAFSA__c']
    # columns Noble doesn't supply stay empty for now
    fill_empty_cols(out, ['family_income_bracket', 'number_in_household',
                          'zip'])
    return out
def students(noble_students):
    """Clean Noble student data into our students table schema.

    Builds a fresh dataframe column-by-column from the raw Salesforce
    export and returns it.

    NOTE(review): this mutates the caller's `noble_students` frame (the
    Birthdate column is parsed to datetime in place) — confirm callers
    expect that side effect.
    """
    # initialize clean dataframe
    clean_students = pd.DataFrame()
    clean_students['noble_sf_id'] = noble_students.Id
    clean_students['cps_id'] = noble_students.Network_Student_ID__c
    clean_students['network'] = 'Noble'
    # add DOB
    noble_students['Birthdate'] = pd.to_datetime(noble_students['Birthdate'],
                                                 format='%m/%d/%Y')
    # out-of-range birthdates are replaced with null
    clean_students['date_of_birth'] = noble_students.Birthdate.apply(
        wrong_birthdates_to_null)
    # add gender as a boolean
    clean_students['is_female'] = noble_students.Gender__c.fillna('').apply(
        code_gender)
    # add fixed ethnicity
    clean_students['ethnicity'] = noble_students['Ethnicity__c'].apply(
        lambda x: map_value_from_dict(ethnicity_fixed_mapping, x))

    ## add ever_special need and free lunch as booleans
    clean_students['ever_special_ed'] = noble_students['Special_Education__c']
    clean_students['ever_free_lunch'] = noble_students['Low_Income__c']
    clean_students['is_first_gen'] = noble_students[
        'First_Generation_College_Student__c']
    # add EFC (expected family contribution from the FAFSA)
    clean_students['fafsa_efc'] = noble_students.EFC_from_FAFSA__c
    # add all the empty columns for now
    missing_cols = ['family_income_bracket', 'number_in_household', 'zip']
    fill_empty_cols(clean_students, missing_cols)
    # reorder columns
    return clean_students
# Example #4 (score: 0)
def hs_enrollment_table(kipp_nj_enrollments):
    """Return the cleaned high-school enrollment rows from KIPP NJ data."""
    # keep only the high school enrollments
    hs = kipp_nj_enrollments[
        kipp_nj_enrollments.degree_type == 'High School Diploma']
    # drop statuses we don't track, then recode the rest
    excluded = ['Other', 'Did Not Enroll', 'Deferred']
    hs = hs[~hs.status.isin(excluded)]
    hs.status = hs.status.apply(
        lambda s: map_value_from_dict(status_fixed_mapping, s))
    # parse the date columns
    hs.start_date = pd.to_datetime(hs.start_date)
    hs.end_date = pd.to_datetime(hs.end_date)
    # graduated students' end year becomes their high school class
    hs['high_school_class'] = hs.status.combine(hs.end_date,
                                                func=extract_grad_year)
    hs.rename(columns={'student_salesforce_id': 'kipp_nj_sf_id',
                       'college_salesforce_id': 'kipp_nj_sf_school_id',
                       'status': 'exit_type'},
              inplace=True)
    # drop columns that only matter for college enrollments
    hs = hs.drop(['college_ncesid', 'data_source', 'date_last_verified',
                  'degree_subject', 'degree_type', 'living_on_campus',
                  'major', 'transfer_reason__c'], axis=1)
    return hs
# Example #5 (score: 0)
def contacts_table(kipp_nj_contacts):
    '''Cleans KIPP NJ contact data to match our database schema.

    (Docstring fixed: this cleans contact data, not enrollment data.)
    Mutates the input frame in place, then returns the reordered columns.
    '''
    # correct the id column
    kipp_nj_contacts.rename(columns={'student_salesforce_id': 'kipp_nj_sf_id'},
                            inplace=True)
    # map the unix timestamps to dates
    kipp_nj_contacts['contact_date'] = pd.to_datetime(
        kipp_nj_contacts['contact_date'], unit='s')
    # remove the time component
    kipp_nj_contacts['contact_date'] = kipp_nj_contacts['contact_date'].apply(
        lambda x: x.date())
    # map the contact mediums onto our coded values
    kipp_nj_contacts['contact_medium'] = kipp_nj_contacts[
        'contact_medium'].apply(
            lambda x: map_value_from_dict(medium_fixed_mapping, x))
    # kipp nj doesn't keep track of mass email outreach, so was_outreach is
    # always false.  BUG FIX: use item assignment — attribute assignment
    # (df.was_outreach = False) creates an instance attribute instead of a
    # column when the column doesn't already exist, and the column
    # selection below would then fail.
    kipp_nj_contacts['was_outreach'] = False
    # was_successful is coded as true or nan, replace nans with False
    kipp_nj_contacts['was_successful'] = kipp_nj_contacts[
        'was_successful'].fillna(False)
    # reorder columns
    clean_contacts = kipp_nj_contacts[[
        'kipp_nj_sf_id', 'contact_date', 'counselor_id', 'contact_medium',
        'initiated_by_student', 'was_outreach', 'was_successful'
    ]]
    return clean_contacts
def enrollments_table(kipp_nj_enrollments):
    '''Cleans KIPP NJ enrollment data to match our database schema.

    Keeps only college enrollments with a tracked status, recodes statuses,
    degree types/subjects, majors and withdrawal reasons, and validates the
    IPEDS id before renaming columns to our schema.
    '''
    # remove the did not enroll, other, and deferred enrollment types
    clean_enrollments = kipp_nj_enrollments[
        ~kipp_nj_enrollments.status.isin(
            ['Other', 'Did Not Enroll', 'Deferred'])]
    # rename Withdrawn to withdrew, matriculated to matriculating
    clean_enrollments.status = clean_enrollments.status.apply(
        lambda x: map_value_from_dict(status_fixed_mapping, x))
    # make dates into dates
    clean_enrollments.start_date = pd.to_datetime(clean_enrollments.start_date)
    clean_enrollments.end_date = pd.to_datetime(clean_enrollments.end_date)
    clean_enrollments.date_last_verified = pd.to_datetime(
        clean_enrollments.date_last_verified)
    # only interested in college enrollments: drop HS diploma and GED rows
    clean_enrollments = clean_enrollments[
        ~clean_enrollments.degree_type.isin(['High School Diploma', 'GED'])]
    clean_enrollments.degree_type = clean_enrollments.degree_type.apply(
        lambda x: map_value_from_dict(degree_fixed_mapping, x))
    # clean up degree subject
    clean_enrollments.degree_subject = clean_enrollments.degree_type.combine(
        clean_enrollments.degree_subject, func=code_degree_subject)
    # clean up major: convert free text to coded options
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')
    clean_enrollments['major'] = clean_enrollments.major.fillna(
        'missing').apply(lambda x: convert_free_text(major_dict, x))
    # map transfer reasons to the reasons we keep track of.
    # NOTE(review): the column list assumes exactly these seven reason
    # values appear in the data — confirm the categories are stable.
    withdrawal_reasons = pd.get_dummies(
        clean_enrollments.transfer_reason__c).astype(bool)
    withdrawal_reasons.columns = [
        'withdrawal_reason_academic', 'withdrawal_reason_career',
        'withdrawal_reason_financial', 'withdrawal_reason_other',
        'withdrawal_reason_placement', 'withdrawal_reason_relocation',
        'withdrawal_reason_social']
    withdrawal_reasons.drop(
        ['withdrawal_reason_relocation', 'withdrawal_reason_placement',
         'withdrawal_reason_other', 'withdrawal_reason_career'],
        axis=1, inplace=True)
    # join the columns back into the original
    clean_enrollments = clean_enrollments.join(withdrawal_reasons)
    # reasons KIPP NJ doesn't record at all
    clean_enrollments['withdrawal_reason_motivational'] = np.nan
    clean_enrollments['withdrawal_reason_family'] = np.nan
    clean_enrollments['withdrawal_reason_health'] = np.nan
    clean_enrollments['withdrawal_reason_racial'] = np.nan

    # Drop invalid IPEDS id (valid ids have at most six digits).
    # BUG FIX: DataFrame.convert_objects was deprecated in pandas 0.17 and
    # later removed; pd.to_numeric(..., errors='coerce') is the supported
    # equivalent (non-numeric values become NaN).
    clean_enrollments['college_ncesid'] = pd.to_numeric(
        clean_enrollments['college_ncesid'], errors='coerce')
    clean_enrollments.loc[clean_enrollments['college_ncesid'] > 999999,
                          'college_ncesid'] = np.nan
    clean_enrollments['college_ncesid'] = clean_enrollments[
        'college_ncesid'].apply(int_with_NaN_tostr)

    clean_enrollments.drop(['transfer_reason__c', 'college_salesforce_id'],
                           axis=1, inplace=True)
    clean_enrollments.rename(columns={
        'student_salesforce_id': 'kipp_nj_sf_id',
        'college_ncesid': 'ipedsid'}, inplace=True)
    return clean_enrollments
def hs_enrollment_table(kipp_nj_enrollments):
    """Extract and clean the high-school enrollments from KIPP NJ data."""
    # restrict to high school rows
    is_hs = kipp_nj_enrollments.degree_type == 'High School Diploma'
    high_school_enr = kipp_nj_enrollments[is_hs]
    # keep only statuses we track, then recode them
    high_school_enr = high_school_enr[~high_school_enr.status.isin(
        ['Other', 'Did Not Enroll', 'Deferred'])]
    high_school_enr.status = high_school_enr.status.apply(
        lambda status: map_value_from_dict(status_fixed_mapping, status))
    # parse start/end dates
    high_school_enr.start_date = pd.to_datetime(high_school_enr.start_date)
    high_school_enr.end_date = pd.to_datetime(high_school_enr.end_date)
    # graduated students' end year becomes their high school class
    high_school_enr['high_school_class'] = high_school_enr.status.combine(
        high_school_enr.end_date, func=extract_grad_year)
    renames = {'student_salesforce_id': 'kipp_nj_sf_id',
               'college_salesforce_id': 'kipp_nj_sf_school_id',
               'status': 'exit_type'}
    high_school_enr.rename(columns=renames, inplace=True)
    # these columns only apply to college enrollments
    irrelevant = ['college_ncesid', 'data_source', 'date_last_verified',
                  'degree_subject', 'degree_type', 'living_on_campus',
                  'major', 'transfer_reason__c']
    return high_school_enr.drop(irrelevant, axis=1)
def contacts_table(kipp_nj_contacts):
    '''Cleans KIPP NJ contact data to match our database schema.

    (Docstring fixed: this cleans contact data, not enrollment data.)
    Mutates the input frame in place, then returns the reordered columns.
    '''
    # correct the id column
    kipp_nj_contacts.rename(columns={'student_salesforce_id': 'kipp_nj_sf_id'},
                            inplace=True)
    # map the unix timestamps to dates
    kipp_nj_contacts['contact_date'] = pd.to_datetime(
        kipp_nj_contacts['contact_date'], unit='s')
    # remove the time component
    kipp_nj_contacts['contact_date'] = kipp_nj_contacts['contact_date'].apply(
        lambda x: x.date())
    # map the contact mediums onto our coded values
    kipp_nj_contacts['contact_medium'] = kipp_nj_contacts[
        'contact_medium'].apply(
            lambda x: map_value_from_dict(medium_fixed_mapping, x))
    # kipp nj doesn't keep track of mass email outreach, so was_outreach is
    # always false.  BUG FIX: use item assignment — attribute assignment
    # (df.was_outreach = False) creates an instance attribute instead of a
    # column when the column doesn't already exist, and the column
    # selection below would then fail.
    kipp_nj_contacts['was_outreach'] = False
    # was_successful is coded as true or nan, replace nans with False
    kipp_nj_contacts['was_successful'] = kipp_nj_contacts[
        'was_successful'].fillna(False)
    # reorder columns
    clean_contacts = kipp_nj_contacts[[
        'kipp_nj_sf_id', 'contact_date', 'counselor_id', 'contact_medium',
        'initiated_by_student', 'was_outreach', 'was_successful'
    ]]
    return clean_contacts
# Example #9 (score: 0)
def students_table(kipp_nj_students):
    '''Cleans KIPP NJ student data to match our database schema.

    Takes the raw KIPP NJ student dataframe and returns a new dataframe
    whose columns match our students table.
    '''
    # we don't load names into our DB
    clean_students = kipp_nj_students.drop('name', axis=1)
    # rename columns to match our schema
    clean_students.rename(columns={
        'contact_id': 'kipp_nj_sf_id',
        'powerschool_id': 'kipp_nj_powerschool_id'
    },
                          inplace=True)
    # make DOB a date
    clean_students.date_of_birth = pd.to_datetime(clean_students.date_of_birth)
    # remap ethnicities onto our coded values
    clean_students.ethnicity = clean_students.ethnicity.apply(
        lambda x: map_value_from_dict(ethnicity_fixed_mapping, x))
    # name of partner organization
    clean_students['network'] = 'KIPP_NJ'
    # get rid of commas in income values (confuses psycopg2)
    clean_students.family_income_bracket = \
        clean_students.family_income_bracket.str.replace(',', '')
    # change number in household to int.
    # BUG FIX: drop NaNs from this column only. The original used
    # clean_students.dropna(), which drops every row with a NaN in ANY
    # column and so silently discarded valid household counts; the
    # column-level dropna matches how zip is handled below.
    clean_students.number_in_household = \
        clean_students.number_in_household.dropna().astype(int)
    clean_students.number_in_household = \
        clean_students.number_in_household.apply(int_with_NaN_tostr)
    # deal with missing leading 0 on zip codes
    clean_students.zip = clean_students.zip.dropna().astype(int).astype(
        str).apply(fix_zip)
    # drop the student with duplicated salesforce ids
    clean_students = clean_students.drop_duplicates(
        subset=['kipp_nj_powerschool_id'])
    # reorder columns
    clean_students = clean_students[[
        'kipp_nj_sf_id', 'cps_id', 'kipp_nj_powerschool_id', 'network',
        'date_of_birth', 'ethnicity', 'is_female', 'ever_special_ed',
        'ever_free_lunch', 'family_income_bracket', 'number_in_household',
        'is_first_gen', 'zip', 'fafsa_efc'
    ]]
    # return clean df
    return clean_students
def contacts(contacts, alumni):
    """Clean Noble contact-event data into our contacts table schema.

    Joins contacts to alumni so each event can be attributed to the most
    plausible counselor (batch uploads were recorded under a couple of
    admin ids that get overwritten).

    NOTE(review): mutates the caller's `contacts` frame — the
    Date_of_Contact__c column is parsed to datetime in place.
    """
    #initialize clean df
    clean_contacts = pd.DataFrame()
    # add student id, initiated by student
    clean_contacts['noble_sf_id'] = contacts.Contact__c
    clean_contacts['initiated_by_student'] = contacts.Initiated_by_alum__c
    # convert contact date to datetime
    contacts.Date_of_Contact__c = pd.to_datetime(contacts.Date_of_Contact__c)
    # set weird dates to null
    clean_contacts['contact_date'] = contacts.Date_of_Contact__c.apply(wrong_dates_to_null)
    # clean medium to fit table constraints
    clean_contacts['contact_medium'] = contacts.Mode_of_Communication__c.apply(lambda x: map_value_from_dict(medium_fixed_mapping,x))
    # convert communication status to successful boolean
    #note Noble doesn't keep track of outreach mass email, so was_outreach is always False
    clean_contacts['was_outreach'] = False
    clean_contacts['was_successful'] = contacts.Comm_Status__c.apply(code_contact_status)
    # clean up counselor IDs, because many contact events are batch uploaded by two people, need to fix these CreatedByIds to be the relevant counselor
    # merge contacts with alums to match students to the schools they went to
    merged_contacts = pd.merge(contacts, alumni, how = 'left', left_on = 'Contact__c', right_on = 'Id')
    # ids we want to overwrite are for Matt and a data manager, and some other random IDs
    irrelevant_ids = ['005E0000000GphFIAS', '005E00000048sScIAI']
    #recode these ids as missing
    merged_contacts['CreatedById'] = merged_contacts['CreatedById'].apply(lambda x:'missing' if x in irrelevant_ids else x)
    # get the counselors for every school that aren't the irrelevant ones
    school_counselors = merged_contacts[merged_contacts['CreatedById'] != 'missing'].groupby(['High School', 'CreatedById']).size()
    # get the most common counselor for every school
    # (idxmax over the per-school group picks the (school, counselor)
    # pair with the highest event count)
    max_mask = school_counselors.groupby(level=0).agg('idxmax')
    school_counselors = school_counselors.loc[max_mask]
    school_counselors = school_counselors.reset_index()
    #get the most common counselors for every student
    student_counselors = merged_contacts[merged_contacts['CreatedById'] != 'missing'].groupby(['Contact__c', 'CreatedById']).size()
    student_max_mask = student_counselors.groupby(level=0).agg('idxmax')
    student_counselors = student_counselors.loc[student_max_mask]
    student_counselors = student_counselors.reset_index()
    #figure out whether there are students who have only ever been loaded by those missing IDs
    students_with_unique_ids_missing  = merged_contacts.groupby('Contact__c').filter(lambda x: x.CreatedById.nunique() ==1)
    students_with_unique_ids_missing = students_with_unique_ids_missing[students_with_unique_ids_missing['CreatedById'] == 'missing']
    students_with_unique_ids_missing = students_with_unique_ids_missing.groupby(['Contact__c', 'CreatedById']).size().reset_index()['Contact__c']
    #save the new ids
    clean_contacts['counselor_id'] = merged_contacts.apply(lambda x: overwrite_ids(x,school_counselors, student_counselors, students_with_unique_ids_missing), axis =1)
    #reorder columns 
    clean_contacts = clean_contacts[['noble_sf_id','contact_date', 'counselor_id', 'contact_medium', 'initiated_by_student', 'was_outreach', 'was_successful']]
    return clean_contacts
def colleges(colleges, extra_college_features):
    '''Combine Noble's college list with extra IPEDS-derived features.

    Builds the clean colleges dataframe from the Noble Salesforce export,
    then left-merges in extra features keyed on the IPEDS unit id.
    '''
    # Store CollegeID_Noble
    df_collegesql = pd.DataFrame(colleges['College ID'])
    df_collegesql.columns = ['noble_sf_college_id']
    df_collegesql['ipedid'] = colleges['NCESid__c']
    # recode the college-type text into boolean flags
    df_collegesql['isprivate'] = colleges['College_Type__c'].fillna('').apply(
        code_isprivate)
    df_collegesql['isforprofit'] = colleges['College_Type__c'].fillna(
        '').apply(code_isforprofit)
    df_collegesql['is4year'] = colleges['College_Type__c'].fillna('').apply(
        code_is4year)
    # first five digits of the shipping postal code.
    # BUG FIX: raw string — '\D' is an invalid escape in a plain string and
    # warns on modern Python — and expand=False so extract returns a
    # Series rather than a one-column DataFrame.
    df_collegesql['zip'] = colleges.ShippingPostalCode.str.extract(
        r'^([0-9]{5})(?:\D|$)', expand=False)
    # Store Name (strip commas)
    df_collegesql['name'] = colleges.Name.str.replace(',', '')

    # Not available from this source
    df_collegesql['isrural'] = np.nan
    df_collegesql['allmale'] = np.nan
    df_collegesql['allfemale'] = np.nan

    # Store graduation rates and transfer rates
    df_collegesql['graduationrate_6yr'] = colleges[
        '6_yr_completion_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['graduationrate_minority_6yr'] = colleges[
        '6_yr_minority_completion_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['transferrate_6yr'] = colleges[
        '6_yr_transfer_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['transferrate_minority_6yr'] = colleges[
        '6_yr_minority_transfer_rate__c'].apply(int_with_NaN_tostr)

    # Store historicallyblack
    df_collegesql['historicallyblack'] = colleges['HBCU__c']

    # add in extra college features; build a second frame to merge later
    extra_college = pd.DataFrame(extra_college_features['UNITID'])
    extra_college['state'] = extra_college_features['STABBR']
    extra_college['longitude'] = extra_college_features['Longitude']
    extra_college['latitude'] = extra_college_features['Latitude']
    extra_college['dist_from_chicago'] = extra_college_features[
        'DistFromChicago']
    # set missings and 2 year colleges to null
    extra_college['barrons_rating'] = \
        extra_college_features.SimpleBarrons.apply(
            lambda x: map_value_from_dict(selectivity_mapping, x))
    # convert percentages to numbers
    extra_college['perc_accepted'] = extra_college_features[
        '%Apply_Accepted'].str.replace('%', '')
    extra_college['perc_accepted_enroll'] = extra_college_features[
        '%Accepted_Enroll'].str.replace('%', '')
    extra_college['perc_male'] = extra_college_features[
        '% male'].str.replace('%', '')
    extra_college['perc_female'] = extra_college_features[
        '% female'].str.replace('%', '')
    extra_college['perc_african_american'] = extra_college_features[
        '% AA'].str.replace('%', '')
    extra_college['perc_hispanic'] = extra_college_features[
        '% Hispanic'].str.replace('%', '')
    extra_college['percentinstate'] = extra_college_features['PercentInState']
    extra_college['percentoutofstate'] = extra_college_features[
        'PercentOutOfState']
    extra_college['percentpellgrant'] = extra_college_features[
        'PercentPellGrant']
    extra_college[[
        'avgnetprice', 'netprice0_30', 'netprice30_48', 'netprice48_75'
    ]] = extra_college_features[[
        'AvgNetPrice', 'NetPrice0-30', 'NetPrice30-48', 'NetPrice48-75'
    ]]
    extra_college['locale'] = extra_college_features['Locale']
    extra_college['size_range'] = extra_college_features[
        'Size Range'].str.replace(',', '')
    # set not reported and not applicable to null
    extra_college['size_range'] = extra_college['size_range'].apply(
        lambda x: map_value_from_dict(size_range_mapping, x))

    # join the two frames on the IPEDS id
    clean_colleges = pd.merge(df_collegesql, extra_college, how='left',
                              left_on='ipedid', right_on='UNITID')
    clean_colleges = clean_colleges.drop(['UNITID'], axis=1)
    # fix nulls for upload to sql
    clean_colleges.ipedid = clean_colleges.ipedid.apply(int_with_NaN_tostr)
    # drop two randomly duplicated colleges found through manual inspection
    clean_colleges = clean_colleges[~clean_colleges.noble_sf_college_id.isin(
        ['001E000000Sg2wPIAR', '001E000000Sg2wQIAR'])]
    return clean_colleges
def contacts(contacts, alumni):
    """Clean Noble contact-event data into our contacts table schema.

    Joins contacts to alumni so each event can be attributed to the most
    plausible counselor (batch uploads were recorded under a couple of
    admin ids that get overwritten).

    NOTE(review): mutates the caller's `contacts` frame — the
    Date_of_Contact__c column is parsed to datetime in place.
    """
    #initialize clean df
    clean_contacts = pd.DataFrame()
    # add student id, initiated by student
    clean_contacts['noble_sf_id'] = contacts.Contact__c
    clean_contacts['initiated_by_student'] = contacts.Initiated_by_alum__c
    # convert contact date to datetime
    contacts.Date_of_Contact__c = pd.to_datetime(contacts.Date_of_Contact__c)
    # set weird dates to null
    clean_contacts['contact_date'] = contacts.Date_of_Contact__c.apply(
        wrong_dates_to_null)
    # clean medium to fit table constraints
    clean_contacts['contact_medium'] = contacts.Mode_of_Communication__c.apply(
        lambda x: map_value_from_dict(medium_fixed_mapping, x))
    # convert communication status to successful boolean
    #note Noble doesn't keep track of outreach mass email, so was_outreach is always False
    clean_contacts['was_outreach'] = False
    clean_contacts['was_successful'] = contacts.Comm_Status__c.apply(
        code_contact_status)
    # clean up counselor IDs, because many contact events are batch uploaded by two people, need to fix these CreatedByIds to be the relevant counselor
    # merge contacts with alums to match students to the schools they went to
    merged_contacts = pd.merge(contacts,
                               alumni,
                               how='left',
                               left_on='Contact__c',
                               right_on='Id')
    # ids we want to overwrite are for Matt and a data manager, and some other random IDs
    irrelevant_ids = ['005E0000000GphFIAS', '005E00000048sScIAI']
    #recode these ids as missing
    merged_contacts['CreatedById'] = merged_contacts['CreatedById'].apply(
        lambda x: 'missing' if x in irrelevant_ids else x)
    # get the counselors for every school that aren't the irrelevant ones
    school_counselors = merged_contacts[
        merged_contacts['CreatedById'] != 'missing'].groupby(
            ['High School', 'CreatedById']).size()
    # get the most common counselor for every school
    # (idxmax over the per-school group picks the (school, counselor)
    # pair with the highest event count)
    max_mask = school_counselors.groupby(level=0).agg('idxmax')
    school_counselors = school_counselors.loc[max_mask]
    school_counselors = school_counselors.reset_index()
    #get the most common counselors for every student
    student_counselors = merged_contacts[
        merged_contacts['CreatedById'] != 'missing'].groupby(
            ['Contact__c', 'CreatedById']).size()
    student_max_mask = student_counselors.groupby(level=0).agg('idxmax')
    student_counselors = student_counselors.loc[student_max_mask]
    student_counselors = student_counselors.reset_index()
    #figure out whether there are students who have only ever been loaded by those missing IDs
    students_with_unique_ids_missing = merged_contacts.groupby(
        'Contact__c').filter(lambda x: x.CreatedById.nunique() == 1)
    students_with_unique_ids_missing = students_with_unique_ids_missing[
        students_with_unique_ids_missing['CreatedById'] == 'missing']
    students_with_unique_ids_missing = students_with_unique_ids_missing.groupby(
        ['Contact__c', 'CreatedById']).size().reset_index()['Contact__c']
    #save the new ids
    clean_contacts['counselor_id'] = merged_contacts.apply(
        lambda x: overwrite_ids(x, school_counselors, student_counselors,
                                students_with_unique_ids_missing),
        axis=1)
    #reorder columns
    clean_contacts = clean_contacts[[
        'noble_sf_id', 'contact_date', 'counselor_id', 'contact_medium',
        'initiated_by_student', 'was_outreach', 'was_successful'
    ]]
    return clean_contacts
def colleges(colleges, extra_college_features):
    '''Combine Noble's college list with extra IPEDS-derived features.

    Builds the clean colleges dataframe from the Noble Salesforce export,
    then left-merges in extra features keyed on the IPEDS unit id.
    '''
    # Store CollegeID_Noble
    df_collegesql = pd.DataFrame(colleges['College ID'])
    df_collegesql.columns = ['noble_sf_college_id']
    df_collegesql['ipedid'] = colleges['NCESid__c']
    # recode the college-type text into boolean flags
    df_collegesql['isprivate'] = colleges['College_Type__c'].fillna('').apply(
        code_isprivate)
    df_collegesql['isforprofit'] = colleges['College_Type__c'].fillna(
        '').apply(code_isforprofit)
    df_collegesql['is4year'] = colleges['College_Type__c'].fillna('').apply(
        code_is4year)
    # first five digits of the shipping postal code.
    # BUG FIX: raw string — '\D' is an invalid escape in a plain string and
    # warns on modern Python — and expand=False so extract returns a
    # Series rather than a one-column DataFrame.
    df_collegesql['zip'] = colleges.ShippingPostalCode.str.extract(
        r'^([0-9]{5})(?:\D|$)', expand=False)
    # Store Name (strip commas)
    df_collegesql['name'] = colleges.Name.str.replace(',', '')

    # Not available from this source
    df_collegesql['isrural'] = np.nan
    df_collegesql['allmale'] = np.nan
    df_collegesql['allfemale'] = np.nan

    # Store graduation rates and transfer rates
    df_collegesql['graduationrate_6yr'] = colleges[
        '6_yr_completion_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['graduationrate_minority_6yr'] = colleges[
        '6_yr_minority_completion_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['transferrate_6yr'] = colleges[
        '6_yr_transfer_rate__c'].apply(int_with_NaN_tostr)
    df_collegesql['transferrate_minority_6yr'] = colleges[
        '6_yr_minority_transfer_rate__c'].apply(int_with_NaN_tostr)

    # Store historicallyblack
    df_collegesql['historicallyblack'] = colleges['HBCU__c']

    # add in extra college features; build a second frame to merge later
    extra_college = pd.DataFrame(extra_college_features['UNITID'])
    extra_college['state'] = extra_college_features['STABBR']
    extra_college['longitude'] = extra_college_features['Longitude']
    extra_college['latitude'] = extra_college_features['Latitude']
    extra_college['dist_from_chicago'] = extra_college_features[
        'DistFromChicago']
    # set missings and 2 year colleges to null
    extra_college['barrons_rating'] = \
        extra_college_features.SimpleBarrons.apply(
            lambda x: map_value_from_dict(selectivity_mapping, x))
    # convert percentages to numbers
    extra_college['perc_accepted'] = extra_college_features[
        '%Apply_Accepted'].str.replace('%', '')
    extra_college['perc_accepted_enroll'] = extra_college_features[
        '%Accepted_Enroll'].str.replace('%', '')
    extra_college['perc_male'] = extra_college_features[
        '% male'].str.replace('%', '')
    extra_college['perc_female'] = extra_college_features[
        '% female'].str.replace('%', '')
    extra_college['perc_african_american'] = extra_college_features[
        '% AA'].str.replace('%', '')
    extra_college['perc_hispanic'] = extra_college_features[
        '% Hispanic'].str.replace('%', '')
    extra_college['percentinstate'] = extra_college_features['PercentInState']
    extra_college['percentoutofstate'] = extra_college_features[
        'PercentOutOfState']
    extra_college['percentpellgrant'] = extra_college_features[
        'PercentPellGrant']
    extra_college[[
        'avgnetprice', 'netprice0_30', 'netprice30_48', 'netprice48_75'
    ]] = extra_college_features[[
        'AvgNetPrice', 'NetPrice0-30', 'NetPrice30-48', 'NetPrice48-75'
    ]]
    extra_college['locale'] = extra_college_features['Locale']
    extra_college['size_range'] = extra_college_features[
        'Size Range'].str.replace(',', '')
    # set not reported and not applicable to null
    extra_college['size_range'] = extra_college['size_range'].apply(
        lambda x: map_value_from_dict(size_range_mapping, x))

    # join the two frames on the IPEDS id
    clean_colleges = pd.merge(df_collegesql, extra_college, how='left',
                              left_on='ipedid', right_on='UNITID')
    clean_colleges = clean_colleges.drop(['UNITID'], axis=1)
    # fix nulls for upload to sql
    clean_colleges.ipedid = clean_colleges.ipedid.apply(int_with_NaN_tostr)
    # drop two randomly duplicated colleges found through manual inspection
    clean_colleges = clean_colleges[~clean_colleges.noble_sf_college_id.isin(
        ['001E000000Sg2wPIAR', '001E000000Sg2wQIAR'])]
    return clean_colleges
# Example #14 (score: 0)
def enrollments_table(kipp_nj_enrollments):
    '''Cleans KIPP NJ enrollment data to match our database schema.

    Keeps only college enrollments with a tracked status, recodes statuses,
    degree types/subjects, majors and withdrawal reasons, and validates the
    IPEDS id before renaming columns to our schema.
    '''
    # remove the did not enroll, other, and deferred enrollment types
    clean_enrollments = kipp_nj_enrollments[
        ~kipp_nj_enrollments.status.isin(
            ['Other', 'Did Not Enroll', 'Deferred'])]
    # rename Withdrawn to withdrew, matriculated to matriculating
    clean_enrollments.status = clean_enrollments.status.apply(
        lambda x: map_value_from_dict(status_fixed_mapping, x))
    # make dates into dates
    clean_enrollments.start_date = pd.to_datetime(clean_enrollments.start_date)
    clean_enrollments.end_date = pd.to_datetime(clean_enrollments.end_date)
    clean_enrollments.date_last_verified = pd.to_datetime(
        clean_enrollments.date_last_verified)
    # only interested in college enrollments: drop HS diploma and GED rows
    clean_enrollments = clean_enrollments[
        ~clean_enrollments.degree_type.isin(['High School Diploma', 'GED'])]
    clean_enrollments.degree_type = clean_enrollments.degree_type.apply(
        lambda x: map_value_from_dict(degree_fixed_mapping, x))
    # clean up degree subject
    clean_enrollments.degree_subject = clean_enrollments.degree_type.combine(
        clean_enrollments.degree_subject, func=code_degree_subject)
    # clean up major: convert free text to coded options
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')
    clean_enrollments['major'] = clean_enrollments.major.fillna(
        'missing').apply(lambda x: convert_free_text(major_dict, x))
    # map transfer reasons to the reasons we keep track of.
    # NOTE(review): the column list assumes exactly these seven reason
    # values appear in the data — confirm the categories are stable.
    withdrawal_reasons = pd.get_dummies(
        clean_enrollments.transfer_reason__c).astype(bool)
    withdrawal_reasons.columns = [
        'withdrawal_reason_academic', 'withdrawal_reason_career',
        'withdrawal_reason_financial', 'withdrawal_reason_other',
        'withdrawal_reason_placement', 'withdrawal_reason_relocation',
        'withdrawal_reason_social']
    withdrawal_reasons.drop(
        ['withdrawal_reason_relocation', 'withdrawal_reason_placement',
         'withdrawal_reason_other', 'withdrawal_reason_career'],
        axis=1, inplace=True)
    # join the columns back into the original
    clean_enrollments = clean_enrollments.join(withdrawal_reasons)
    # reasons KIPP NJ doesn't record at all
    clean_enrollments['withdrawal_reason_motivational'] = np.nan
    clean_enrollments['withdrawal_reason_family'] = np.nan
    clean_enrollments['withdrawal_reason_health'] = np.nan
    clean_enrollments['withdrawal_reason_racial'] = np.nan

    # Drop invalid IPEDS id (valid ids have at most six digits).
    # BUG FIX: DataFrame.convert_objects was deprecated in pandas 0.17 and
    # later removed; pd.to_numeric(..., errors='coerce') is the supported
    # equivalent (non-numeric values become NaN).
    clean_enrollments['college_ncesid'] = pd.to_numeric(
        clean_enrollments['college_ncesid'], errors='coerce')
    clean_enrollments.loc[clean_enrollments['college_ncesid'] > 999999,
                          'college_ncesid'] = np.nan
    clean_enrollments['college_ncesid'] = clean_enrollments[
        'college_ncesid'].apply(int_with_NaN_tostr)

    clean_enrollments.drop(['transfer_reason__c', 'college_salesforce_id'],
                           axis=1, inplace=True)
    clean_enrollments.rename(columns={
        'student_salesforce_id': 'kipp_nj_sf_id',
        'college_ncesid': 'ipedsid'}, inplace=True)
    return clean_enrollments