def deal_with_dummies(df, cohort):
    if isinstance(df, str):
        df = ml.read_data(df)

    ###################################
    ## CREATE DUMMY VARIABLE COLUMNS ##
    ###################################
    print "Creating dummy variables..."
    # School IDs are categorical, so cast them to strings before encoding
    school_ids = [col for col in df.columns if 'school_id' in col]
    df[school_ids] = df.loc[:, school_ids].astype(str, copy=False)
    string_cols = list(df.select_dtypes(include=['object']))
    dummies = pd.get_dummies(df[string_cols], dummy_na=True)
    df = pd.concat([df, dummies], axis=1)
    df.drop(string_cols, axis=1, inplace=True)

    ## Save clean version
    return_file = ('/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/'
                   'clean_data_cohort' + str(cohort) + '.csv')
    ml.print_to_csv(df, return_file)
    return df
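# NOTE: for reference, pd.get_dummies with dummy_na=True adds an explicit
# NaN indicator column per encoded variable. A minimal sketch of what the
# encoding above produces, on a made-up two-row frame (not project data):
import pandas as pd
import numpy as np

toy = pd.DataFrame({'g6_school_id': ['101', np.nan]})
print pd.get_dummies(toy, dummy_na=True)
#    g6_school_id_101  g6_school_id_nan
# 0                 1                 0
# 1                 0                 1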
def impute_data(df, cohort):
    if isinstance(df, str):
        df = ml.read_data(df)

    #########################
    ## IMPUTE MISSING DATA ##
    #########################
    print "Imputing missing data..."

    # Reset msam columns to missing if the matching msam_nan flag == 1
    nanList = ['g6_g6msam_nan', 'g7_g7msam_nan', 'g8_g8msam_nan',
               'g9_g8msam_nan']
    varList = [['g6_g6msam_Advanced', 'g6_g6msam_Basic', 'g6_g6msam_Proficient'],
               ['g7_g7msam_Advanced', 'g7_g7msam_Basic', 'g7_g7msam_Proficient'],
               ['g8_g8msam_Advanced', 'g8_g8msam_Basic', 'g8_g8msam_Proficient'],
               ['g9_g8msam_Advanced', 'g9_g8msam_Basic', 'g9_g8msam_Proficient']]
    for nacol, colList in zip(nanList, varList):
        for col in colList:
            df.loc[df[nacol] == 1, col] = np.nan

    # Predict missing data from whatever same-variable data is available:
    # fill each student's missing values with the row mean of the matching
    # columns across grades.
    wordList = ['absrate', 'mapr', 'msam_Advanced', 'msam_Basic',
                'msam_Proficient', 'mobility', 'nsusp', 'mpa', 'tardyr',
                'psatm', 'psatv', 'retained']
    for word in wordList:
        colList = [col for col in df.columns if word in col]
        rowMean = df[colList].mean(axis=1)
        for col in colList:
            # Assign back directly; fillna(inplace=True) on a .loc slice
            # can silently fail to update the underlying frame.
            df[col] = df[col].fillna(rowMean)

    return_file = ('/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/'
                   'imputed_data_cohort' + str(cohort) + '.csv')
    ml.print_to_csv(df, return_file)
    print "Done!"
    return df
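# NOTE: a minimal sketch of the row-mean fill above, on a hypothetical
# absence-rate frame: a student missing one grade's rate gets the mean of
# their observed rates from the other grades.
import pandas as pd
import numpy as np

toy = pd.DataFrame({'g6_absrate': [0.10, 0.30],
                    'g7_absrate': [np.nan, 0.20],
                    'g8_absrate': [0.20, np.nan]})
row_mean = toy.mean(axis=1)          # per-student mean of observed values
for col in toy.columns:
    toy[col] = toy[col].fillna(row_mean)
print toy
# Student 0's g7_absrate becomes (0.10 + 0.20) / 2 = 0.15;
# student 1's g8_absrate becomes (0.30 + 0.20) / 2 = 0.25.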
def choose_columns(df, grade):
    print "Choosing data..."
    if isinstance(df, str):
        df = ml.read_data(df)

    # Keep only columns from grades before the prediction grade: drop any
    # column prefixed g<grade> through g12, plus 'Unnamed' index artifacts.
    # (The original loop added a column if it failed to match *some* prefix,
    # which kept nearly everything; the filter below excludes a column that
    # matches *any* prefix. It also avoids popping from a list while
    # enumerating it, which skips elements.)
    print "Choosing columns..."
    prefixes = tuple('g' + str(i) for i in range(grade, 13))
    cols_to_use = [col for col in df.columns
                   if not col.startswith(prefixes)
                   and not col.startswith('Unnamed')]
    y = 'g' + str(grade) + '_dropout'
    return cols_to_use, y
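# NOTE: a small usage sketch with hypothetical column names: predicting
# grade-12 dropout keeps only pre-grade-12 features and returns the
# 'g12_dropout' label column separately.
import pandas as pd

df = pd.DataFrame(columns=['id', 'g11_absrate', 'g12_absrate',
                           'g12_dropout', 'Unnamed: 0'])
cols_to_use, y = choose_columns(df, 12)
print cols_to_use   # ['id', 'g11_absrate']
print y             # 'g12_dropout'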
def deal_with_dummies(dataset):
    df = ml.read_data(dataset)

    ###################################
    ## CREATE DUMMY VARIABLE COLUMNS ##
    ###################################
    print "Creating dummy variables..."
    string_cols = list(df.select_dtypes(include=['object']))
    print string_cols
    df = ml.get_dummys(df, string_cols, dummy_na=True)
    # Drop the original string columns now that they are encoded
    df.drop(string_cols, axis=1, inplace=True)

    ## Save clean version
    ml.print_to_csv(df, 'data/clean_data.csv')
def run_classifiers(csv_file, y):
    ## LOAD PREPARED DATA
    df = ml.read_data(csv_file)

    ################################
    # Build & Evaluate Classifiers #
    ################################
    print "Evaluating classifiers..."

    ## USE TOP FEATURES TO COMPARE CLASSIFIER PERFORMANCE
    features = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
                'MonthlyIncome', 'age', 'NumberOfTimes90DaysLate',
                'NumberOfOpenCreditLinesAndLoans']
    X = df[features].as_matrix()
    y = df[y].as_matrix()
    ml.print_to_csv(ml.build_classifiers(X, y), 'compare_classifiers.csv')
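# NOTE: ml.build_classifiers lives in the author's ml module, which isn't
# shown here. The following is a hypothetical sketch of what such a
# comparison helper typically does; the classifier list, 5-fold CV, and
# accuracy scoring are assumptions, not the actual implementation.
import pandas as pd
from sklearn.cross_validation import cross_val_score  # sklearn of this era
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

def build_classifiers(X, y):
    models = {'logistic_regression': LogisticRegression(),
              'decision_tree': DecisionTreeClassifier(),
              'knn': KNeighborsClassifier()}
    rows = []
    for name, model in models.items():
        scores = cross_val_score(model, X, y, cv=5)
        rows.append({'classifier': name, 'mean_accuracy': scores.mean()})
    return pd.DataFrame(rows)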
def summarize_data(dataset):
    ###############
    ## LOAD DATA ##
    ###############
    print "Loading data..."
    df = ml.read_data(dataset)
    variables = list(df.columns.values)

    ####################################
    ## RUN INITIAL SUMMARY STATISTICS ##
    ####################################
    print "Running summary statistics..."
    ml.summarize_dataset(dataset)
    for v in variables:
        ml.summary_statistics(v, dataset, 5, 10)
    return df
'''
CAPP30254 HW5
Xuan Bu
Run Pipeline
'''

import pipeline as pl
import classifiers as clf
import evaluation as el
import temporal_validation as tv
import pandas as pd

### Step 1: Read Data
df = pl.read_data('projects_2012_2013.csv')

### Step 2: Explore Data
continuous_vars = ['total_price_including_optional_support', 'students_reached']
categorical_vars = ['teacher_prefix', 'school_metro', 'school_charter',
                    'school_magnet', 'primary_focus_subject',
                    'primary_focus_area', 'secondary_focus_subject',
                    'secondary_focus_area', 'resource_type', 'poverty_level',
                    'grade_level', 'eligible_double_your_impact_match']
pl.summary_continuous_vars(df, continuous_vars)
for cat in categorical_vars:
    print(pl.summary_categorical_vars(df, cat))
pl.generate_graph(df, continuous_vars)
pl.generate_corr_graph(df)
outliers = pl.count_outliers(df, continuous_vars)
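# NOTE: the pl.* helpers are defined in the accompanying pipeline module,
# which isn't shown. A rough sketch of the kind of table
# summary_categorical_vars presumably returns (an assumed implementation,
# not the author's code):
import pandas as pd

def summary_categorical_vars(df, col):
    # Counts and shares per category, with missing values shown explicitly.
    counts = df[col].value_counts(dropna=False)
    return pd.DataFrame({'count': counts, 'share': counts / len(df)})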
# summarize_data(df)
# summarize_data(test)

## CLEAN DATA
# print "Cleaning Cohort 1..."
# predummy_cohort1 = clean_data(df, 1)
# print "Cleaning Cohort 2..."
# predummy_cohort2 = clean_data(test, 2)
# clean_cohort1 = deal_with_dummies(predummy_cohort1, 1)
# clean_cohort2 = deal_with_dummies(predummy_cohort2, 2)

## TRAINING DATA: CHOOSE SUBSET
clean_cohort1 = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data_cohort1.csv'
df = ml.read_data(clean_cohort1)
cols_to_use, y = choose_columns(df, 12)
rows = choose_rows(df, 12)

## TRAINING DATA: IMPUTATION
X = impute_data(rows[cols_to_use], 1)

## TRAINING DATA: START K-FOLD WITH CORRECT DATA
# imputed_dataset = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data.csv'
# df = ml.read_data(imputed_dataset)
import pandas as pd
import numpy as np
import matplotlib
import statsmodels.api as sm
import os
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
import pipeline as pipe

if __name__ == '__main__':
    train = pipe.read_data('cs-training.csv')
    train = pipe.fill_missing(train)
    original_features = train.columns[1:]
    response_var = train.columns[0]

    # Trim the heavy-tailed features at the given percentiles
    heavy_tail_club = ['revolving_utilization_of_unsecured_lines',
                       'debt_ratio', 'monthly_income',
                       'number_of_time30-59_days_past_due_not_worse']
    heavy_tail_club_cutoffs = [99, 90, 90, 90]
    train = pipe.trim_tails(train, heavy_tail_club, heavy_tail_club_cutoffs)
    pipe.explore_data(train[heavy_tail_club])

    # Replace implausible zero ages with the median age, then rescale
    train = pipe.replace_value(train, ['age'], 0, np.median(train['age']))
    train = pipe.robust_scale_data(train, original_features)
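# NOTE: pipe.trim_tails is defined in the pipeline module, which isn't
# shown. A plausible sketch of the tail-trimming step; percentile capping
# (rather than row dropping) is an assumption about the author's version.
import numpy as np

def trim_tails(df, columns, cutoffs):
    # Cap each listed column at its given upper percentile so extreme
    # tails don't dominate the later scaling step.
    for col, pct in zip(columns, cutoffs):
        cap = np.percentile(df[col].dropna(), pct)
        df[col] = df[col].clip(upper=cap)
    return df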
def impute_data(dataset, cohort):
    df = ml.read_data(dataset)

    ##########################
    ## IMPUTE ACADEMIC DATA ##
    ##########################
    print "Imputing missing academic information..."

    ## Fill missing school data -- use mean imputation for now
    school_vars = ['g6_school_id', 'g7_school_id', 'g8_school_id',
                   'g9_school_id', 'g10_school_id', 'g11_school_id',
                   'g12_school_id']
    ml.replace_with_mean(df, school_vars)

    ## Fill missing grade and test score information -- mean imputation for now
    grades_tests = ['g6_q1mpa', 'g6_q2mpa', 'g6_q3mpa', 'g6_q4mpa', 'g6_g6mapr',
                    'g7_q1mpa', 'g7_q2mpa', 'g7_q3mpa', 'g7_q4mpa', 'g7_g7mapr',
                    'g8_q1mpa', 'g8_q2mpa', 'g8_q3mpa', 'g8_q4mpa', 'g8_g8mapr',
                    'g9_q1mpa', 'g9_q2mpa', 'g9_q3mpa', 'g9_q4mpa', 'g9_g8mapr',
                    'g10_q1mpa', 'g10_q2mpa', 'g10_q3mpa', 'g10_q4mpa',
                    'g10_psatv', 'g10_psatm',
                    'g11_q1mpa', 'g11_q2mpa', 'g11_q3mpa', 'g11_q4mpa',
                    'g11_psatv', 'g11_psatm',
                    'g12_q1mpa', 'g12_q2mpa', 'g12_q3mpa', 'g12_q4mpa',
                    'g12_psatv', 'g12_psatm']
    ml.replace_with_mean(df, grades_tests)

    ## Fill in missing id with dummy
    ml.replace_with_value(df, 'id', 0)

    ## Fill missing MSAM data
    g6_msam = ['g6_g6msam_Advanced', 'g6_g6msam_Basic', 'g6_g6msam_Proficient']
    ml.replace_dummy_null_mean(df, 'g6_g6msam_nan', g6_msam)
    if cohort == 'cohort1':
        g7_msam = ['g7_g7msam_Advanced', 'g7_g7msam_Basic',
                   'g7_g7msam_Proficient']
        ml.replace_dummy_null_mean(df, 'g7_g7msam_nan', g7_msam)
    elif cohort == 'cohort2':
        g7_msam = ['g7_g7msam_ ', 'g7_g7msam_1', 'g7_g7msam_2', 'g7_g7msam_3']
        ml.replace_dummy_null_mean(df, 'g7_g7msam_nan', g7_msam)
    g8_msam = ['g8_g8msam_Advanced', 'g8_g8msam_Basic', 'g8_g8msam_Proficient']
    ml.replace_dummy_null_mean(df, 'g8_g8msam_nan', g8_msam)
    g9_msam = ['g9_g8msam_Advanced', 'g9_g8msam_Basic', 'g9_g8msam_Proficient']
    ml.replace_dummy_null_mean(df, 'g9_g8msam_nan', g9_msam)

    ############################
    ## IMPUTE BEHAVIORAL DATA ##
    ############################
    print "Imputing missing behavioral data..."

    ## Fill missing behavioral data -- use mean imputation for now
    behavioral_cols = ['g6_absrate', 'g6_nsusp', 'g7_absrate', 'g7_tardyr',
                       'g7_nsusp', 'g8_absrate', 'g8_tardyr', 'g8_nsusp',
                       'g9_absrate', 'g9_nsusp', 'g10_absrate', 'g10_nsusp',
                       'g11_absrate', 'g11_nsusp', 'g12_absrate', 'g12_nsusp']
    ml.replace_with_mean(df, behavioral_cols)

    ############################
    ## IMPUTE ENROLLMENT DATA ##
    ############################
    print "Imputing missing enrollment data..."
    print "Fixing mobility columns..."
    # Includes g10_retained because it's coded as 0/1 already
    mobility_cols = ['g10_retained', 'g6_mobility', 'g7_mobility',
                     'g8_mobility', 'g9_mobility', 'g9_retained',
                     'g10_mobility', 'g11_mobility', 'g12_mobility',
                     'birthday']
    ml.replace_with_mean(df, mobility_cols)

    #########################
    ## IMPUTE DROPOUT DATA ##
    #########################
    print "Imputing missing dropout information..."

    ## Fill missing dropout information with 0
    dropout_vars = ['g6_dropout', 'g7_dropout', 'g8_dropout', 'g9_dropout',
                    'g10_dropout', 'g11_dropout', 'g12_dropout', 'dropout']
    ml.replace_with_value(df, dropout_vars, [0, 0, 0, 0, 0, 0, 0, 0])

    ############################
    # IMPUTE NEIGHBORHOOD DATA #
    ############################
    print "Imputing missing school neighborhood data..."
    print "Fixing neighborhood columns..."
""" neighborhood_cols = ['suspensionrate', 'mobilityrateentrantswithdra', 'attendancerate', 'avg_class_size', 'studentinstructionalstaffratio', 'dropoutrate', 'grade12documenteddecisionco', 'grade12documenteddecisionem', 'grade12documenteddecisionmi', 'grad12docdec_col_emp', 'graduationrate', 'studentsmeetinguniversitysyste', 'Est_Households_2012', 'Est_Population_2012', 'Med_Household_Income_2012', 'Mean_Household_Income_2012', 'Pop_Below_Poverty_2012', 'Percent_Below_Poverty_2012', 'Pop_Under18_2012', 'Under18_Below_Poverty_2012', 'Under18_Below_Poverty_Percent_2012', 'Housholds_on_Food_stamps_with_Children_Under18_2012', 'Housholds_Pop_on_Food_Stamps_2012', 'Pop_BlackAA_2012', 'Pop_White_2012', 'Bt_18_24_percent_less_than_High_School_2012', 'Bt_18_24_percent_High_School_2012', 'Bt_18_24_percent_Some_College_or_AA_2012', 'Bt_1824_percent_BA_or_Higher_2012', 'Over_25_percent_less_than_9th_grade_2012', 'Over_25_percent_9th_12th_2012', 'Over_25_percent_High_School_2012', 'Over_25__percent_Some_College_No_Deg_2012', 'Over_25_percent_AA_2012', 'Over_25_percent_Bachelors_2012', 'Over_25_percent_Graduate_or_Professionals_2012'] """ neighborhood_cols = ['g9_suspensionrate', 'g10_suspensionrate', 'g11_suspensionrate', 'g12_suspensionrate', 'g9_mobilityrateentrantswithdra', 'g10_mobilityrateentrantswithdra', 'g11_mobilityrateentrantswithdra', 'g12_mobilityrateentrantswithdra', 'g9_attendancerate', 'g10_attendancerate', 'g11_attendancerate', 'g12_attendancerate','g9_avg_class_size', 'g10_avg_class_size', 'g11_avg_class_size', 'g12_avg_class_size','g9_studentinstructionalstaffratio', 'g10_studentinstructionalstaffratio', 'g11_studentinstructionalstaffratio', 'g12_studentinstructionalstaffratio','g9_dropoutrate', 'g10_dropoutrate', 'g11_dropoutrate', 'g12_dropoutrate', 'g9_grade12documenteddecisionco', 'g10_grade12documenteddecisionco', 'g11_grade12documenteddecisionco', 'g12_grade12documenteddecisionco','g9_grade12documenteddecisionem', 'g10_grade12documenteddecisionem', 'g11_grade12documenteddecisionem', 'g12_grade12documenteddecisionem','g9_grade12documenteddecisionmi', 'g10_grade12documenteddecisionmi', 'g11_grade12documenteddecisionmi', 'g12_grade12documenteddecisionmi', 'g9_grad12docdec_col_emp', 'g10_grad12docdec_col_emp', 'g11_grad12docdec_col_emp', 'g12_grad12docdec_col_emp', 'g9_graduationrate', 'g10_graduationrate', 'g11_graduationrate', 'g12_graduationrate','g9_studentsmeetinguniversitysyste', 'g10_studentsmeetinguniversitysyste', 'g11_studentsmeetinguniversitysyste', 'g12_studentsmeetinguniversitysyste', 'g9_Est_Households_2012', 'g10_Est_Households_2012', 'g11_Est_Households_2012', 'g12_Est_Households_2012','g9_Est_Population_2012', 'g10_Est_Population_2012', 'g11_Est_Population_2012', 'g12_Est_Population_2012', 'g9_Med_Household_Income_2012', 'g10_Med_Household_Income_2012', 'g11_Med_Household_Income_2012', 'g12_Med_Household_Income_2012', 'g9_Mean_Household_Income_2012', 'g10_Mean_Household_Income_2012', 'g11_Mean_Household_Income_2012', 'g12_Mean_Household_Income_2012', 'g9_Pop_Below_Poverty_2012', 'g10_Pop_Below_Poverty_2012', 'g11_Pop_Below_Poverty_2012', 'g12_Pop_Below_Poverty_2012', 'g9_Percent_Below_Poverty_2012', 'g10_Percent_Below_Poverty_2012', 'g11_Percent_Below_Poverty_2012', 'g12_Percent_Below_Poverty_2012', 'g9_Pop_Under18_2012', 'g10_Pop_Under18_2012', 'g11_Pop_Under18_2012', 'g12_Pop_Under18_2012', 'g9_Under18_Below_Poverty_2012', 'g10_Under18_Below_Poverty_2012', 'g11_Under18_Below_Poverty_2012', 'g12_Under18_Below_Poverty_2012', 
'g9_Under18_Below_Poverty_Percent_2012', 'g10_Under18_Below_Poverty_Percent_2012', 'g11_Under18_Below_Poverty_Percent_2012', 'g12_Under18_Below_Poverty_Percent_2012', 'g9_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g10_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g11_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g12_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g9_Housholds_Pop_on_Food_Stamps_2012', 'g10_Housholds_Pop_on_Food_Stamps_2012', 'g11_Housholds_Pop_on_Food_Stamps_2012', 'g12_Housholds_Pop_on_Food_Stamps_2012', 'g9_Pop_BlackAA_2012', 'g10_Pop_BlackAA_2012', 'g11_Pop_BlackAA_2012', 'g12_Pop_BlackAA_2012', 'g9_Pop_White_2012', 'g10_Pop_White_2012', 'g11_Pop_White_2012', 'g12_Pop_White_2012', 'g9_Bt_18_24_percent_less_than_High_School_2012', 'g10_Bt_18_24_percent_less_than_High_School_2012', 'g11_Bt_18_24_percent_less_than_High_School_2012', 'g12_Bt_18_24_percent_less_than_High_School_2012', 'g9_Bt_18_24_percent_High_School_2012', 'g10_Bt_18_24_percent_High_School_2012', 'g11_Bt_18_24_percent_High_School_2012', 'g12_Bt_18_24_percent_High_School_2012', 'g9_Bt_18_24_percent_Some_College_or_AA_2012', 'g10_Bt_18_24_percent_Some_College_or_AA_2012', 'g11_Bt_18_24_percent_Some_College_or_AA_2012', 'g12_Bt_18_24_percent_Some_College_or_AA_2012', 'g9_Bt_1824_percent_BA_or_Higher_2012', 'g10_Bt_1824_percent_BA_or_Higher_2012', 'g11_Bt_1824_percent_BA_or_Higher_2012', 'g12_Bt_1824_percent_BA_or_Higher_2012', 'g9_Over_25_percent_less_than_9th_grade_2012', 'g10_Over_25_percent_less_than_9th_grade_2012', 'g11_Over_25_percent_less_than_9th_grade_2012', 'g12_Over_25_percent_less_than_9th_grade_2012', 'g9_Over_25_percent_9th_12th_2012', 'g10_Over_25_percent_9th_12th_2012', 'g11_Over_25_percent_9th_12th_2012', 'g12_Over_25_percent_9th_12th_2012', 'g9_Over_25_percent_High_School_2012', 'g10_Over_25_percent_High_School_2012', 'g11_Over_25_percent_High_School_2012', 'g12_Over_25_percent_High_School_2012', 'g9_Over_25__percent_Some_College_No_Deg_2012', 'g10_Over_25__percent_Some_College_No_Deg_2012', 'g11_Over_25__percent_Some_College_No_Deg_2012', 'g12_Over_25__percent_Some_College_No_Deg_2012', 'g9_Over_25_percent_AA_2012', 'g10_Over_25_percent_AA_2012', 'g11_Over_25_percent_AA_2012', 'g12_Over_25_percent_AA_2012', 'g9_Over_25_percent_Bachelors_2012', 'g10_Over_25_percent_Bachelors_2012', 'g11_Over_25_percent_Bachelors_2012', 'g12_Over_25_percent_Bachelors_2012', 'g9_Over_25_percent_Graduate_or_Professionals_2012', 'g10_Over_25_percent_Graduate_or_Professionals_2012', 'g11_Over_25_percent_Graduate_or_Professionals_2012', 'g12_Over_25_percent_Graduate_or_Professionals_2012'] ml.replace_with_mean(df, neighborhood_cols) summary = ml.summarize(df) print summary.T #ml.print_to_csv(summary.T, 'updated_summary_stats_vertical.csv') ml.print_to_csv(df, 'data/imputed_data.csv') #ml.print_to_csv(df, '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data.csv') print "Done!"
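# NOTE: the ml imputation helpers used throughout aren't shown in this
# file. A minimal sketch of what replace_with_mean and replace_with_value
# presumably do, inferred from how they're called above (an assumption,
# not the actual module; the real replace_with_value evidently also
# accepts a single column name, as in the 'id' call, which this sketch
# does not handle).
def replace_with_mean(df, columns):
    # Fill each column's missing values with that column's mean.
    for col in columns:
        df[col] = df[col].fillna(df[col].mean())

def replace_with_value(df, columns, values):
    # Fill each column's missing values with the paired constant.
    for col, val in zip(columns, values):
        df[col] = df[col].fillna(val)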
#-------------------------------------------------------
if __name__ == '__main__':
    dataset = "data/cohort1_all_school.csv"
    #dataset = "data/cohort2_all_school.csv"
    #dataset = "/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/cohort1_all_school.csv"
    #df = summarize_data(dataset)
    df = ml.read_data(dataset)

    #clean_data(df, 'cohort1')
    #clean_data(df, 'cohort2')

    #non_dummy_data = 'data/predummy_data.csv'
    #non_dummy_data = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/predummy_data.csv'
    #deal_with_dummies(non_dummy_data)

    clean_dataset = 'data/clean_data.csv'
    #clean_dataset = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data.csv'
    impute_data(clean_dataset, 'cohort1')
    #impute_data(clean_dataset, 'cohort2')
import pipeline as pl

df = pl.read_data('credit-data.csv')
training, testing, targets = pl.split_testing(df, 0.2, 'SeriousDlqin2yrs')
training = pl.fill_empty(training)
testing = pl.fill_empty(testing)

pl.explore_data(training)
pl.scatter_data(training, 'SeriousDlqin2yrs')
pl.scatter_data(training, 'MonthlyIncome')

# Bucket age into decades and one-hot encode the buckets, applying the
# same transformation to the training and testing sets.
age_names = ['20s', '30s', '40s', '50s', '60s', '70s', '80s', '90s', '100s']
pl.bucket_continuous(training, 'age', 'age_cat', bins=9, names=age_names)
pl.dummy_categories(training, 'age_cat')
pl.bucket_continuous(testing, 'age', 'age_cat', bins=9, names=age_names)
pl.dummy_categories(testing, 'age_cat')

features = [
    'RevolvingUtilizationOfUnsecuredLines', 'age',
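# NOTE: bucket_continuous and dummy_categories live in the pipeline
# module, which isn't shown. A plausible sketch using pandas primitives
# (an assumed implementation, not the author's):
import pandas as pd

def bucket_continuous(df, col, new_col, bins, names):
    # Cut the continuous column into equal-width bins with given labels.
    df[new_col] = pd.cut(df[col], bins=bins, labels=names)

def dummy_categories(df, col):
    # One-hot encode the bucketed column in place.
    dummies = pd.get_dummies(df[col], prefix=col)
    for name in dummies.columns:
        df[name] = dummies[name]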
import pickle
from pipeline import read_data, TRAIN_STATS, process_pipeline, fit_pipeline

data = read_data('static/dat/br-raw_2007-2018.csv')
totals = read_data('static/dat/totals.csv')
spreads = read_data('static/dat/spreads.csv')
moneylines = read_data('static/dat/moneylines.csv')

# Attach spreads and totals by team/date, then moneylines twice: once for
# the home team and once for the visitor, distinguished by suffixes.
data = data.merge(spreads[['abrv', 'date', 'spread']], how='outer',
                  left_on=['team', 'date'], right_on=['abrv', 'date'])
data = data.merge(totals[['abrv', 'date', 'total']], how='outer',
                  left_on=['team', 'date'], right_on=['abrv', 'date'])
data = data.merge(moneylines[['abrv', 'date', 'moneylines']], how='outer',
                  left_on=['home_abrv', 'date'], right_on=['abrv', 'date'])
data = data.merge(moneylines[['abrv', 'date', 'moneylines']], how='outer',
                  left_on=['visitor_abrv', 'date'], right_on=['abrv', 'date'],
                  suffixes=['_home', '_away'])

p = process_pipeline().transform(data)
train = p.dropna(subset=TRAIN_STATS + [
    'spread_cover', 'spread', 'total_cover', 'total', 'moneylines_home',
    'moneylines_away'
def prepare_data(dataset):
    #######################################################
    # Load Credit Data and Run Initial Summary Statistics #
    #######################################################
    print "Loading data..."

    ## LOAD DATA
    df = ml.read_data(dataset)
    variables = list(df.columns.values)

    ## RUN INITIAL SUMMARY STATISTICS & GRAPH DISTRIBUTIONS
    summary = ml.summarize(df)
    #print_to_csv(summary, 'summary_stats.csv')
    for v in variables:
        ml.histogram(df, v)
    ## FOR FUTURE: Drop rows where 'percentage' fields have values > 1

    ############################
    # Deal with missing values #
    ############################
    print "Handling missing values..."

    print "Correcting dependents column..."
    '''DEPENDENTS: Missing values are likely zeros. If someone didn't
    provide this info, they likely don't have kids.'''
    variables = ['NumberOfDependents']
    values = [0]
    ml.replace_with_value(df, variables, values)

    print "Correcting income column..."
    '''MONTHLY INCOME: It wouldn't make sense to fill missing values with
    a specific constant. Instead, impute null values with the mean income.'''
    variables = ['MonthlyIncome']
    ml.replace_with_mean(df, variables)

    #####################
    # Generate Features #
    #####################
    print "Generating features..."

    ## FIND IMPORTANT FEATURES
    test_features = np.array(['RevolvingUtilizationOfUnsecuredLines', 'age',
                              'NumberOfTime30-59DaysPastDueNotWorse',
                              'DebtRatio', 'MonthlyIncome',
                              'NumberOfOpenCreditLinesAndLoans',
                              'NumberOfTimes90DaysLate',
                              'NumberRealEstateLoansOrLines',
                              'NumberOfTime60-89DaysPastDueNotWorse',
                              'NumberOfDependents'])
    y = 'SeriousDlqin2yrs'
    ## Find initial best features
    #print ml.find_features(df, test_features, y)

    ## ENGINEER ADDITIONAL FEATURES
    print "Engineering income buckets..."
    '''MONTHLY INCOME: Break this into buckets, capping outliers.'''
    df['MonthlyIncome_adjust'] = df.MonthlyIncome.apply(
        lambda x: ml.adjust_outliers(x, 15000))
    ml.bin_variable(df, 'MonthlyIncome_adjust', 15, False)

    print "Engineering age buckets..."
    '''AGE: Break this into buckets.'''
    bins = [-1] + range(20, 80, 5) + [120]
    ml.bin_variable(df, 'age', bins, False)

    ## RECALCULATE IMPORTANT FEATURES
    new_features = np.array(['MonthlyIncome_adjust_bins', 'age_bins'])
    all_features = np.hstack((test_features, new_features))

    ## FIND BEST FEATURES
    #print ml.find_features(df, all_features, y)

    ### FOR FUTURE: It would be useful to automatically surface the top
    ### five features, or those meeting a threshold, and return them for
    ### the run_classifiers function as well.

    ## PRINT PREPARED DATA TO CSV
    file_name = "credit-data-clean.csv"
    ml.print_to_csv(df, file_name)
    return file_name, y
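# NOTE: ml.adjust_outliers and ml.bin_variable aren't shown. A plausible
# sketch inferred from how they're called above; both implementations are
# assumptions, not the actual ml module.
import pandas as pd

def adjust_outliers(x, cap):
    # Cap a single value at the given ceiling (15000 above).
    return min(x, cap)

def bin_variable(df, col, bins, labels):
    # Cut the column into bins and store the result in '<col>_bins'.
    # `bins` may be a count (equal-width, as for MonthlyIncome_adjust)
    # or explicit edges (as for age); labels=False yields integer codes.
    df[col + '_bins'] = pd.cut(df[col], bins=bins, labels=labels)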
'''
CAPP30254 HW3
Xuan Bu
Run Pipeline
'''

import pipeline as pl
import classifiers as clf
from sklearn import metrics
import pandas as pd

# Step 1: Read Data
cols_to_drop = ['teacher_prefix', 'teacher_acctid', 'schoolid',
                'school_ncesid', 'school_latitude', 'school_longitude',
                'school_city', 'school_state', 'school_district',
                'school_county']
df = pl.read_data('../data/projects_2012_2013.csv', cols_to_drop)

# Step 2: Explore Data
continuous_vars = ['total_price_including_optional_support', 'students_reached']
categorical_vars = ['school_metro', 'school_charter', 'school_magnet',
                    'primary_focus_subject', 'primary_focus_area',
                    'secondary_focus_subject', 'secondary_focus_area',
                    'resource_type', 'poverty_level', 'grade_level',
                    'eligible_double_your_impact_match']
pl.summary_continuous_vars(df, continuous_vars)
for cat in categorical_vars:
    print(pl.summary_categorical_vars(df, cat))
pl.generate_graph(df, continuous_vars)
pl.generate_corr_graph(df)
import pandas as pd
import numpy as np
import matplotlib
import statsmodels.api as sm
import os
from matplotlib import pyplot as plt
import seaborn as sns
import pipeline as pipe

if __name__ == '__main__':
    # import data
    train = pipe.read_data('cs-training.csv')
    test = pipe.read_data('cs-test.csv')

    # explore data
    if 'plots' not in os.listdir():
        os.mkdir('plots')
    os.chdir('plots')
    pipe.explore_data(train, False, 'train')
    os.chdir('..')

    # process data (skip the response column in the test set)
    train = pipe.process_data(train)
    test.iloc[:, 1:] = pipe.process_data(test.iloc[:, 1:])

    heavy_tail_club = ['revolving_utilization_of_unsecured_lines',