def deal_with_dummies(df, cohort):

    if isinstance(df, str):
        df = ml.read_data(df)
    
    ###################################
    ## CREATE DUMMY VARIABLE COLUMNS ##
    ###################################
    print "Creating dummy variables..."

    school_ids = [col for col in df.columns if 'school_id' in col]
    df[school_ids] = df.loc[:,school_ids].astype(str, copy=False)

    string_cols = list(df.select_dtypes(include=['object']))
    
    dummys = pd.get_dummies(df[string_cols], dummy_na=True)
    df = pd.concat([df, dummys], axis=1)
    
    df.drop(string_cols, axis=1, inplace=True)

    ## Save clean version
    return_file = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data_cohort' + str(cohort) + '.csv'
    ml.print_to_csv(df, return_file)

    return df
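
The pattern above — cast school IDs to strings, one-hot encode every object column with pd.get_dummies(dummy_na=True), then drop the originals — can be checked without the project's ml helper module (the snippets assume module-level imports of pandas as pd, numpy as np, and ml). A minimal standalone sketch on a toy frame with made-up column names:

import pandas as pd

toy = pd.DataFrame({'g6_school_id': [101, 102, 101],
                    'g6_absrate': [0.02, 0.10, 0.05]})

# treat ids as categories, then one-hot encode with an explicit NaN indicator
toy['g6_school_id'] = toy['g6_school_id'].astype(str)
string_cols = list(toy.select_dtypes(include=['object']))
dummies = pd.get_dummies(toy[string_cols], dummy_na=True)
toy = pd.concat([toy, dummies], axis=1).drop(string_cols, axis=1)
print(toy.columns.tolist())
# ['g6_absrate', 'g6_school_id_101', 'g6_school_id_102', 'g6_school_id_nan']
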
def impute_data(df, cohort):

    #import IPython
    #IPython.embed()

    if isinstance(df, str):
        df = ml.read_data(df)

    #########################
    ## IMPUTE MISSING DATA ##
    #########################
    print "Imputing missing data..."

    # change msam to missing if msam_NA == 1
    nanList = [
        'g6_g6msam_nan', 'g7_g7msam_nan', 'g8_g8msam_nan', 'g9_g8msam_nan'
    ]
    varList = [
        ['g6_g6msam_Advanced', 'g6_g6msam_Basic', 'g6_g6msam_Proficient'],
        ['g7_g7msam_Advanced', 'g7_g7msam_Basic', 'g7_g7msam_Proficient'],
        ['g8_g8msam_Advanced', 'g8_g8msam_Basic', 'g8_g8msam_Proficient'],
        ['g9_g8msam_Advanced', 'g9_g8msam_Basic', 'g9_g8msam_Proficient']
    ]
    for nacol, colList in zip(nanList, varList):
        for col in colList:
            df.loc[df[nacol] == 1, col] = np.nan


    # predict missing values using the row mean of any available
    # related columns for the same measure
    wordList = [
        'absrate', 'mapr', 'msam_Advanced', 'msam_Basic', 'msam_Proficient',
        'mobility', 'nsusp', 'mpa', 'tardyr', 'psatm', 'psatv', 'retained'
    ]
    for word in wordList:
        colList = [col for col in df.columns if word in col]
        rowMean = df[colList].mean(axis=1)
        for col in colList:
            print df[col].value_counts(dropna=False)
            # assign back rather than fillna(inplace=True) on a slice,
            # which may not write through to df
            df[col] = df[col].fillna(rowMean)
            print df[col].value_counts(dropna=False)


    '''
    ############################
    # IMPUTE NEIGHBORHOOD DATA #
    ############################

    print "Imputing missing school neighborhood data..."

    ## Fill missing school neighborhood data
    print "Fixing neighborhood columns..."
    neighborhood_cols = ['suspensionrate',  'mobilityrateentrantswithdra',  'attendancerate',   'avg_class_size',   'studentinstructionalstaffratio',   'dropoutrate',  'grade12documenteddecisionco',  'grade12documenteddecisionem',  'grade12documenteddecisionmi',  'grad12docdec_col_emp', 'graduationrate',   'studentsmeetinguniversitysyste',   'Est_Households_2012',  'Est_Population_2012',  'Med_Household_Income_2012',    'Mean_Household_Income_2012',   'Pop_Below_Poverty_2012',   'Percent_Below_Poverty_2012',   'Pop_Under18_2012', 'Under18_Below_Poverty_2012',   'Under18_Below_Poverty_Percent_2012',   'Housholds_on_Food_stamps_with_Children_Under18_2012',  'Housholds_Pop_on_Food_Stamps_2012',    'Pop_BlackAA_2012', 'Pop_White_2012',   'Bt_18_24_percent_less_than_High_School_2012',  'Bt_18_24_percent_High_School_2012',    'Bt_18_24_percent_Some_College_or_AA_2012', 'Bt_1824_percent_BA_or_Higher_2012',    'Over_25_percent_less_than_9th_grade_2012', 'Over_25_percent_9th_12th_2012',    'Over_25_percent_High_School_2012', 'Over_25__percent_Some_College_No_Deg_2012',    'Over_25_percent_AA_2012',  'Over_25_percent_Bachelors_2012',   'Over_25_percent_Graduate_or_Professionals_2012']
    ml.replace_with_mean(df, neighborhood_cols)
    '''

    #summary = ml.summarize(df)
    #print summary.T
    #ml.print_to_csv(summary.T, 'updated_summary_stats_vertical.csv')

    return_file = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data_cohort' + str(cohort) + '.csv'
    ml.print_to_csv(df, return_file)

    #IPython.embed()

    print "Done!"
    import IPython
    IPython.embed()
    return df
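
The row-mean fill above imputes a student's missing value for a measure from that same student's values in other grades. A self-contained sketch of the idea, assuming only pandas and numpy:

import pandas as pd
import numpy as np

df = pd.DataFrame({'g6_absrate': [0.02, np.nan, 0.30],
                   'g7_absrate': [0.04, 0.20, np.nan],
                   'g8_absrate': [np.nan, 0.40, 0.18]})

colList = [c for c in df.columns if 'absrate' in c]
rowMean = df[colList].mean(axis=1)      # per-student mean, NaNs skipped
for col in colList:
    df[col] = df[col].fillna(rowMean)   # assign back; safest write path

print(df.round(2))
#    g6_absrate  g7_absrate  g8_absrate
# 0        0.02        0.04        0.03
# 1        0.30        0.20        0.40
# 2        0.30        0.24        0.18
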
def choose_columns(df, grade):
    print "Choosing data..."

    if isinstance(df, str):
        df = ml.read_data(df)

    #Find columns to use
    print "Choosing columns..."
    all_columns = list(df.columns.values)
    cols_to_use = []

    i = grade
    prefixes = []

    while i <= 12:
        prefixes.append('g' + str(i))
        i+=1

    # keep only columns without a g{grade}..g12 prefix (presumably so
    # current- and future-grade information is not used as a predictor)
    for col in all_columns:
        if not any(col.startswith(p) for p in prefixes):
            cols_to_use.append(col)

    # rebuild the list instead of popping while enumerating,
    # which would skip the element after each removal
    cols_to_use = [col for col in cols_to_use if not col.startswith('Unnamed')]

    y = 'g' + str(grade) + '_dropout'

    return cols_to_use, y
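
With grade=12 the only excluded prefix is 'g12', so every earlier-grade column survives as a predictor of g12_dropout. The filter reduces to the comprehension below; the column list is hypothetical, for illustration only:

all_columns = ['id', 'g11_absrate', 'g12_absrate', 'g12_dropout', 'Unnamed: 0']
prefixes = ['g12']
cols_to_use = [c for c in all_columns
               if not any(c.startswith(p) for p in prefixes)
               and not c.startswith('Unnamed')]
print(cols_to_use)    # ['id', 'g11_absrate']
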
Example #4
def choose_columns(df, grade):
    print "Choosing data..."

    if isinstance(df, str):
        df = ml.read_data(df)

    #Find columns to use
    print "Choosing columns..."
    all_columns = list(df.columns.values)
    cols_to_use = []

    i = grade
    prefixes = []

    while i <= 12:
        prefixes.append('g' + str(i))
        i += 1

    # keep only columns without a g{grade}..g12 prefix (presumably so
    # current- and future-grade information is not used as a predictor)
    for col in all_columns:
        if not any(col.startswith(p) for p in prefixes):
            cols_to_use.append(col)

    # rebuild the list instead of popping while enumerating,
    # which would skip the element after each removal
    cols_to_use = [col for col in cols_to_use if not col.startswith('Unnamed')]

    y = 'g' + str(grade) + '_dropout'

    return cols_to_use, y
Example #5
def deal_with_dummies(df, cohort):

    if isinstance(df, str):
        df = ml.read_data(df)

    ###################################
    ## CREATE DUMMY VARIABLE COLUMNS ##
    ###################################
    print "Creating dummy variables..."

    school_ids = [col for col in df.columns if 'school_id' in col]
    df[school_ids] = df.loc[:, school_ids].astype(str, copy=False)

    string_cols = list(df.select_dtypes(include=['object']))

    dummys = pd.get_dummies(df[string_cols], dummy_na=True)
    df = pd.concat([df, dummys], axis=1)

    df.drop(string_cols, axis=1, inplace=True)

    ## Save clean version
    return_file = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data_cohort' + str(
        cohort) + '.csv'
    ml.print_to_csv(df, return_file)

    return df
Example #6
def deal_with_dummies(dataset):
    
    df = ml.read_data(dataset)

    ###################################
    ## CREATE DUMMY VARIABLE COLUMNS ##
    ###################################
    print "Creating dummy variables..."

    string_cols = list(df.select_dtypes(include=['object']))
    print string_cols

    df = ml.get_dummys(df, string_cols, dummy_na=True)
    for col in string_cols:
        print col
        df.drop(col, axis=1, inplace=True)

    ## Save clean version
    ml.print_to_csv(df, 'data/clean_data.csv')
Example #7
def run_classifiers(csv_file, y):

	## LOAD PREPARED DATA
	df = ml.read_data(csv_file)

	################################
	# Build & Evaluate Classifiers #
	################################
	print "Evaluating classifiers..."

	## USE TOP FEATURES TO COMPARE CLASSIFIER PERFORMANCE
	features = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
					'MonthlyIncome', 'age', 'NumberOfTimes90DaysLate',
					'NumberOfOpenCreditLinesAndLoans']

	X = df[features].values
	y = df[y].values

	#print ml.build_classifiers(X,y)
	ml.print_to_csv(ml.build_classifiers(X, y), 'compare_classifiers.csv')
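
ml.build_classifiers is the project's own helper, so its internals aren't shown here; a rough standalone stand-in using scikit-learn cross-validation could look like the sketch below (the model list and scoring metric are assumptions, not the project's choices):

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

def build_classifiers(X, y):
    # hypothetical stand-in: mean 5-fold accuracy for a few baseline models
    models = {'logistic_regression': LogisticRegression(max_iter=1000),
              'decision_tree': DecisionTreeClassifier(max_depth=5)}
    rows = [{'model': name,
             'cv_accuracy': cross_val_score(clf, X, y, cv=5).mean()}
            for name, clf in models.items()]
    return pd.DataFrame(rows)
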
Example #8
def summarize_data(dataset):

    ###############
    ## LOAD DATA ##
    ###############

    print "Loading data..."

    df = ml.read_data(dataset)
    variables = list(df.columns.values)
    #print variables

    ####################################
    ## RUN INITIAL SUMMARY STATISTICS ##
    ####################################
    print "Running summary statistics..."

    ml.summarize_dataset(dataset)
    for v in variables:
        ml.summary_statistics(v, dataset, 5, 10)

    return df
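
ml.summarize_dataset and ml.summary_statistics are also project helpers; pandas alone can produce a comparable per-variable overview. A plausible minimal equivalent, offered as an assumption rather than the project's implementation:

import pandas as pd

def summarize_dataset(df):
    # count, mean, std, quartiles per column, plus a missing-value tally
    summary = df.describe(include='all').T
    summary['n_missing'] = df.isnull().sum()
    return summary
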
Example #9
'''
CAPP30254 HW5
Xuan Bu
Run Pipeline
'''

import pipeline as pl
import classifiers as clf
import evaluation as el
import temporal_validation as tv
import pandas as pd

### Step 1: Read Data
df = pl.read_data('projects_2012_2013.csv')

### Step 2: Explore Data
continuous_vars = [
    'total_price_including_optional_support', 'students_reached'
]
categorical_vars = ['teacher_prefix', 'school_metro', 'school_charter',\
                    'school_magnet', 'primary_focus_subject', 'primary_focus_area',\
                    'secondary_focus_subject', 'secondary_focus_area',\
                    'resource_type', 'poverty_level', 'grade_level',\
                    'eligible_double_your_impact_match']

pl.summary_continuous_vars(df, continuous_vars)
for cat in categorical_vars:
    print(pl.summary_categorical_vars(df, cat))
pl.generate_graph(df, continuous_vars)
pl.generate_corr_graph(df)
outliers = pl.count_outliers(df, continuous_vars)
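
pl.generate_corr_graph presumably renders a correlation heatmap; a minimal stand-in with seaborn, under that assumption:

import matplotlib.pyplot as plt
import seaborn as sns

def generate_corr_graph(df):
    # heatmap of pairwise correlations across numeric columns
    corr = df.select_dtypes('number').corr()
    sns.heatmap(corr, cmap='coolwarm', center=0)
    plt.tight_layout()
    plt.savefig('corr_heatmap.png')
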
Example #10
#    summarize_data(df)
#    summarize_data(test)

    ## CLEAN DATA
#    print "Cleaning Cohort 1..."
#    predummy_cohort1 = clean_data(df, 1)
    
#    print "Cleaning Cohort 2..."
#    predummy_cohort2 = clean_data(test, 2)

#    clean_cohort1 = deal_with_dummies(predummy_cohort1, 1)
#    clean_cohort2 = deal_with_dummies(predummy_cohort2, 2)

    ## TRAINING DATA: CHOOSE SUBSET
    clean_cohort1 = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data_cohort1.csv'
    df = ml.read_data(clean_cohort1)
    cols_to_use, y = choose_columns(df, 12)
    rows = choose_rows(df, 12)

#    X = df[cols_to_use]
#    y = df[y]
    #import IPython
    #IPython.embed() 
    ## TRAINING DATA: IMPUTATION
#    subset = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data_cohort1.csv'
    X = impute_data(rows[cols_to_use], 1)

    #IPython.embed()
    ## TRAINING DATA: START K-FOLD WITH CORRECT DATA
#    imputed_dataset = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data.csv'
#    df = ml.read_data(imputed_dataset)
Example #11
import pandas as pd
import numpy as np
import matplotlib
import statsmodels.api as sm
import os
from matplotlib import pyplot as plt
import seaborn as sns
import pickle

import pipeline as pipe


if __name__ == '__main__':

    train = pipe.read_data('cs-training.csv')  
    train = pipe.fill_missing(train)
    original_features = train.columns[1:]
    response_var = train.columns[0]
    
    heavy_tail_club = ['revolving_utilization_of_unsecured_lines',
                       'debt_ratio','monthly_income',
                       'number_of_time30-59_days_past_due_not_worse']
    heavy_tail_club_cutoffs = [99,90,90,90]
    train = pipe.trim_tails(train,heavy_tail_club,heavy_tail_club_cutoffs)  
    pipe.explore_data(train[heavy_tail_club])
    
    train = pipe.replace_value(train,['age'],0,np.median(train['age']))
    
    train = pipe.robust_scale_data(train,original_features)
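
pipe.trim_tails and pipe.replace_value are this pipeline's own helpers. One plausible reading of trim_tails, capping each heavy-tailed column at its given upper percentile, is sketched below; the clipping rule is an assumption:

import numpy as np

def trim_tails(df, cols, cutoffs):
    # cap each column at the value of its upper-percentile cutoff
    for col, cutoff in zip(cols, cutoffs):
        cap = np.nanpercentile(df[col], cutoff)
        df[col] = df[col].clip(upper=cap)
    return df
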
    
Example #12
def impute_data(dataset, cohort):

    df = ml.read_data(dataset)

    ##########################
    ## IMPUTE ACADEMIC DATA ##
    ##########################

    print "Impute missing academic information..."

    ## Fill missing school data -- use mean imputation for now
    school_vars = ['g6_school_id', 'g7_school_id', 'g8_school_id', 'g9_school_id', 'g10_school_id', 'g11_school_id', 'g12_school_id']
    ml.replace_with_mean(df, school_vars)

    ## Fill missing grade and test score information -- use mean imputation for now
    grades_tests = ['g6_q1mpa', 'g6_q2mpa', 'g6_q3mpa', 'g6_q4mpa', 'g6_g6mapr',
                    'g7_q1mpa', 'g7_q2mpa', 'g7_q3mpa', 'g7_q4mpa', 'g7_g7mapr',
                    'g8_q1mpa', 'g8_q2mpa', 'g8_q3mpa', 'g8_q4mpa', 'g8_g8mapr',
                    'g9_q1mpa', 'g9_q2mpa', 'g9_q3mpa', 'g9_q4mpa', 'g9_g8mapr',
                    'g10_q1mpa', 'g10_q2mpa', 'g10_q3mpa', 'g10_q4mpa', 'g10_psatv', 'g10_psatm',
                    'g11_q1mpa', 'g11_q2mpa', 'g11_q3mpa', 'g11_q4mpa', 'g11_psatv', 'g11_psatm',
                    'g12_q1mpa', 'g12_q2mpa', 'g12_q3mpa', 'g12_q4mpa', 'g12_psatv', 'g12_psatm']
    ml.replace_with_mean(df, grades_tests)

    ## Fill in missing id with dummy
    ml.replace_with_value(df, ['id'], [0])

    ## Fill missing MSAM data
    g6_msam = ['g6_g6msam_Advanced','g6_g6msam_Basic','g6_g6msam_Proficient']
    ml.replace_dummy_null_mean(df, 'g6_g6msam_nan', g6_msam)

    if cohort == 'cohort1':
        g7_msam = ['g7_g7msam_Advanced','g7_g7msam_Basic','g7_g7msam_Proficient']
        ml.replace_dummy_null_mean(df, 'g7_g7msam_nan', g7_msam)
    elif cohort == 'cohort2':
        g7_msam = ['g7_g7msam_ ','g7_g7msam_1','g7_g7msam_2', 'g7_g7msam_3']
        ml.replace_dummy_null_mean(df, 'g7_g7msam_nan', g7_msam)

    g8_msam = ['g8_g8msam_Advanced','g8_g8msam_Basic','g8_g8msam_Proficient']
    ml.replace_dummy_null_mean(df, 'g8_g8msam_nan', g8_msam)

    g9_msam = ['g9_g8msam_Advanced','g9_g8msam_Basic','g9_g8msam_Proficient']
    ml.replace_dummy_null_mean(df,'g9_g8msam_nan', g9_msam)

    
    ############################
    ## IMPUTE BEHAVIORAL DATA ##
    ############################

    print "Impute missing behavioral data..."

    ## Fill missing behavioral data -- use mean imputation for now
    behavioral_cols = ['g6_absrate', 'g6_nsusp','g7_absrate', 'g7_tardyr', 'g7_nsusp', 'g8_absrate', 'g8_tardyr', 'g8_nsusp', 'g9_absrate', 'g9_nsusp', 'g10_absrate', 'g10_nsusp', 'g11_absrate', 'g11_nsusp','g12_absrate', 'g12_nsusp']
    ml.replace_with_mean(df, behavioral_cols)

    ## Fill in missing birthday data
    #ml.replace_with_mean(df, 'birthday')

    ############################
    ## IMPUTE ENROLLMENT DATA ##
    ############################

    print "Imputing missing enrollment data..."

    ## Fill missing enrollment data
    print "Fixing mobility columns..."
    mobility_cols = ['g10_retained', 'g6_mobility', 'g7_mobility', 'g8_mobility', 'g9_mobility', 'g9_retained','g10_mobility', 'g11_mobility', 'g12_mobility', 'birthday']
    # Includes g10_retained because it's coded as 0/1 already
    ml.replace_with_mean(df, mobility_cols)


    #########################
    ## IMPUTE DROPOUT DATA ##
    #########################

    print "Impute missing droput information..."

    ## Fill missing dropout information with 0
    dropout_vars = ['g6_dropout', 'g7_dropout', 'g8_dropout', 'g9_dropout', 'g10_dropout', 'g11_dropout', 'g12_dropout', 'dropout']
    ml.replace_with_value(df, dropout_vars, [0] * len(dropout_vars))

    #variables = list(df.columns.values)
    #print variables



    ############################
    # IMPUTE NEIGHBORHOOD DATA #
    ############################

    print "Imputing missing school neighborhood data..."

    ## Fill missing school neighborhood data
    print "Fixing neighborhood columns..."
    """
    neighborhood_cols = ['suspensionrate',  'mobilityrateentrantswithdra',  'attendancerate',   'avg_class_size',   'studentinstructionalstaffratio',   'dropoutrate',  'grade12documenteddecisionco',  'grade12documenteddecisionem',  'grade12documenteddecisionmi',  'grad12docdec_col_emp', 'graduationrate',   'studentsmeetinguniversitysyste',   'Est_Households_2012',  'Est_Population_2012',  'Med_Household_Income_2012',    'Mean_Household_Income_2012',   'Pop_Below_Poverty_2012',   'Percent_Below_Poverty_2012',   'Pop_Under18_2012', 'Under18_Below_Poverty_2012',   'Under18_Below_Poverty_Percent_2012',   'Housholds_on_Food_stamps_with_Children_Under18_2012',  'Housholds_Pop_on_Food_Stamps_2012',    'Pop_BlackAA_2012', 'Pop_White_2012',   'Bt_18_24_percent_less_than_High_School_2012',  'Bt_18_24_percent_High_School_2012',    'Bt_18_24_percent_Some_College_or_AA_2012', 'Bt_1824_percent_BA_or_Higher_2012',    'Over_25_percent_less_than_9th_grade_2012', 'Over_25_percent_9th_12th_2012',    'Over_25_percent_High_School_2012', 'Over_25__percent_Some_College_No_Deg_2012',    'Over_25_percent_AA_2012',  'Over_25_percent_Bachelors_2012',   'Over_25_percent_Graduate_or_Professionals_2012']
    """

    neighborhood_cols = ['g9_suspensionrate', 'g10_suspensionrate', 'g11_suspensionrate', 'g12_suspensionrate', 'g9_mobilityrateentrantswithdra', 'g10_mobilityrateentrantswithdra', 'g11_mobilityrateentrantswithdra', 'g12_mobilityrateentrantswithdra', 'g9_attendancerate', 'g10_attendancerate', 'g11_attendancerate', 'g12_attendancerate','g9_avg_class_size', 'g10_avg_class_size', 'g11_avg_class_size', 'g12_avg_class_size','g9_studentinstructionalstaffratio', 'g10_studentinstructionalstaffratio', 'g11_studentinstructionalstaffratio', 'g12_studentinstructionalstaffratio','g9_dropoutrate', 'g10_dropoutrate', 'g11_dropoutrate', 'g12_dropoutrate', 'g9_grade12documenteddecisionco', 'g10_grade12documenteddecisionco', 'g11_grade12documenteddecisionco', 'g12_grade12documenteddecisionco','g9_grade12documenteddecisionem', 'g10_grade12documenteddecisionem', 'g11_grade12documenteddecisionem', 'g12_grade12documenteddecisionem','g9_grade12documenteddecisionmi', 'g10_grade12documenteddecisionmi', 'g11_grade12documenteddecisionmi', 'g12_grade12documenteddecisionmi', 'g9_grad12docdec_col_emp', 'g10_grad12docdec_col_emp', 'g11_grad12docdec_col_emp', 'g12_grad12docdec_col_emp', 'g9_graduationrate', 'g10_graduationrate', 'g11_graduationrate', 'g12_graduationrate','g9_studentsmeetinguniversitysyste', 'g10_studentsmeetinguniversitysyste', 'g11_studentsmeetinguniversitysyste', 'g12_studentsmeetinguniversitysyste', 'g9_Est_Households_2012', 'g10_Est_Households_2012', 'g11_Est_Households_2012', 'g12_Est_Households_2012','g9_Est_Population_2012', 'g10_Est_Population_2012', 'g11_Est_Population_2012', 'g12_Est_Population_2012', 'g9_Med_Household_Income_2012', 'g10_Med_Household_Income_2012', 'g11_Med_Household_Income_2012', 'g12_Med_Household_Income_2012', 'g9_Mean_Household_Income_2012', 'g10_Mean_Household_Income_2012', 'g11_Mean_Household_Income_2012', 'g12_Mean_Household_Income_2012', 'g9_Pop_Below_Poverty_2012', 'g10_Pop_Below_Poverty_2012', 'g11_Pop_Below_Poverty_2012', 'g12_Pop_Below_Poverty_2012', 'g9_Percent_Below_Poverty_2012', 'g10_Percent_Below_Poverty_2012', 'g11_Percent_Below_Poverty_2012', 'g12_Percent_Below_Poverty_2012', 'g9_Pop_Under18_2012', 'g10_Pop_Under18_2012', 'g11_Pop_Under18_2012', 'g12_Pop_Under18_2012', 'g9_Under18_Below_Poverty_2012', 'g10_Under18_Below_Poverty_2012', 'g11_Under18_Below_Poverty_2012', 'g12_Under18_Below_Poverty_2012', 'g9_Under18_Below_Poverty_Percent_2012', 'g10_Under18_Below_Poverty_Percent_2012', 'g11_Under18_Below_Poverty_Percent_2012', 'g12_Under18_Below_Poverty_Percent_2012', 'g9_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g10_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g11_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g12_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g9_Housholds_Pop_on_Food_Stamps_2012', 'g10_Housholds_Pop_on_Food_Stamps_2012', 'g11_Housholds_Pop_on_Food_Stamps_2012', 'g12_Housholds_Pop_on_Food_Stamps_2012', 'g9_Pop_BlackAA_2012', 'g10_Pop_BlackAA_2012', 'g11_Pop_BlackAA_2012', 'g12_Pop_BlackAA_2012', 'g9_Pop_White_2012', 'g10_Pop_White_2012', 'g11_Pop_White_2012', 'g12_Pop_White_2012', 'g9_Bt_18_24_percent_less_than_High_School_2012', 'g10_Bt_18_24_percent_less_than_High_School_2012', 'g11_Bt_18_24_percent_less_than_High_School_2012', 'g12_Bt_18_24_percent_less_than_High_School_2012', 'g9_Bt_18_24_percent_High_School_2012', 'g10_Bt_18_24_percent_High_School_2012', 'g11_Bt_18_24_percent_High_School_2012', 'g12_Bt_18_24_percent_High_School_2012', 'g9_Bt_18_24_percent_Some_College_or_AA_2012', 
'g10_Bt_18_24_percent_Some_College_or_AA_2012', 'g11_Bt_18_24_percent_Some_College_or_AA_2012', 'g12_Bt_18_24_percent_Some_College_or_AA_2012', 'g9_Bt_1824_percent_BA_or_Higher_2012', 'g10_Bt_1824_percent_BA_or_Higher_2012', 'g11_Bt_1824_percent_BA_or_Higher_2012', 'g12_Bt_1824_percent_BA_or_Higher_2012', 'g9_Over_25_percent_less_than_9th_grade_2012', 'g10_Over_25_percent_less_than_9th_grade_2012', 'g11_Over_25_percent_less_than_9th_grade_2012', 'g12_Over_25_percent_less_than_9th_grade_2012', 'g9_Over_25_percent_9th_12th_2012', 'g10_Over_25_percent_9th_12th_2012', 'g11_Over_25_percent_9th_12th_2012', 'g12_Over_25_percent_9th_12th_2012', 'g9_Over_25_percent_High_School_2012', 'g10_Over_25_percent_High_School_2012', 'g11_Over_25_percent_High_School_2012', 'g12_Over_25_percent_High_School_2012', 'g9_Over_25__percent_Some_College_No_Deg_2012', 'g10_Over_25__percent_Some_College_No_Deg_2012', 'g11_Over_25__percent_Some_College_No_Deg_2012', 'g12_Over_25__percent_Some_College_No_Deg_2012', 'g9_Over_25_percent_AA_2012', 'g10_Over_25_percent_AA_2012', 'g11_Over_25_percent_AA_2012', 'g12_Over_25_percent_AA_2012', 'g9_Over_25_percent_Bachelors_2012', 'g10_Over_25_percent_Bachelors_2012', 'g11_Over_25_percent_Bachelors_2012', 'g12_Over_25_percent_Bachelors_2012', 'g9_Over_25_percent_Graduate_or_Professionals_2012', 'g10_Over_25_percent_Graduate_or_Professionals_2012', 'g11_Over_25_percent_Graduate_or_Professionals_2012', 'g12_Over_25_percent_Graduate_or_Professionals_2012']
    ml.replace_with_mean(df, neighborhood_cols)


    summary = ml.summarize(df)
    print summary.T
    #ml.print_to_csv(summary.T, 'updated_summary_stats_vertical.csv')

    ml.print_to_csv(df, 'data/imputed_data.csv')
    #ml.print_to_csv(df, '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data.csv')
    print "Done!"
Example #13
    summary = ml.summarize(df)
    print summary.T
    #ml.print_to_csv(summary.T, 'updated_summary_stats_vertical.csv')

    ml.print_to_csv(df, 'data/imputed_data.csv')
    #ml.print_to_csv(df, '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data.csv')
    print "Done!"

#-------------------------------------------------------

if __name__ == '__main__':

    dataset = "data/cohort1_all_school.csv"
    #dataset = "data/cohort2_all_school.csv"
    #dataset = "/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/cohort1_all_school.csv"

    #df = summarize_data(dataset)
    df = ml.read_data(dataset)
    #clean_data(df, 'cohort1')
    #clean_data(df, 'cohort2')

    #non_dummy_data = 'data/predummy_data.csv'
    #non_dummy_data = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/predummy_data.csv'
    #deal_with_dummies(non_dummy_data)

    clean_dataset = 'data/clean_data.csv'
    #clean_dataset = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data.csv'
    impute_data(clean_dataset, 'cohort1')
    #impute_data(clean_dataset, 'cohort2')
Example #14
import pipeline as pl

df = pl.read_data('credit-data.csv')

training, testing, targets = pl.split_testing(df, 0.2, 'SeriousDlqin2yrs')
training = pl.fill_empty(training)
testing = pl.fill_empty(testing)

pl.explore_data(training)
pl.scatter_data(training, 'SeriousDlqin2yrs')
pl.scatter_data(training, 'MonthlyIncome')

pl.bucket_continuous(
    training,
    'age',
    'age_cat',
    bins=9,
    names=['20s', '30s', '40s', '50s', '60s', '70s', '80s', '90s', '100s'])
pl.dummy_categories(training, 'age_cat')

pl.bucket_continuous(
    testing,
    'age',
    'age_cat',
    bins=9,
    names=['20s', '30s', '40s', '50s', '60s', '70s', '80s', '90s', '100s'])
pl.dummy_categories(testing, 'age_cat')
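
pl.bucket_continuous is the pipeline's helper; an equivalent built on pd.cut might look like this sketch (an assumption, not the actual implementation):

import pandas as pd

def bucket_continuous(df, col, new_col, bins, names):
    # equal-width buckets when bins is an int; names become the labels
    df[new_col] = pd.cut(df[col], bins=bins, labels=names)
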

features = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
Example #15
def impute_data(df, cohort):

    #import IPython
    #IPython.embed()

    if isinstance(df, str):
        df = ml.read_data(df)

    #########################
    ## IMPUTE MISSING DATA ##
    #########################
    print "Imputing missing data..."

    # change msam to missing if msam_NA == 1
    nanList = [
        'g6_g6msam_nan', 'g7_g7msam_nan', 'g8_g8msam_nan', 'g9_g8msam_nan'
    ]
    varList = [
        ['g6_g6msam_Advanced', 'g6_g6msam_Basic', 'g6_g6msam_Proficient'],
        ['g7_g7msam_Advanced', 'g7_g7msam_Basic', 'g7_g7msam_Proficient'],
        ['g8_g8msam_Advanced', 'g8_g8msam_Basic', 'g8_g8msam_Proficient'],
        ['g9_g8msam_Advanced', 'g9_g8msam_Basic', 'g9_g8msam_Proficient']
    ]
    for nacol, colList in zip(nanList, varList):
        for col in colList:
            df.loc[df[nacol] == 1, col] = np.nan

    # predict missing values using the row mean of any available
    # related columns for the same measure
    wordList = [
        'absrate', 'mapr', 'msam_Advanced', 'msam_Basic', 'msam_Proficient',
        'mobility', 'nsusp', 'mpa', 'tardyr', 'psatm', 'psatv', 'retained'
    ]
    for word in wordList:
        colList = [col for col in df.columns if word in col]
        rowMean = df[colList].mean(axis=1)
        for col in colList:
            print df[col].value_counts(dropna=False)
            # assign back rather than fillna(inplace=True) on a slice,
            # which may not write through to df
            df[col] = df[col].fillna(rowMean)
            print df[col].value_counts(dropna=False)
    '''
    ############################
    # IMPUTE NEIGHBORHOOD DATA #
    ############################

    print "Imputing missing school neighborhood data..."

    ## Fill missing school neighborhood data
    print "Fixing neighborhood columns..."
    neighborhood_cols = ['suspensionrate',  'mobilityrateentrantswithdra',  'attendancerate',   'avg_class_size',   'studentinstructionalstaffratio',   'dropoutrate',  'grade12documenteddecisionco',  'grade12documenteddecisionem',  'grade12documenteddecisionmi',  'grad12docdec_col_emp', 'graduationrate',   'studentsmeetinguniversitysyste',   'Est_Households_2012',  'Est_Population_2012',  'Med_Household_Income_2012',    'Mean_Household_Income_2012',   'Pop_Below_Poverty_2012',   'Percent_Below_Poverty_2012',   'Pop_Under18_2012', 'Under18_Below_Poverty_2012',   'Under18_Below_Poverty_Percent_2012',   'Housholds_on_Food_stamps_with_Children_Under18_2012',  'Housholds_Pop_on_Food_Stamps_2012',    'Pop_BlackAA_2012', 'Pop_White_2012',   'Bt_18_24_percent_less_than_High_School_2012',  'Bt_18_24_percent_High_School_2012',    'Bt_18_24_percent_Some_College_or_AA_2012', 'Bt_1824_percent_BA_or_Higher_2012',    'Over_25_percent_less_than_9th_grade_2012', 'Over_25_percent_9th_12th_2012',    'Over_25_percent_High_School_2012', 'Over_25__percent_Some_College_No_Deg_2012',    'Over_25_percent_AA_2012',  'Over_25_percent_Bachelors_2012',   'Over_25_percent_Graduate_or_Professionals_2012']
    ml.replace_with_mean(df, neighborhood_cols)
    '''

    #summary = ml.summarize(df)
    #print summary.T
    #ml.print_to_csv(summary.T, 'updated_summary_stats_vertical.csv')

    return_file = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data_cohort' + str(
        cohort) + '.csv'
    ml.print_to_csv(df, return_file)

    #IPython.embed()

    print "Done!"
    import IPython
    IPython.embed()
    return df
Example #16
    #    summarize_data(df)
    #    summarize_data(test)

    ## CLEAN DATA
    #    print "Cleaning Cohort 1..."
    #    predummy_cohort1 = clean_data(df, 1)

    #    print "Cleaning Cohort 2..."
    #    predummy_cohort2 = clean_data(test, 2)

    #    clean_cohort1 = deal_with_dummies(predummy_cohort1, 1)
    #    clean_cohort2 = deal_with_dummies(predummy_cohort2, 2)

    ## TRAINING DATA: CHOOSE SUBSET
    clean_cohort1 = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data_cohort1.csv'
    df = ml.read_data(clean_cohort1)
    cols_to_use, y = choose_columns(df, 12)
    rows = choose_rows(df, 12)

    #    X = df[cols_to_use]
    #    y = df[y]
    #import IPython
    #IPython.embed()
    ## TRAINING DATA: IMPUTATION
    #    subset = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data_cohort1.csv'
    X = impute_data(rows[cols_to_use], 1)

    #IPython.embed()
    ## TRAINING DATA: START K-FOLD WITH CORRECT DATA
    #    imputed_dataset = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data.csv'
    #    df = ml.read_data(imputed_dataset)
Example #17
import pickle
from pipeline import read_data, TRAIN_STATS, process_pipeline, fit_pipeline

data = read_data('static/dat/br-raw_2007-2018.csv')
totals = read_data('static/dat/totals.csv')
spreads = read_data('static/dat/spreads.csv')
moneylines = read_data('static/dat/moneylines.csv')

data = data.merge(spreads[['abrv', 'date', 'spread']],
                  how='outer',
                  left_on=['team', 'date'],
                  right_on=['abrv', 'date'])
data = data.merge(totals[['abrv', 'date', 'total']],
                  how='outer',
                  left_on=['team', 'date'],
                  right_on=['abrv', 'date'])
data = data.merge(moneylines[['abrv', 'date', 'moneylines']],
                  how='outer',
                  left_on=['home_abrv', 'date'],
                  right_on=['abrv', 'date'])
data = data.merge(moneylines[['abrv', 'date', 'moneylines']],
                  how='outer',
                  left_on=['visitor_abrv', 'date'],
                  right_on=['abrv', 'date'],
                  suffixes=['_home', '_away'])
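
The moneylines table is merged twice, once keyed on the home team and once on the visitor, so suffixes is what keeps the two resulting moneylines columns apart. A toy illustration:

import pandas as pd

games = pd.DataFrame({'home_abrv': ['BOS'], 'visitor_abrv': ['LAL'],
                      'date': ['2018-01-01']})
odds = pd.DataFrame({'abrv': ['BOS', 'LAL'], 'date': ['2018-01-01'] * 2,
                     'moneylines': [-150, 130]})

out = games.merge(odds, left_on=['home_abrv', 'date'],
                  right_on=['abrv', 'date'])
out = out.merge(odds, left_on=['visitor_abrv', 'date'],
                right_on=['abrv', 'date'], suffixes=['_home', '_away'])
print(out[['moneylines_home', 'moneylines_away']])
#    moneylines_home  moneylines_away
# 0             -150              130
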

p = process_pipeline().transform(data)

train = p.dropna(subset=TRAIN_STATS + [
    'spread_cover', 'spread', 'total_cover', 'total', 'moneylines_home',
    'moneylines_away'
Example #18
def prepare_data(dataset):
	
	#######################################################
	# Load Credit Data and Run Initial Summary Statistics #
	#######################################################
	print "Loading data..."

	## LOAD DATA
	df = ml.read_data(dataset)
	variables = list(df.columns.values)
	
	## RUN INITIAL SUMMARY STATISTICS & GRAPH DISTRIBUTIONS
	summary = ml.summarize(df)
	#print_to_csv(summary, 'summary_stats.csv')
	
	for v in variables:
		ml.histogram(df, v)

	## FOR FUTURE: Drop rows where 'percentage' fields have values > 1

	############################
	# Deal with missing values #
	############################
	print "Handling missing values..."

	print "Correcting dependents column..."
	''' DEPENDENTS: Missing values are likely zeros. Someone who left
	this field blank most likely has no dependents to report.'''
	variables = ['NumberOfDependents']
	values = [0]
	ml.replace_with_value(df, variables, values)

	print "Correcting income column..."
	'''MONTHLY INCOME: Replacing missing values with a single constant
	wouldn't make sense here. Instead, impute null values with the mean
	of income.'''
	variables = ['MonthlyIncome']
	ml.replace_with_mean(df, variables)

	#ml.print_to_csv(df, 'credit-data-updated.csv')

	#####################
	# Generate Features #
	#####################
	print "Generating features..."

	## FIND IMPORTANT FEATURES
	test_features = np.array(['RevolvingUtilizationOfUnsecuredLines', 'age',
							'NumberOfTime30-59DaysPastDueNotWorse',
							'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
							'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
							'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'])
	y = 'SeriousDlqin2yrs'

	## Find initial best features
	#print ml.find_features(df, test_features, y)

	## ENGINEER ADDITIONAL FEATURES
	print "Engineering income buckets..."
	'''MONTHLY INCOME: Break this into buckets, adjusting for outliers'''
	df['MonthlyIncome_adjust'] = df.MonthlyIncome.apply(lambda x: ml.adjust_outliers(x, 15000))	
	ml.bin_variable(df, 'MonthlyIncome_adjust', 15, False)
	#print pd.value_counts(df['MonthlyIncome_adjust_bins'])

	print "Engineering age buckets..."
	'''AGE: Break this into buckets'''
	bins = [-1] + range(20, 80, 5) + [120]
	ml.bin_variable(df, 'age', bins, False)
	#print pd.value_counts(df['age_bins'])
	
	#print df.head()

	## RECALCULATE IMPORTANT FEATURES
	new_features = np.array(['MonthlyIncome_adjust_bins', 'age_bins'])
	all_features = np.hstack((test_features, new_features))
	#print all_features
	#print ml.summarize(df)

	## FIND BEST FEATURES
	#print ml.find_features(df, all_features, y)

	### FOR FUTURE: It would be cool to be able to automatically point to the top
	### five best features or focus on the features that meet a certain threshold.
	### Then I could return that as well for the run_classifiers function.

	## PRINT PREPARED DATA TO CSV
	file_name = "credit-data-clean.csv"
	ml.print_to_csv(df, file_name)

	return file_name, y
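
ml.adjust_outliers and ml.bin_variable are the author's helpers; the sketches below are consistent with the calls above but are assumptions, not the project's code:

import pandas as pd

def adjust_outliers(x, cap):
    # winsorize a single value at the cap (used via Series.apply)
    return min(x, cap)

def bin_variable(df, col, bins, labels=False):
    # pd.cut accepts either a bin count (15) or explicit edges
    # ([-1, 20, ..., 120]); labels=False yields integer bucket codes
    df[col + '_bins'] = pd.cut(df[col], bins=bins, labels=labels)
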
Example #19
'''
CAPP30254 HW3
Xuan Bu
Run Pipeline
'''
import pipeline as pl
import classifiers as clf
from sklearn import metrics
import pandas as pd

# Step 1: Read Data
cols_to_drop = ['teacher_prefix', 'teacher_acctid', 'schoolid',\
        'school_ncesid', 'school_latitude', 'school_longitude',\
        'school_city', 'school_state', 'school_district', 'school_county']
df = pl.read_data('../data/projects_2012_2013.csv', cols_to_drop)

# Step 2: Explore Data
continuous_vars = [
    'total_price_including_optional_support', 'students_reached'
]
categorical_vars = ['school_metro', 'school_charter', 'school_magnet',\
                    'primary_focus_subject', 'primary_focus_area',\
                    'secondary_focus_subject', 'secondary_focus_area',\
                    'resource_type', 'poverty_level', 'grade_level',\
                    'eligible_double_your_impact_match']

pl.summary_continuous_vars(df, continuous_vars)
for cat in categorical_vars:
    print(pl.summary_categorical_vars(df, cat))
pl.generate_graph(df, continuous_vars)
pl.generate_corr_graph(df)
Example #20
import pandas as pd
import numpy as np
import matplotlib
import statsmodels.api as sm
import os
from matplotlib import pyplot as plt
import seaborn as sns

import pipeline as pipe



if __name__ == '__main__':
    
    # import data
    train = pipe.read_data('cs-training.csv')
    test = pipe.read_data('cs-test.csv')
    
    # explore data
    if 'plots' not in os.listdir():
        os.mkdir('plots')
        
    os.chdir('plots')
    pipe.explore_data(train,False,'train')  
    os.chdir('..')
    
    # process data
    train = pipe.process_data(train)
    test.iloc[:, 1:] = pipe.process_data(test.iloc[:, 1:])
    
    heavy_tail_club = ['revolving_utilization_of_unsecured_lines',