# Exploratory analysis of the census-income training set: renders a pairwise
# scatter matrix to ./figures/pairwise.png, then splits the column names into
# numerical and categorical variables.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import load_data as ld

training_file = "../data/census_income_learn.csv"
metadata_file = "../data/census_income_metadata.txt"
df = ld.prepare_dataframe(training_file, metadata_file=metadata_file)

## pairwise scatter matrix
dpi = 150
figsize = (64, 48)

# `pd.tools.plotting` was relocated to `pd.plotting` (pandas 0.20) and later
# removed; prefer the current location and fall back to the legacy one so the
# script keeps working on old installs.
try:
    scatter_matrix = pd.plotting.scatter_matrix
except AttributeError:
    from pandas.tools.plotting import scatter_matrix

# scatter_matrix opens its own figure (sized by `figsize`) when no `ax` is
# given, so no plt.figure() call is needed beforehand; the figure it creates
# becomes the current one that tight_layout/savefig act on.
axes = scatter_matrix(df, alpha=0.02, figsize=figsize)
plt.tight_layout()
plt.savefig('./figures/pairwise.png', dpi=dpi)

## list numerical vs categorical variables
colnames = df.columns.values
# Series has no `is_numeric()` method; test the column dtype instead.
# dtype=bool keeps the mask usable as an index even when there are no columns.
# NOTE(review): is_numeric_dtype also reports boolean columns as numeric.
is_numerical = np.array(
    [pd.api.types.is_numeric_dtype(df[c]) for c in colnames], dtype=bool
)
is_categorical = np.logical_not(is_numerical)
numerical_variables = list(colnames[is_numerical])
# Presumably these two "recode" columns are integer-coded categories rather
# than quantities -- confirm against the metadata file. list.remove raises
# ValueError if either name is absent from the numerical columns.
numerical_variables.remove("detailed industry recode")
numerical_variables.remove("detailed occupation recode")
# Overall score: trace of the confusion matrix divided by len(y_valid),
# rounded to 3 decimals and expressed as a percentage.
# NOTE(review): confusion_score and y_valid are defined outside this excerpt.
# The pieces are joined with single spaces, matching what a comma-separated
# Python 2 print statement emits.
print(" ".join((
    "Score summary: ",
    str(round(float(np.trace(confusion_score)) / len(y_valid), 3) * 100.),
    "%",
)))
# How to read the confusion matrix ("true" means: earns 50000+):
#   (expected false, predicted false)  (expected false, predicted true)
#   (expected true,  predicted false)  (expected true,  predicted true)

## LOAD DATA
print("loading data...")

### load both datasets, then run the shared pre-processing step on the pair
train = ld.prepare_dataframe(TRAINING_FILE, metadata_file=METADATA_FILE)
valid = ld.prepare_dataframe(VALIDATION_FILE, metadata_file=METADATA_FILE)
train, valid = feat.engineer(train, valid)

### shortcuts: per dataset, the feature columns and the prediction column
features_train = train.drop(PREDICTION_COLNAME, axis=1)
target_train = train[PREDICTION_COLNAME]
features_valid = valid.drop(PREDICTION_COLNAME, axis=1)
target_valid = valid[PREDICTION_COLNAME]

## SELECT FEATURES
### Meant to be step 1 of an sklearn pipeline, but the pipeline is buggy
### under python-2.7, so the reduction is applied by hand here.
print("selecting features...")
features_train, features_valid = selector.reduce_dimension(
    features_train, features_valid
)