# Script fragment: builds several imputed variants of ``x``.
#   * missing_data_cond flags entries equal to '?'.
#   * cat_cols is a tuple of column indices (presumably the categorical
#     columns -- TODO confirm against the data set); n_neighbors is set to 5.
#   * In order, it calls imp.replace, imp.summarize (summary = mode(x)[0]),
#     imp.binarize_data, and imp.predict with a
#     RandomForestClassifier(n_estimators=100, criterion='gini').
#   * The imp.drop variant is commented out in the original.
# NOTE(review): the line breaks of this fragment were lost. As a single
# physical line the statements run together and everything after the first
# '#' is treated as a comment, so it is not valid Python until the original
# line structure is restored.
# NOTE(review): the fragment ends inside an unfinished ``SVM(`` call; the
# remaining arguments are not visible here, so the code is left untouched.
missing_data_cond = lambda x: x == '?' cat_cols = (1, 3, 4, 5, 6, 7, 8, 12) n_neighbors = 5 # # drop observations with missing variables # print 'imputing with drop' # data_drop = imp.drop(x, missing_data_cond) # replace missing values with random existing values print 'imputing with random replacement' data_replace = imp.replace(x, missing_data_cond) # replace missing values with feature summary print 'imputing with feature summarization (mode)' summ_func = lambda x: mode(x)[0] data_mode = imp.summarize(x, summ_func, missing_data_cond) # replace categorical features with one hot row print 'imputing with one-hot' data_onehot = imp.binarize_data(x, cat_cols) # replace missing data with predictions using random forest print 'imputing with predicted values from random forest' clf = RandomForestClassifier(n_estimators=100, criterion='gini') data_rf = imp.predict(x, cat_cols, missing_data_cond, clf) # replace missing data with predictions using SVM print 'imputing with predicted values usng SVM' clf = SVM(penalty='l2', loss='squared_hinge', dual=True,
# Build one imputed copy of ``pert_data`` per strategy, keyed by strategy name.
# ``imp``, ``pert_data``, ``miss_data_cond``, ``cat_cols``, ``n_neighbors``,
# ``mode`` and ``np`` must already be defined earlier in the script.
# Progress messages use single-argument ``print(...)``, which produces the
# same output on Python 2 and Python 3.
data_dict = {}
data_dict['RawData'] = pert_data

# drop observations with missing variables
print('imputing with drop')
data_dict['Drop'] = imp.drop(pert_data, miss_data_cond)

# replace missing values with random existing values
print('imputing with random replacement')
data_dict['RandomReplace'] = imp.replace(pert_data, miss_data_cond)

# replace missing values with feature summary
print('imputing with feature summarization (mode)')
# mode(x)[0] keeps only the first element of mode's result
# (presumably the modal value, as with scipy.stats.mode -- TODO confirm).
summ_func = lambda x: mode(x)[0]
data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

# replace missing data with predictions using random forest
print('imputing with Random Forest')
data_dict['RandomForest'] = imp.predict(pert_data, cat_cols, miss_data_cond)

# replace missing data with values obtained after factor analysis
# NOTE(review): the message says "PCA" while the call is imp.factor_analysis;
# the runtime string is kept unchanged.
print('imputing with PCA')
data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)

# replace missing data with knn
print('imputing with K-Nearest Neighbors')
data_dict['KNN'] = imp.knn(pert_data,
                           n_neighbors,
                           np.mean,
                           miss_data_cond,
                           cat_cols)

# Every key stored above except 'RawData' and 'Drop'.
conf_methods = ['RandomReplace', 'Mode', 'RandomForest', 'PCA', 'KNN']
# Script fragment: builds several imputed variants of ``x``.
#   * missing_data_cond flags entries equal to "?".
#   * cat_cols lists column indices 0-15 (presumably every column is
#     categorical -- TODO confirm); n_neighbors is set to 3.
#   * In order, it calls imp.replace, imp.summarize (summary = mode(x)[0]),
#     imp.binarize_data, and imp.predict with a
#     RandomForestClassifier(n_estimators=100, criterion="gini").
#   * The imp.drop variant is commented out in the original.
# NOTE(review): the line breaks of this fragment were lost. As a single
# physical line the statements run together and everything after the first
# '#' is treated as a comment, so it is not valid Python until the original
# line structure is restored.
# NOTE(review): ``clf = clf = SVM(`` assigns the same name twice; the second
# ``clf =`` is redundant.
# NOTE(review): the fragment ends inside an unfinished ``SVM(`` call; the
# remaining arguments are not visible here, so the code is left untouched.
missing_data_cond = lambda x: x == "?" cat_cols = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) n_neighbors = 3 # lower for votes # drop observations with missing variables # print 'imputing with drop' # data_drop = imp.drop(x, missing_data_cond) # replace missing values with random existing values print "imputing with random replacement" data_replace = imp.replace(x, missing_data_cond) # replace missing values with feature summary print "imputing with feature summarization (mode)" summ_func = lambda x: mode(x)[0] data_mode = imp.summarize(x, summ_func, missing_data_cond) # replace categorical features with one hot row print "imputing with one-hot" data_onehot = imp.binarize_data(x, cat_cols) # replace missing data with predictions using random forest print "imputing with predicted values from random forest" clf = RandomForestClassifier(n_estimators=100, criterion="gini") data_rf = imp.predict(x, cat_cols, missing_data_cond, clf) # replace missing data with predictions using SVM print "imputing with predicted values usng SVM" clf = clf = SVM( penalty="l2", loss="squared_hinge",
# Build one imputed copy of ``pert_data`` per strategy, keyed by strategy name.
# ``imp``, ``pert_data``, ``miss_data_cond``, ``cat_cols``, ``n_neighbors``,
# ``mode`` and ``np`` must already be defined earlier in the script.
# Progress messages use single-argument ``print(...)``, which produces the
# same output on Python 2 and Python 3.
data_dict = {}
data_dict['RawData'] = pert_data

# drop observations with missing variables
print('imputing with drop')
data_dict['Drop'] = imp.drop(pert_data, miss_data_cond)

# replace missing values with random existing values
print('imputing with random replacement')
data_dict['RandomReplace'] = imp.replace(pert_data, miss_data_cond)

# replace missing values with feature summary
print('imputing with feature summarization (mode)')
# mode(x)[0] keeps only the first element of mode's result
# (presumably the modal value, as with scipy.stats.mode -- TODO confirm).
summ_func = lambda x: mode(x)[0]
data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

# replace missing data with predictions using random forest
print('imputing with Random Forest')
data_dict['RandomForest'] = imp.predict(pert_data, cat_cols, miss_data_cond)

# replace missing data with values obtained after factor analysis
# NOTE(review): the message says "PCA" while the call is imp.factor_analysis;
# the runtime string is kept unchanged.
print('imputing with PCA')
data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)

# replace missing data with knn
print('imputing with K-Nearest Neighbors')
data_dict['KNN'] = imp.knn(pert_data,
                           n_neighbors,
                           np.mean,
                           miss_data_cond,
                           cat_cols)