# Imputation walkthrough over `x`: every step prints a progress message and
# then stores the result of one `imp` strategy in its own variable.
#
# The messages use the parenthesised single-argument form of print, which
# emits exactly the same text under Python 2 and Python 3; the bare
# `print '...'` statement form is a SyntaxError on Python 3.

# replace missing values using imp.replace
print('imputing with random replacement')
data_replace = imp.replace(x, missing_data_cond)

# replace missing values with feature summary
# summ_func keeps only element [0] of whatever mode() returns.
print('imputing with feature summarization (mode)')
summ_func = lambda x: mode(x)[0]
data_mode = imp.summarize(x, summ_func, missing_data_cond)

# replace categorical features with one hot row
print('imputing with one-hot')
data_onehot = imp.binarize_data(x, cat_cols)

# replace missing data with predictions using random forest
print('imputing with predicted values from random forest')
clf = RandomForestClassifier(n_estimators=100, criterion='gini')
data_rf = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with predictions using SVM
# NOTE(review): 'usng' in the message below is a typo for 'using'; it is left
# untouched here because it is runtime output.
print('imputing with predicted values usng SVM')
# `clf` is rebound here; the random-forest model above has already been
# consumed by imp.predict. The call that uses this second classifier is not
# part of this fragment.
# NOTE(review): the keyword set looks like scikit-learn's LinearSVC -- confirm
# what `SVM` is bound to in the imports.
clf = SVM(
    penalty='l2',
    loss='squared_hinge',
    dual=True,
    tol=0.0001,
    C=1.0,
    multi_class='ovr',
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    verbose=0,
    random_state=None,
    max_iter=1000
)
# Apply each `imp` strategy to `pert_data` and keep the result in `data_dict`
# under that method's name, printing a progress message before every step.
#
# The messages use the parenthesised single-argument form of print, which
# emits exactly the same text under Python 2 and Python 3; the bare
# `print '...'` statement form is a SyntaxError on Python 3.

# drop observations with missing variables
print('imputing with drop')
data_dict['Drop'] = imp.drop(pert_data, miss_data_cond)

# replace missing values with random existing values
print('imputing with random replacement')
data_dict['RandomReplace'] = imp.replace(pert_data, miss_data_cond)

# replace missing values with feature summary
# summ_func keeps only element [0] of whatever mode() returns.
print('imputing with feature summarization (mode)')
summ_func = lambda x: mode(x)[0]
data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

# replace missing data with predictions using random forest
print('imputing with Random Forest')
data_dict['RandomForest'] = imp.predict(pert_data, cat_cols, miss_data_cond)

# replace missing data with values obtained after factor analysis
# NOTE(review): the message and the dict key say "PCA" while the call is
# imp.factor_analysis -- confirm which technique is actually meant.
print('imputing with PCA')
data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)

# replace missing data with knn
print('imputing with K-Nearest Neighbors')
data_dict['KNN'] = imp.knn(
    pert_data, n_neighbors, np.mean, miss_data_cond, cat_cols)

# Method-name lists. `methods` is the two extra entries followed by every
# name in `conf_methods`, so it is derived instead of being spelled out twice.
# NOTE(review): 'RawData' is not stored in data_dict anywhere in this
# fragment -- presumably set earlier; verify.
conf_methods = ['RandomReplace', 'Mode', 'RandomForest', 'PCA', 'KNN']
methods = ['RawData', 'Drop'] + conf_methods
color_mapping = {}
# Imputation walkthrough over `x`: random replacement (imp.replace), feature
# summary (imp.summarize with summ_func = element [0] of mode()), one-hot
# encoding (imp.binarize_data), random-forest prediction (imp.predict with a
# RandomForestClassifier), then construction of an SVM classifier.
# NOTE(review): the statements below are collapsed onto one physical line, so
# to the interpreter everything after the first '#' is a comment; they need
# to be put back on separate lines before this can run.
# NOTE(review): the fragment stops inside the SVM(...) argument list -- the
# closing parenthesis and any later statements are not visible here.
# NOTE(review): `clf = clf = SVM(` binds the same name twice in one chained
# assignment; the duplicate looks unintentional.
print "imputing with random replacement" data_replace = imp.replace(x, missing_data_cond) # replace missing values with feature summary print "imputing with feature summarization (mode)" summ_func = lambda x: mode(x)[0] data_mode = imp.summarize(x, summ_func, missing_data_cond) # replace categorical features with one hot row print "imputing with one-hot" data_onehot = imp.binarize_data(x, cat_cols) # replace missing data with predictions using random forest print "imputing with predicted values from random forest" clf = RandomForestClassifier(n_estimators=100, criterion="gini") data_rf = imp.predict(x, cat_cols, missing_data_cond, clf) # replace missing data with predictions using SVM print "imputing with predicted values usng SVM" clf = clf = SVM( penalty="l2", loss="squared_hinge", dual=True, tol=0.0001, C=1.0, multi_class="ovr", fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None,
# Apply each `imp` strategy to `pert_data` and keep the result in `data_dict`
# under that method's name, printing a progress message before every step.
#
# The messages use the parenthesised single-argument form of print, which
# emits exactly the same text under Python 2 and Python 3; the bare
# `print '...'` statement form is a SyntaxError on Python 3.

# drop observations with missing variables
print('imputing with drop')
data_dict['Drop'] = imp.drop(pert_data, miss_data_cond)

# replace missing values with random existing values
print('imputing with random replacement')
data_dict['RandomReplace'] = imp.replace(pert_data, miss_data_cond)

# replace missing values with feature summary
# summ_func keeps only element [0] of whatever mode() returns.
print('imputing with feature summarization (mode)')
summ_func = lambda x: mode(x)[0]
data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

# replace missing data with predictions using random forest
print('imputing with Random Forest')
data_dict['RandomForest'] = imp.predict(pert_data, cat_cols, miss_data_cond)

# replace missing data with values obtained after factor analysis
# NOTE(review): the message and the dict key say "PCA" while the call is
# imp.factor_analysis -- confirm which technique is actually meant.
print('imputing with PCA')
data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)

# replace missing data with knn
print('imputing with K-Nearest Neighbors')
data_dict['KNN'] = imp.knn(
    pert_data, n_neighbors, np.mean, miss_data_cond, cat_cols)

# Method-name lists. `methods` is the two extra entries followed by every
# name in `conf_methods`, so it is derived instead of being spelled out twice.
# NOTE(review): 'RawData' is not stored in data_dict anywhere in this
# fragment -- presumably set earlier; verify.
conf_methods = ['RandomReplace', 'Mode', 'RandomForest', 'PCA', 'KNN']
methods = ['RawData', 'Drop'] + conf_methods