# replace missing data with predictions from a logistic regression model
print('imputing with predicted values usng logistic regression')
clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0,
                         fit_intercept=True, intercept_scaling=1)
data_logistic = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with values obtained after factor analysis
print('imputing with factor analysis')
data_facanal = imp.factor_analysis(x, cat_cols, missing_data_cond)

# replace missing data with knn
print('imputing with K-Nearest Neighbors')
data_knn = imp.knn(x, n_neighbors, np.mean, missing_data_cond, cat_cols)


def compute_histogram(data, labels):
    """Tabulate value frequencies in ``data`` and pad absent ``labels`` with 0.

    Args:
        data: sequence of mutually sortable values to count.
        labels: iterable of values that must each own a row in the result.

    Returns:
        A 2-column array whose rows are ``[value, count]``, ordered by the
        value column.  Each label not found among the counted values is
        appended as an object-dtype row ``[label, 0]`` before the final
        ordering step.
    """
    # np.unique(..., return_counts=True) is the replacement named in SciPy's
    # deprecation notice for stats.itemfreq; pairing values with counts
    # column-wise keeps the same [value, count] row layout.
    values, counts = np.unique(sorted(data), return_counts=True)
    histogram = np.array([values, counts]).T
    for label in labels:
        if label not in histogram[:, 0]:
            histogram = np.vstack(
                (histogram, np.array([[label, 0]], dtype=object)))
    # appended rows land at the bottom, so re-order by the value column
    return histogram[histogram[:, 0].argsort()]


# compute histograms
labels = np.unique(x[:, 1])
freq_data = {}
# replace missing values with feature summary
print('imputing with feature summarization (mode)')
# keep only element 0 of whatever mode() returns
# (presumably the modal value(s) -- confirm against the imported `mode`)
summ_func = lambda x: mode(x)[0]
data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

# replace missing data with predictions using random forest
print('imputing with Random Forest')
data_dict['RandomForest'] = imp.predict(pert_data, cat_cols, miss_data_cond)

# replace missing data with values obtained after factor analysis
print('imputing with PCA')
data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)

# replace missing data with knn
print('imputing with K-Nearest Neighbors')
data_dict['KNN'] = imp.knn(pert_data, n_neighbors, np.mean, miss_data_cond,
                           cat_cols)

# methods that get a confusion matrix; `methods` is that list preceded by
# the two non-imputing entries, so it is derived rather than retyped
conf_methods = ['RandomReplace', 'Mode', 'RandomForest', 'PCA', 'KNN']
methods = ['RawData', 'Drop'] + conf_methods

# give each method an evenly spaced value in (0, 1]: (position + 1) / count
color_mapping = {}
for i, method in enumerate(methods):
    color_mapping[method] = (i + 1) / float(len(methods))

###########################
# plot confusion matrices #
###########################
# one row per column in miss_data_cols, one column per imputation method
fig, axes = plt.subplots(len(miss_data_cols), len(conf_methods),
                         figsize=(8, 8))
max_iter=1000, ) data_svm = imp.predict(x, cat_cols, missing_data_cond, clf) # replace missing data with predictions using logistic regression print "imputing with predicted values usng logistic regression" clf = LogisticRegression(penalty="l2", dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1) data_logistic = imp.predict(x, cat_cols, missing_data_cond, clf) # replace missing data with values obtained after factor analysis print "imputing with factor analysis" data_facanal = imp.factor_analysis(x, cat_cols, missing_data_cond) # replace missing data with knn print "imputing with K-Nearest Neighbors" data_knn = imp.knn(x, n_neighbors, np.mean, missing_data_cond, cat_cols) def compute_histogram(data, labels): histogram = itemfreq(sorted(data)) for label in labels: if label not in histogram[:, 0]: histogram = np.vstack((histogram, np.array([[label, 0]], dtype=object))) histogram = histogram[histogram[:, 0].argsort()] return histogram # compute histograms labels = np.unique(x[:, 1]) freq_data = {} freq_data["Raw data"] = compute_histogram(x[:, 1], labels)
print 'imputing with feature summarization (mode)' summ_func = lambda x: mode(x)[0] data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond) # replace missing data with predictions using random forest print 'imputing with Random Forest' data_dict['RandomForest'] = imp.predict(pert_data, cat_cols, miss_data_cond) # replace missing data with values obtained after factor analysis print 'imputing with PCA' data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond) # replace missing data with knn print 'imputing with K-Nearest Neighbors' data_dict['KNN'] = imp.knn(pert_data, n_neighbors, np.mean, miss_data_cond, cat_cols) conf_methods = ['RandomReplace', 'Mode', 'RandomForest', 'PCA', 'KNN'] methods = [ 'RawData', 'Drop', 'RandomReplace', 'Mode', 'RandomForest', 'PCA', 'KNN' ] color_mapping = {} for i in xrange(len(methods)): color_mapping[methods[i]] = (i + 1) / float(len(methods)) ########################### # plot confusion matrices # ########################### fig, axes = plt.subplots(len(miss_data_cols),