def drop_vals(perc, c):
    """Score a BayesianRidge model fitted on complete cases only.

    Training rows that contain any missing value are discarded before the
    fit; the fitted model is then scored on the held-out drug data.

    Args:
        perc: Forwarded to ``glm_testing.create_missing`` (presumably the
            amount of missingness to inject -- confirm against that helper).
        c: Forwarded to both ``glm_testing.create_missing`` and
            ``glm_testing.test_drug``.

    Returns:
        The negative sum of Gaussian log-densities of the true drug values,
        evaluated under the predictive mean and standard deviation returned
        by the model.
    """
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)

    # pandas has no ``DataFrame.drop_na``; build the complete-row mask
    # explicitly so the very same mask can be applied to the targets.
    # Without filtering ``y`` the design and target lengths disagree in fit.
    # NOTE(review): assumes ``df`` is a pandas DataFrame and ``y`` is
    # positionally aligned with its rows -- confirm against create_missing.
    complete = df.notna().all(axis=1).values
    design = df.loc[complete]
    targets = y[complete]

    clf = BayesianRidge()
    clf.fit(design, targets)
    drug_preds, std = clf.predict(drug_vals, return_std=True)
    return -scipy.stats.norm(drug_preds, std).logpdf(drug_true).sum()
def knn_fill_conf(perc, c):
    """Count low-density test points after KNN (k=3) imputation.

    Args:
        perc: Forwarded to ``glm_testing.create_missing``.
        c: Forwarded to both ``glm_testing.create_missing`` and
            ``glm_testing.test_drug``.

    Returns:
        The number of true drug values whose Gaussian predictive density
        (mean and standard deviation from BayesianRidge) is below 0.05.
    """
    train_x, train_y = glm_testing.create_missing(perc=perc, c=c)
    test_x, test_y = glm_testing.test_drug(c=c)

    # Fill the missing training entries from the 3 nearest neighbours.
    imputer = fancyimpute.KNN(k=3)
    filled = imputer.fit_transform(train_x)

    model = BayesianRidge()
    model.fit(filled, train_y)
    pred_mean, pred_std = model.predict(test_x, return_std=True)

    # NOTE(review): the 0.05 cut-off is applied to the density value itself,
    # not to a tail probability -- confirm this is the intended criterion.
    density = scipy.stats.norm(pred_mean, pred_std).pdf(test_y)
    return sum(density < 0.05)
def mean_fill(perc, c):
    """Score a BayesianRidge model fitted on mean-imputed training data.

    Args:
        perc: Forwarded to ``glm_testing.create_missing``.
        c: Forwarded to both ``glm_testing.create_missing`` and
            ``glm_testing.test_drug``.

    Returns:
        The negative sum of Gaussian log-densities of the true drug values
        under the model's predictive mean and standard deviation.
    """
    train_x, train_y = glm_testing.create_missing(perc=perc, c=c)
    test_x, test_y = glm_testing.test_drug(c=c)

    # Replace every missing entry using the 'mean' fill method.
    imputer = fancyimpute.SimpleFill(fill_method='mean')
    filled = imputer.fit_transform(train_x)

    model = BayesianRidge()
    model.fit(filled, train_y)
    pred_mean, pred_std = model.predict(test_x, return_std=True)

    predictive = scipy.stats.norm(pred_mean, pred_std)
    log_density = predictive.logpdf(test_y)
    return -log_density.sum()
def dec_mean(perc, c):
    """Log-likelihood of a decision tree classifier with mean imputation.

    Both the training targets and the test targets are binarised: values
    greater than 1 become class 1, everything else class 0.

    Args:
        perc: Forwarded to ``glm_testing.create_missing``.
        c: Forwarded to both ``glm_testing.create_missing`` and
            ``glm_testing.test_drug``.

    Returns:
        The sum over test points of the log predicted probability of the
        true class (``-inf`` if any true class is given probability 0).
    """
    df, y = glm_testing.create_missing(perc=perc, c=c)
    y = y.apply(lambda x: 1 if x > 1 else 0)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    drug_true = drug_true.apply(lambda x: 1 if x > 1 else 0)

    design = fancyimpute.SimpleFill(fill_method='mean').fit_transform(df)
    clf = DecisionTreeClassifier()
    clf.fit(design, y)

    # Predict once and reuse the result for both classes.
    proba = clf.predict_proba(drug_vals)
    is_pos = np.asarray(drug_true == 1)
    is_neg = np.asarray(drug_true == 0)

    # Sum of logs instead of log of a product: mathematically identical but
    # it cannot underflow to 0 when many small probabilities are multiplied.
    # Column 1 / column 0 are taken as the probabilities of class 1 / 0.
    return np.log(proba[:, 1][is_pos]).sum() + np.log(proba[:, 0][is_neg]).sum()
def log_mean(perc, c, extras=0):
    """Log-likelihood of logistic regression with mean imputation.

    Both the training targets and the test targets are binarised: values
    greater than 1 become class 1, everything else class 0.

    Args:
        perc: Forwarded to ``glm_testing.create_missing``.
        c: Forwarded to both ``glm_testing.create_missing`` and
            ``glm_testing.test_drug``.
        extras: Forwarded unchanged to both helpers (defaults to 0).

    Returns:
        The sum over test points of the log predicted probability of the
        true class.
    """
    df, y = glm_testing.create_missing(perc=perc, c=c, extras=extras)
    y = y.apply(lambda x: 1 if x > 1 else 0)
    drug_vals, drug_true = glm_testing.test_drug(c=c, extras=extras)
    drug_true = drug_true.apply(lambda x: 1 if x > 1 else 0)

    design = fancyimpute.SimpleFill(fill_method='mean').fit_transform(df)
    clf = LogisticRegression(solver="liblinear")
    clf.fit(design, y)

    # Predict once and reuse the result for both classes.
    proba = clf.predict_proba(drug_vals)
    is_pos = np.asarray(drug_true == 1)
    is_neg = np.asarray(drug_true == 0)

    # Sum of logs instead of log of a product: mathematically identical but
    # it cannot underflow to 0 when many small probabilities are multiplied.
    # Column 1 / column 0 are taken as the probabilities of class 1 / 0.
    return np.log(proba[:, 1][is_pos]).sum() + np.log(proba[:, 0][is_neg]).sum()
def mean_fill_conf(perc, c):
    """Count low-density test points after mean imputation.

    Args:
        perc: Forwarded to ``glm_testing.create_missing``.
        c: Forwarded to both ``glm_testing.create_missing`` and
            ``glm_testing.test_drug``.

    Returns:
        The number of true drug values whose Gaussian predictive density
        (mean and standard deviation from BayesianRidge) is below 0.05.
    """
    train_x, train_y = glm_testing.create_missing(perc=perc, c=c)
    test_x, test_y = glm_testing.test_drug(c=c)

    # Replace every missing entry using the 'mean' fill method.
    imputer = fancyimpute.SimpleFill(fill_method='mean')
    filled = imputer.fit_transform(train_x)

    model = BayesianRidge()
    model.fit(filled, train_y)
    pred_mean, pred_std = model.predict(test_x, return_std=True)

    # NOTE(review): the original comment calls this "95 percent confidence",
    # but the 0.05 cut-off is applied to the density value itself, not to a
    # tail probability -- confirm this is the intended criterion.
    density = scipy.stats.norm(pred_mean, pred_std).pdf(test_y)
    return sum(density < 0.05)
def multi_imp_conf(perc, c, m=10):
    """Count low-density test points under a multiple-imputation mixture.

    The training data is imputed ``m`` times; a fresh BayesianRidge model is
    fitted on each imputed design and the Gaussian predictive densities of
    the true drug values are averaged with weight ``1 / m``.

    Args:
        perc: Forwarded to ``glm_testing.create_missing``.
        c: Forwarded to both ``glm_testing.create_missing`` and
            ``glm_testing.test_drug``.
        m: Number of imputations to average over (defaults to 10).

    Returns:
        The number of test points whose averaged density is below 0.05.
    """
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)

    # One accumulator slot per test point. This used to be a hard-coded
    # ``np.zeros(20)``, which fails (or silently broadcasts a length-1
    # result) whenever the test set does not have exactly 20 rows.
    prob_sum = np.zeros(len(drug_true))

    for k in range(m):
        # Seed derived deterministically from k, kept within 31 bits.
        seed = int(k * 243624) % 2**(32 - 1)
        design = fancyimpute.IterativeImputer(
            n_iter=10, sample_posterior=True, random_state=seed
        ).fit_transform(df)

        clf = BayesianRidge()
        clf.fit(design, y)
        drug_preds, std = clf.predict(drug_vals, return_std=True)
        prob_sum += (1 / m) * scipy.stats.norm(drug_preds, std).pdf(drug_true)

    # NOTE(review): the 0.05 cut-off is applied to the averaged density, not
    # to a tail probability -- confirm this is the intended "95 percent" test.
    return sum(prob_sum < 0.05)
def multi_imp(perc, c, m=10):
    """Negative log-likelihood under a multiple-imputation mixture.

    The training data is imputed ``m`` times; a fresh BayesianRidge model is
    fitted on each imputed design. The joint Gaussian likelihood of the true
    drug values is averaged over the ``m`` fits with weight ``1 / m``.

    Args:
        perc: Forwarded to ``glm_testing.create_missing``.
        c: Forwarded to both ``glm_testing.create_missing`` and
            ``glm_testing.test_drug``.
        m: Number of imputations to average over (defaults to 10).

    Returns:
        Minus the log of the averaged joint likelihood (``inf`` when
        ``m`` is 0, as before).
    """
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)

    # Running log of the averaged likelihood. Working in the log domain
    # avoids the underflow of multiplying many small densities, which would
    # otherwise collapse the average to 0 and the result to inf.
    log_mix = np.float64(-np.inf)

    for k in range(m):
        # Seed derived deterministically from k, kept within 31 bits.
        seed = int(k * 243624) % 2**(32 - 1)
        design = fancyimpute.IterativeImputer(
            n_iter=10, sample_posterior=True, random_state=seed
        ).fit_transform(df)

        clf = BayesianRidge()
        clf.fit(design, y)
        drug_preds, std = clf.predict(drug_vals, return_std=True)

        # log of (1/m) * prod(pdf) for this imputation.
        log_lik = scipy.stats.norm(drug_preds, std).logpdf(drug_true).sum()
        log_mix = np.logaddexp(log_mix, log_lik - np.log(m))

    return -log_mix
def dec_multi(perc, c, m=10):
    """Log-likelihood of a decision tree classifier with multiple imputation.

    Targets are binarised (values greater than 1 become class 1, everything
    else class 0). The training data is imputed ``m`` times, the classifier
    is refitted on each imputed design, and the joint likelihood of the true
    test classes is averaged over the fits with weight ``1 / m``.

    Args:
        perc: Forwarded to ``glm_testing.create_missing``.
        c: Forwarded to both ``glm_testing.create_missing`` and
            ``glm_testing.test_drug``.
        m: Number of imputations to average over (defaults to 10).

    Returns:
        The log of the averaged joint likelihood (``-inf`` when every fit
        assigns probability 0 to some true class, or when ``m`` is 0).
    """
    clf = DecisionTreeClassifier()
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    y = y.apply(lambda x: 1 if x > 1 else 0)
    drug_true = drug_true.apply(lambda x: 1 if x > 1 else 0)

    is_pos = np.asarray(drug_true == 1)
    is_neg = np.asarray(drug_true == 0)

    # Running log of the averaged likelihood, accumulated in the log domain
    # so that products of many small probabilities cannot underflow.
    log_mix = np.float64(-np.inf)

    for k in range(m):
        # Seed derived deterministically from k, kept within 31 bits.
        seed = int(k * 243624) % 2**(32 - 1)
        design = fancyimpute.IterativeImputer(
            n_iter=10, sample_posterior=True, random_state=seed
        ).fit_transform(df)
        clf.fit(design, y)

        # Predict once per fit and reuse the result for both classes.
        # Column 1 / column 0 are taken as the probabilities of class 1 / 0.
        proba = clf.predict_proba(drug_vals)
        log_lik = (np.log(proba[:, 1][is_pos]).sum()
                   + np.log(proba[:, 0][is_neg]).sum())
        log_mix = np.logaddexp(log_mix, log_lik - np.log(m))

    return log_mix
def log_multi(perc, c, m=10, extras=0):
    """Log-likelihood of logistic regression with multiple imputation.

    Targets are binarised (values greater than 1 become class 1, everything
    else class 0). The training data is imputed ``m`` times, the classifier
    is refitted on each imputed design, and the joint likelihood of the true
    test classes is averaged over the fits with weight ``1 / m``.

    Args:
        perc: Forwarded to ``glm_testing.create_missing``.
        c: Forwarded to both ``glm_testing.create_missing`` and
            ``glm_testing.test_drug``.
        m: Number of imputations to average over (defaults to 10).
        extras: Forwarded unchanged to both helpers (defaults to 0).

    Returns:
        The log of the averaged joint likelihood (``-inf`` when ``m`` is 0).
    """
    clf = LogisticRegression(solver="liblinear")
    df, y = glm_testing.create_missing(perc=perc, c=c, extras=extras)
    drug_vals, drug_true = glm_testing.test_drug(c=c, extras=extras)
    y = y.apply(lambda x: 1 if x > 1 else 0)
    drug_true = drug_true.apply(lambda x: 1 if x > 1 else 0)

    is_pos = np.asarray(drug_true == 1)
    is_neg = np.asarray(drug_true == 0)

    # Running log of the averaged likelihood, accumulated in the log domain
    # so that products of many small probabilities cannot underflow.
    log_mix = np.float64(-np.inf)

    for k in range(m):
        # Seed derived deterministically from k, kept within 31 bits.
        seed = int(k * 243624) % 2**(32 - 1)
        design = fancyimpute.IterativeImputer(
            n_iter=10, sample_posterior=True, random_state=seed
        ).fit_transform(df)
        clf.fit(design, y)

        # Predict once per fit and reuse the result for both classes.
        # Column 1 / column 0 are taken as the probabilities of class 1 / 0.
        proba = clf.predict_proba(drug_vals)
        log_lik = (np.log(proba[:, 1][is_pos]).sum()
                   + np.log(proba[:, 0][is_neg]).sum())
        log_mix = np.logaddexp(log_mix, log_lik - np.log(m))

    return log_mix