# NOTE(review): this chunk opens mid-way through a `loc_estimator` method — its
# `def` line, the nested `unit_loc_estimator` header, and the fixed-point
# iteration loop lie above this view. Indentation below is reconstructed from
# the standard M-estimator pattern — confirm against the full file.
                    # convergence reached: stop the fixed-point iteration
                    break
                # accept the updated location estimate and iterate again
                mu = mu_new
            return mu

        # vectorized function: apply the scalar estimator independently along `axis`
        return np.apply_along_axis(unit_loc_estimator, axis, arr, maxiter=maxiter, tor=tor)

    def fit(self, X, axis=0, maxiter=50, tor=0.001):
        """Estimate robust location and scale of X along `axis`.

        :param X: ndarray, data matrix
        :param axis: int, axis along which the estimators are applied
        :param maxiter: int, maximum number of iterations for the estimators
        :param tor: float, convergence tolerance
        :return: tuple (loc, scale) of robust estimates
        """
        loc = self.loc_estimator(X, axis=axis, maxiter=maxiter, tor=tor)
        scale = self.scale_estimator(X, axis=axis, maxiter=maxiter, tor=tor)
        return loc, scale


if __name__ == '__main__':
    # Smoke test: time the robust estimators and compute classical references
    # (std / median) on the same contaminated data for comparison.
    m_est = MEstimator()
    # NOTE(review): simulation_setup is unpacked into 4 arrays here, while other
    # chunks unpack 3 — confirm which signature this module's import provides.
    X_train, y_train, X_test, y_test = simulation_setup(n_i=1000, n_o=200, n_t=1000, p=10, sigma_e=0.25)
    t1 = time.time()
    m_estimated_scale = m_est.scale_estimator(X_train, maxiter=50)
    m_estimated_loc = m_est.loc_estimator(X_train, maxiter=50)
    standard_deviation = np.std(X_train, ddof=1, axis=0)  # classical (non-robust) scale reference
    median = np.median(X_train, axis=0)                   # robust location reference
    print('consumed time: %.5f s' % (time.time() - t1))
""" computes accuracy of the model :param X_test: ndarray, shape(n_samples, n_features) Test data :param y_test: ndarray, shape(n_samples,) Labels of test data :param prob_threshold: double, default: 0.5 probability threshold for determining the predicted labels :return: double accuracy of the logistic regression model. """ y_test = np.array(y_test, dtype=int) y_predict = self.predict(X_test, prob_threshold=prob_threshold) accuracy = np.mean(y_predict == y_test) return accuracy if __name__ == '__main__': data_train, data_test, beta_actual = simulation_setup(n_i=1000, n_o=200, n_t=1000, p=10, sigma_e=0.25) X_train, y_train = data_train[:, :-1], data_train[:, -1] X_test, y_test = data_test[:, :-1], data_test[:, -1] classical_bootstrap = ClassicalBootstrap() classical_bootstrap.fit(X_train, y_train) print("classical bootstrap score: ", classical_bootstrap.score(X_test, y_test))
n_strata_arr = np.arange(2, 16, 2, dtype=int) # lambda_ = np.arange(0.05, 0.55, 0.05) n_i = 1000 # n_o_arr = (n_i * lambda_).astype(int) score_strat_arr = [] # score_boot_arr = [] # for n_o in n_o_arr: for n_strata in n_strata_arr: score_strat = [] # score_boot = [] for i in range(10): X_train, y_train, X_test, y_test = simulation_setup(n_i=n_i, n_o=200, n_t=n_i, p=p) # stratified strat = StratifiedBootstrap() strat.fit(X_train, y_train, n_bootstrap=5, n_strata=n_strata, fast=True) score_strat.append(strat.score(X_test, y_test)) # Bootstrap boot = ClassicalBootstrap() boot.fit(X_train, y_train, n_bootstrap=5) # score_boot.append(boot.score(X_test, y_test))
def predict(self, X_test):
    """Return the model's predicted class-1 probabilities for X_test.

    :param X_test: ndarray, shape (n_samples, n_features)
    :return: ndarray of probabilities from the fitted coefficient vector
    :raises ValueError: if fit() has not been called yet (self.beta is None)
    """
    if self.beta is None:
        raise ValueError("MLE Model is not fitted yet")
    return self.probability(self.beta, X_test)

def accuracy(self, X_test, y_test, prob_threshold=0.5):
    """Return the fraction of test samples classified correctly.

    :param X_test: ndarray, shape (n_samples, n_features)
    :param y_test: array-like of {0, 1} labels
    :param prob_threshold: float, decision threshold on the predicted probability
    :return: float accuracy in [0, 1]
    """
    labels = np.array(y_test, dtype=int)
    # threshold the probabilities into hard 0/1 predictions
    predictions = np.where(self.predict(X_test) >= prob_threshold, 1, 0)
    return np.mean(predictions == labels)

if __name__ == '__main__':
    # Fit the robust MLE and a reference sklearn LogisticRegression on the same
    # clean synthetic data (no outliers) and compare accuracy and coefficients.
    data_train, data_test, beta_actual = simulation_setup(n_i=1000, n_o=0, n_t=1000, p=20)
    X_train, X_test = data_train[:, :-1], data_test[:, :-1]
    y_train, y_test = data_train[:, -1].astype(int), data_test[:, -1].astype(int)

    mle = MLE()
    mle.fit(X_train, y_train)
    print('accuracy:', mle.accuracy(X_test, y_test))
    print('fitted coefficients: \n', mle.beta)
    print('actual coefficients: \n', beta_actual)

    # sklearn baseline without intercept, to match the MLE parameterization
    lr = LogisticRegression(fit_intercept=False, solver='lbfgs')
    lr.fit(X_train, y_train)
    print('LR accuracy: ', lr.score(X_test, y_test))
    print('LR coefficients: \n', lr.coef_[0])
    print('RMSE: ', np.sqrt(np.mean((mle.beta - lr.coef_[0])**2)))
# Compare IFB test accuracy against plain LogisticRegression and the classical
# bootstrap while sweeping IFB's quantile_factor.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# NOTE(review): hard-coded GUI backend — must be selected before any figure is
# created and makes this script macOS-only.
matplotlib.use('MacOSX')
from rblr.influence_function_bootstrap import IFB
from rblr.classical_bootstrap import ClassicalBootstrap
from sklearn.linear_model import LogisticRegression
from rblr.simulation_setup import simulation_setup

matplotlib.rcParams['font.size'] = 10

q = np.linspace(0.1, 1, 20)  # quantile factors to sweep
b = 10                       # bootstrap replications per fit

X_train, y_train, X_test, y_test = simulation_setup(n_i=1000, n_o=200, p=8)

# baseline 1: plain logistic regression
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
score_lr = lr.score(X_test, y_test)

# baseline 2: classical bootstrap
class_boot = ClassicalBootstrap()
class_boot.fit(X_train, y_train, n_bootstrap=b)
score_class_boot = class_boot.score(X_test, y_test)

# IFB accuracy as a function of the quantile factor
score_ifb = []
for q_ in q:
    ifb = IFB(c=None, gamma=5)
    ifb.fit(X_train, y_train, n_bootstrap=b, quantile_factor=q_)
    score_ifb.append(ifb.score(X_test, y_test))

# NOTE(review): figure construction appears to continue past this chunk
# (legend/labels/show not visible here).
f1 = plt.figure(1, figsize=(7, 4.8))
plt.plot(q, score_ifb, label='IFB')
# NOTE(review): this chunk uses `np` but no numpy import is visible here —
# presumably imported above this chunk; confirm against the full file.
from rblr.simulation_setup import simulation_setup
from rblr.influence_function_bootstrap import IFB
from sklearn.metrics import precision_recall_fscore_support
from rblr.preprocessing import Preprocessor
import matplotlib.pyplot as plt

# experiment configuration: inliers, outliers, test size, dimensionality, noise
n_i = 1000
n_o = 200
n_t = 1000
p = 20
sigma_e = 0.25
quantile_factor = np.arange(0.5, 1.0, 0.05)  # IFB quantile factors to sweep

X_train, y_train, X_test, y_test = simulation_setup(n_i=n_i, n_o=n_o, n_t=n_t, p=p, sigma_e=sigma_e)

# NOTE(review): the commented-out format string below has 3 placeholders but is
# given 5 arguments — fix before re-enabling.
# print("Simulation setup: inliers: {}; outliers: {}; dimensions: {}".format(n_i, n_o, n_t, p, sigma_e))

# def get_metrics_matrix(quantile_factor):
#     metrics_matrix = np.empty((len(quantile_factor), 3, 4))
#     for i, q in enumerate(quantile_factor):
#         unit_matrix = np.zeros((3, 4))
#         clf_ifb = IFB()
#         clf_ifb.fit(X_train, y_train, quantile_factor=q)
#         beta_ifb = clf_ifb.beta
#
#         clf_lr = LogisticRegression(fit_intercept=False, solver='lbfgs')
#         clf_lr.fit(X_train, y_train)
#
# NOTE(review): `simulation_setup` and `MEstimator` are used below but their
# imports are not visible in this chunk — presumably above this view; the
# script also appears to continue past the last visible line.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
# GUI backend must be selected before any figure is created; Qt-only
matplotlib.use('Qt5Agg')
plt.style.use('ggplot')

# synthetic data: 10000 inliers, 1000 far outliers (sigma_o=100), 20 features
n_i = 10000
n_o = 1000
p = 20
data_train, data_test, beta = simulation_setup(n_i=n_i, n_o=n_o, p=p, sigma_o=100, sigma_e=0.1)

df_train = pd.DataFrame(data=data_train)
df_test = pd.DataFrame(data=data_test)
# df_train = pd.read_csv('data_train.csv')

# column index p is the label column; cast it to int labels
df_train[p] = df_train[p].astype(int)

# store the numpy array representation of the dataframe
X = df_train.drop([p], axis=1).values

me = MEstimator()
# estimated location of inliers
loc_estimated = me.loc_estimator(X)
scale_estimated = me.scale_estimator(X)
# NOTE(review): this chunk opens mid-way through a `transform`-style method —
# its `def` line and the computation of `inlier_flag` / `X_in` lie above this
# view; indentation below is reconstructed.
        # samples flagged as outliers
        X_out = X[~inlier_flag]
        if y is None:
            if return_outliers:
                return X_in, X_out
            else:
                return X_in
        else:
            # split the labels with the same inlier mask
            y_in, y_out = y[inlier_flag], y[~inlier_flag]
            if return_outliers:
                return X_in, y_in, X_out, y_out
            else:
                return X_in, y_in

    def fit_transform(self, X, y=None, return_outliers=False, n_inliers=None):
        """Fit the preprocessor on (X, y) and immediately transform the data.

        :param X: ndarray, shape (n_samples, n_features)
        :param y: ndarray of labels, optional
        :param return_outliers: bool, also return the rejected samples
        :param n_inliers: forwarded to transform() — see there for semantics
        :return: same as transform(): filtered X (plus y and/or outliers when requested)
        """
        self.fit(X, y)
        return self.transform(X, y, return_outliers=return_outliers, n_inliers=n_inliers)


if __name__ == '__main__':
    # Smoke test with heavy contamination: 800 outliers vs 1000 inliers.
    data = simulation_setup(n_i=1000, n_o=800, p=8)[0]
    X, y = data[:, :-1], data[:, -1]
    preprocessor = Preprocessor()
    # X_in, y_in, X_out, y_out = preprocessor.fit_transform(X, y, return_outliers=True)
    # # print('number of X_in: %d, y_in: %d' % (len(X_in), len(y_in)))
    # # print('number of X_out: %d, y_out: %d' % (len(X_out), len(y_out)))
    X_in = preprocessor.fit_transform(X)
    print(X_in.shape)