def simu_autocutoff(n_simu, n_samples, n_features, n_cut_points):
    """Run one auto-cutoff (multiple testing) simulation and time it.

    Returns the list ``[n_samples, cut_points, S, cut_points_estimates,
    p_values_min, n_tested, p_values_corr, elapsed_time]``.
    """
    # the simulation index doubles as the RNG seed for reproducibility
    simu = SimuCoxRegWithCutPoints(n_samples=n_samples,
                                   n_features=n_features,
                                   n_cut_points=n_cut_points,
                                   seed=n_simu, verbose=False,
                                   shape=2, scale=.1,
                                   cov_corr=.5, sparsity=.2)
    X, Y, delta, cut_points, beta_star, S = simu.simulate()

    # binarize only to recover the per-feature candidate boundaries
    binarizer = FeaturesBinarizer(n_cuts=50)
    binarizer.fit_transform(X)
    boundaries = binarizer.boundaries

    epsilon = 10
    tic = time()
    multiple_testing_rslt = multiple_testing(X, boundaries, Y, delta,
                                             epsilon=epsilon)
    # Lausen & Schumacher correction
    p_values_corr, p_values_min, cut_points_estimates = [], [], []
    n_tested = []
    for j in range(n_features):
        p_values_j = multiple_testing_rslt[j]
        n_tested.append(p_values_j.values_to_test.shape[0])
        p_values_min.append(p_values_j.p_values.min())
        p_values_corr.append(
            p_value_cut(p_values_j.p_values, p_values_j.values_to_test,
                        X[:, j], epsilon))
        best_idx = p_values_j.p_values.argmin()
        cut_points_estimates.append(p_values_j.values_to_test[best_idx])
    tac = time()

    print(p_values_min)
    return [
        n_samples, cut_points, S, cut_points_estimates, p_values_min,
        n_tested, p_values_corr, tac - tic
    ]
def test_binarizer_fit(self):
    """...Test binarizer fit
    """
    enc = OneHotEncoder(sparse=True)
    expected_binarization = enc.fit_transform(
        self.default_expected_intervals)
    expected_dense = expected_binarization.toarray()

    binarizer = FeaturesBinarizer(method='quantile', n_cuts=3,
                                  detect_column_type="auto",
                                  remove_first=False)

    # fit/transform must behave identically on a pandas DataFrame
    # and on a plain numpy array
    for data in (self.df_features, self.features):
        binarizer.fit(data)
        binarized = binarizer.transform(data)
        self.assertEqual(binarized.__class__, csr.csr_matrix)
        np.testing.assert_array_equal(expected_dense, binarized.toarray())

    # fit_transform is equivalent to fit followed by transform
    binarized = binarizer.fit_transform(self.features)
    self.assertEqual(binarized.__class__, csr.csr_matrix)
    np.testing.assert_array_equal(expected_dense, binarized.toarray())
def get_times2(n_simu, n_samples, n_features, n_cut_points):
    """Time the Binacox fit (binarization + penalized Cox) for one run.

    Relies on module-level ``cov_corr`` and ``sparsity``.
    """
    print(" n_simu=%s" % n_simu)
    # the simulation index doubles as the RNG seed
    simu = SimuCoxRegWithCutPoints(n_samples=n_samples,
                                   n_features=n_features,
                                   seed=n_simu, verbose=False,
                                   n_cut_points=n_cut_points,
                                   shape=2, scale=.1,
                                   cov_corr=cov_corr, sparsity=sparsity)
    X, Y, delta, cut_points, beta_star, S = simu.simulate()

    # Binacox method: binarize then fit a Cox model with binarsity penalty
    tic = time()
    binarizer = FeaturesBinarizer(n_cuts=50)
    X_bin = binarizer.fit_transform(X)
    learner = CoxRegression(penalty='binarsity', tol=1e-5,
                            solver='agd', verbose=False,
                            max_iter=100, step=0.3,
                            blocks_start=binarizer.blocks_start,
                            blocks_length=binarizer.blocks_length,
                            C=25, warm_start=True)
    learner._solver_obj.linesearch = False
    learner.fit(X_bin, Y, delta)
    tac = time()
    return tac - tic
def test_LogisticRegression_fit(self):
    """...Test LogisticRegression fit with different solvers and penalties
    """
    sto_seed = 179312
    raw_features, y = Test.get_train_data()
    for fit_intercept in [True, False]:
        for penalty in penalties:
            if penalty == 'binarsity':
                # binarize features first
                binarizer = FeaturesBinarizer(n_cuts=3)
                features = binarizer.fit_transform(raw_features)
            else:
                features = raw_features
            for solver in solvers:
                if solver == 'bfgs' and penalty not in ['none', 'l2']:
                    # BFGS only accepts ProxZero and ProxL2sq for now
                    continue
                solver_kwargs = {
                    'penalty': penalty,
                    'tol': 1e-5,
                    'solver': solver,
                    'verbose': False,
                    'max_iter': 10,
                    'fit_intercept': fit_intercept
                }
                if penalty != 'none':
                    solver_kwargs['C'] = 100
                if penalty == 'binarsity':
                    solver_kwargs['blocks_start'] = binarizer.blocks_start
                    solver_kwargs['blocks_length'] = binarizer.blocks_length
                if solver == 'sdca':
                    solver_kwargs['sdca_ridge_strength'] = 2e-2
                if solver in ['sgd', 'svrg', 'sdca']:
                    solver_kwargs['random_state'] = sto_seed
                if solver == 'sgd':
                    solver_kwargs['step'] = 1.
                learner = LogisticRegression(**solver_kwargs)
                learner.fit(features, y)
                probas = learner.predict_proba(features)[:, 1]
                auc = roc_auc_score(y, probas)
                self.assertGreater(
                    auc, 0.7, "solver %s with penalty %s and "
                    "intercept %s reached too low AUC" %
                    (solver, penalty, fit_intercept))
def get_times1(n_simu, n_samples, n_features, n_cut_points):
    """Time the Binacox fit and the auto-cutoff method for one simulation.

    Returns ``(n_samples, time_bina, time_ac_all, time_ac_grid)`` where
    ``time_ac_all`` times the p-value scan over all admissible data points
    and ``time_ac_grid`` the scan restricted to the binarization grid.

    Relies on module-level ``cov_corr`` and ``sparsity``.
    """
    print(" n_simu=%s" % n_simu)
    seed = n_simu
    simu = SimuCoxRegWithCutPoints(n_samples=n_samples,
                                   n_features=n_features, seed=seed,
                                   verbose=False,
                                   n_cut_points=n_cut_points,
                                   shape=2, scale=.1,
                                   cov_corr=cov_corr, sparsity=sparsity)
    X, Y, delta, cut_points, beta_star, S = simu.simulate()

    # Binacox method
    # BUG FIX: `tic` was never assigned before `time_bina = tac - tic`,
    # raising a NameError; start the timer here (as get_times2 does).
    tic = time()
    n_cuts = 50
    binarizer = FeaturesBinarizer(n_cuts=n_cuts)
    X_bin = binarizer.fit_transform(X)
    blocks_start = binarizer.blocks_start
    blocks_length = binarizer.blocks_length
    boundaries = binarizer.boundaries['0']
    solver = 'agd'
    learner = CoxRegression(penalty='binarsity', tol=1e-5,
                            solver=solver, verbose=False,
                            max_iter=100, step=0.3,
                            blocks_start=blocks_start,
                            blocks_length=blocks_length,
                            C=25, warm_start=True)
    learner._solver_obj.linesearch = False
    learner.fit(X_bin, Y, delta)
    tac = time()
    time_bina = tac - tic

    # Auto Cutoff Method: test every admissible value of X
    X = np.array(X)
    epsilon = 10
    p1 = np.percentile(X, epsilon)
    p2 = np.percentile(X, 100 - epsilon)
    values_to_test = X[np.where((X <= p2) & (X >= p1))]
    tic = time()
    get_p_values_j(X, 0, Y, delta, values_to_test, epsilon)
    tac = time()
    time_ac_all = tac - tic

    # Auto Cutoff Method restricted to the binarization grid
    tic = time()
    p1 = np.percentile(X, epsilon)
    p2 = np.percentile(X, 100 - epsilon)
    values_to_test = boundaries[
        np.where((boundaries <= p2) & (boundaries >= p1))]
    get_p_values_j(X, 0, Y, delta, values_to_test, epsilon)
    tac = time()
    time_ac_grid = tac - tic
    return n_samples, time_bina, time_ac_all, time_ac_grid
def test_CoxRegression_fit(self):
    """...Test CoxRegression fit with different solvers and penalties

    Fits every (penalty, solver) combination on shared train data and
    checks the learned coefficients against hard-coded reference values
    (to 1 decimal place).
    """
    raw_features, times, censoring = Test.get_train_data()
    # reference coefficients per penalty; the 'binarsity' entry is longer
    # because features are one-hot binarized before fitting
    coeffs_pen = {
        'none': np.array([
            -0.03068462, 0.03940001, 0.16758354, -0.24838003, 0.16940664,
            0.9650363, -0.14818724, -0.0802245, -1.52869811, 0.0414509
        ]),
        'l2': np.array([
            -0.02403681, 0.03455527, 0.13470436, -0.21654892, 0.16637723,
            0.83125941, -0.08555382, -0.12914753, -1.35294435, 0.02312935
        ]),
        'l1': np.array([
            0., 1.48439371e-02, 1.03806171e-01, -1.57313537e-01,
            1.40448847e-01, 8.05306416e-01, -5.41296030e-02,
            -1.07753576e-01, -1.37612207e+00, 6.43289248e-05
        ]),
        'elasticnet': np.array([
            0., 0.01011823, 0.10530518, -0.16885214, 0.14373715,
            0.82547312, -0.06122141, -0.09479487, -1.39454662, 0.00312597
        ]),
        'tv': np.array([
            0.03017556, 0.03714465, 0.0385349, -0.10169967, 0.15783755,
            0.64860815, -0.00617636, -0.22235137, -1.07938977, -0.07181225
        ]),
        'binarsity': np.array([
            0.03794176, -0.04473702, 0.00339763, 0.00339763, -0.16493989,
            0.05497996, 0.05497996, 0.05497996, -0.08457476, -0.08457476,
            0.0294825, 0.13966702, 0.10251257, 0.02550264, -0.07207419,
            -0.05594102, -0.10018038, -0.10018038, 0.10018038, 0.10018038,
            -0.47859686, -0.06685181, -0.00850803, 0.55395669, 0.00556327,
            -0.00185442, -0.00185442, -0.00185442, 0.26010429, 0.09752455,
            -0.17881442, -0.17881442, 0.932516, 0.32095387, -0.49766315,
            -0.75580671, 0.0593833, -0.01433773, 0.01077109, -0.05581666
        ])
    }
    for penalty in self.penalties:
        if penalty == 'binarsity':
            # binarize features
            n_cuts = 3
            binarizer = FeaturesBinarizer(n_cuts=n_cuts)
            features = binarizer.fit_transform(raw_features)
        else:
            features = raw_features
        for solver in self.solvers:
            solver_kwargs = {
                'penalty': penalty,
                'tol': 0,
                'solver': solver,
                'verbose': False,
                'max_iter': 10
            }
            if penalty != 'none':
                solver_kwargs['C'] = 50
            if penalty == 'binarsity':
                # block structure of the one-hot encoding, needed by the
                # binarsity proximal operator
                solver_kwargs['blocks_start'] = \
                    binarizer.feature_indices[:-1, ]
                solver_kwargs['blocks_length'] = binarizer.n_values
            learner = CoxRegression(**solver_kwargs)
            learner.fit(features, times, censoring)
            # loose tolerance: solvers only run 10 iterations
            np.testing.assert_array_almost_equal(coeffs_pen[penalty],
                                                 learner.coeffs,
                                                 decimal=1)
def _simulate(self):
    """Simulate survival data whose risk depends on binarized features.

    Draws random cut-points per feature, builds a piecewise-constant
    (binarized) risk, then simulates Weibull-like times and exponential
    censoring.

    Returns
    -------
    (features, times, censoring, cut_points, coeffs_binarized, S) where
    ``cut_points`` maps feature index (as str) to its boundary array
    (padded with +/-inf), ``coeffs_binarized`` is the concatenated
    per-bin coefficient vector, and ``S`` is the index set of features
    whose coefficient block is forced to zero (the "sparse" features).
    """
    # The features matrix already exists, and is created by the
    # super class
    features = self.features
    n_samples, n_features = features.shape
    # Simulation of cut-points
    n_cut_points = self.n_cut_points
    n_cut_points_factor = self.n_cut_points_factor
    sparsity = self.sparsity
    s = round(n_features * sparsity)
    # sparsity index set: features in S get an all-zero coefficient block
    S = np.random.choice(n_features, s, replace=False)
    if n_cut_points is None:
        # random number of cut-points per feature
        n_cut_points = np.random.geometric(n_cut_points_factor, n_features)
    else:
        # same fixed number of cut-points for every feature
        n_cut_points = np.repeat(n_cut_points, n_features)
    cut_points = {}
    coeffs_binarized = np.array([])
    for j in range(n_features):
        feature_j = features[:, j]
        # candidate cut-points are the 10%..90% quantiles of the feature
        quantile_cuts = np.linspace(10, 90, 10)
        candidates = np.percentile(feature_j, quantile_cuts,
                                   interpolation="nearest")
        cut_points_j = np.random.choice(candidates, n_cut_points[j],
                                        replace=False)
        cut_points_j = np.sort(cut_points_j)
        # pad with +/-inf so the bins cover the whole real line
        cut_points_j = np.insert(cut_points_j, 0, -np.inf)
        cut_points_j = np.append(cut_points_j, np.inf)
        cut_points[str(j)] = cut_points_j
        # generate beta star (one coefficient per bin)
        if j in S:
            coeffs_block = np.zeros(n_cut_points[j] + 1)
        else:
            coeffs_block = np.random.normal(1, .5, n_cut_points[j] + 1)
        # make sure 2 consecutive coeffs are different enough
        # (alternate signs; a no-op for the all-zero blocks)
        coeffs_block = np.abs(coeffs_block)
        coeffs_block[::2] *= -1
        # sum-to-zero constraint in each block (identifiability)
        coeffs_block = coeffs_block - coeffs_block.mean()
        coeffs_binarized = np.append(coeffs_binarized, coeffs_block)
    # binarize the features with the drawn cut-points and build the
    # linear predictor u = X_bin . beta
    binarizer = FeaturesBinarizer(method='given',
                                  bins_boundaries=cut_points)
    binarized_features = binarizer.fit_transform(features)
    u = binarized_features.dot(coeffs_binarized)
    # Simulation of true times: exponential scaled by exp(-u), then a
    # Weibull transform with the instance's shape/scale
    E = np.random.exponential(scale=1., size=n_samples)
    E *= np.exp(-u)
    scale = self.scale
    shape = self.shape
    if self.times_distribution == "weibull":
        T = 1. / scale * E**(1. / shape)
    else:
        # There is not point in this test, but let's do it like that
        # since we're likely to implement other distributions
        T = 1. / scale * E**(1. / shape)
    m = T.mean()
    # Simulation of the censoring, with mean proportional to the mean
    # event time via censoring_factor
    c = self.censoring_factor
    C = np.random.exponential(scale=c * m, size=n_samples)
    # Observed time
    self._set("times", np.minimum(T, C).astype(self.dtype))
    # Censoring indicator: 1 if it is a time of failure, 0 if censoring.
    censoring = (T <= C).astype(np.ushort)
    self._set("censoring", censoring)
    return self.features, self.times, self.censoring, cut_points, \
        coeffs_binarized, S
def fit_and_score(features, features_bin, times, censoring, blocks_start,
                  blocks_length, boundaries, features_names, idx_train,
                  idx_test, validation_data, C):
    """Fit a binarsity-penalized Cox model, extract cut-points, re-fit
    and score.

    First fits a Cox regression with the binarsity penalty (strength
    ``C``) on the train split, reads the estimated cut-points off the
    piecewise-constant coefficient blocks, then re-binarizes the raw
    features at those cut-points and re-fits an (effectively
    unpenalized, C=1e10) Cox model whose test score is returned.

    Returns
    -------
    (score, score_validation) — ``score_validation`` is None when no
    ``validation_data`` (tuple of X, Y, delta) is given.
    """
    if features_names is None:
        features_names = [str(j) for j in range(features.shape[1])]
    X_train, X_test = features_bin[idx_train], features_bin[idx_test]
    Y_train, Y_test = times[idx_train], times[idx_test]
    delta_train, delta_test = censoring[idx_train], censoring[idx_test]
    # first fit: penalized model used only for cut-point detection
    learner = CoxRegression(penalty='binarsity', tol=1e-5,
                            verbose=False, max_iter=100, step=0.3,
                            blocks_start=blocks_start,
                            blocks_length=blocks_length,
                            warm_start=True)
    learner._solver_obj.linesearch = False
    learner.C = C
    learner.fit(X_train, Y_train, delta_train)
    coeffs = learner.coeffs
    cut_points_estimates = {}
    for j, start in enumerate(blocks_start):
        coeffs_j = coeffs[start:start + blocks_length[j]]
        all_zeros = not np.any(coeffs_j)
        if all_zeros:
            # fully-penalized block: no cut-point detected
            cut_points_estimate_j = np.array([-np.inf, np.inf])
        else:
            # jumps between constant groups of coefficients mark the
            # estimated cut-points
            groups_j = get_groups(coeffs_j)
            jump_j = np.where(groups_j[1:] - groups_j[:-1] != 0)[0] + 1
            if jump_j.size == 0:
                cut_points_estimate_j = np.array([-np.inf, np.inf])
            else:
                cut_points_estimate_j = boundaries[features_names[j]][jump_j]
                # pad with +/-inf so the bins cover the whole real line
                if cut_points_estimate_j[0] != -np.inf:
                    cut_points_estimate_j = np.insert(cut_points_estimate_j,
                                                      0, -np.inf)
                if cut_points_estimate_j[-1] != np.inf:
                    cut_points_estimate_j = np.append(cut_points_estimate_j,
                                                      np.inf)
        cut_points_estimates[features_names[j]] = cut_points_estimate_j
    # re-binarize the raw features at the estimated cut-points
    binarizer = FeaturesBinarizer(method='given',
                                  bins_boundaries=cut_points_estimates)
    binarized_features = binarizer.fit_transform(features)
    blocks_start = binarizer.blocks_start
    blocks_length = binarizer.blocks_length
    X_bin_train = binarized_features[idx_train]
    X_bin_test = binarized_features[idx_test]
    # second fit: C=1e10 makes the penalty effectively inactive
    learner_ = CoxRegression(penalty='binarsity', tol=1e-5,
                             verbose=False, max_iter=100, step=0.3,
                             blocks_start=blocks_start,
                             blocks_length=blocks_length,
                             warm_start=True, C=1e10)
    learner_._solver_obj.linesearch = False
    learner_.fit(X_bin_train, Y_train, delta_train)
    score = learner_.score(X_bin_test, Y_test, delta_test)
    if validation_data is not None:
        X_validation = validation_data[0]
        # NOTE(review): fit_transform on validation data — with
        # method='given' the boundaries are fixed, so this presumably
        # only re-applies them; confirm it does not leak validation info
        X_bin_validation = binarizer.fit_transform(X_validation)
        Y_validation = validation_data[1]
        delta_validation = validation_data[2]
        score_validation = learner_.score(X_bin_validation, Y_validation,
                                          delta_validation)
    else:
        score_validation = None
    return score, score_validation
def binarsity_reg(X, y, grid_C=np.logspace(-2, 2, 10), C=None, verbose=True):
    """Linear regression on binarized features with the binarsity penalty.

    Parameters
    ----------
    X : pandas DataFrame, dtype float, columns named
        ``str(j) + ":continuous"``; the continuous features.
    y : numpy.ndarray, dtype float — the labels.
    grid_C : list or numpy.ndarray of positive floats — candidate
        binarsity penalty strengths; the final one is chosen by
        cross-validation when ``C`` is None.
    C : float (positive) or None — penalty strength; when None it is
        selected from ``grid_C`` by cross-validation.
    verbose : bool — if True, prints additional info.

    Returns
    -------
    (cut_points_estimates, final_coeffs, blocks_start, all_groups,
     coeffs, regr.C) where ``cut_points_estimates`` maps each feature
    name to its estimated cut-point array (padded with +/-inf) and
    ``final_coeffs`` holds the intercept and weights of the final,
    effectively unpenalized re-fit.
    """
    # FIX: time.clock() was removed in Python 3.8; perf_counter is the
    # documented replacement for elapsed-time measurement
    t0 = time.perf_counter()
    if verbose:
        print("binarsity regression number of observations :", len(X))
    n_cuts = 50
    binarizer = FeaturesBinarizer(n_cuts=n_cuts,
                                  detect_column_type="column_names")
    X_bin = binarizer.fit_transform(X)
    features_names = list(X.columns)
    boundaries = binarizer.bins_boundaries
    blocks_start = binarizer.blocks_start
    blocks_length = binarizer.blocks_length
    n_folds = 5
    if C is None:
        # select C by cross-validation over grid_C
        scores_cv = pd.DataFrame(columns=['C', 'scores_mean', 'scores_std'])
        for i, C_i in enumerate(grid_C):
            scores = compute_score(X, X_bin, y, blocks_start, blocks_length,
                                   C=C_i, n_folds=n_folds)
            scores_cv.loc[i] = [C_i] + scores
        if verbose:
            print("cross_val scores :")
            print(scores_cv.round(3))
        idx_min = scores_cv.scores_mean.argmin()
        C_best = grid_C[idx_min]
        # one-standard-error rule: smallest C whose mean score is within
        # one std of the best mean score
        idx_chosen = min([
            i for i, j in enumerate(
                list(scores_cv.scores_mean <= scores_cv.scores_mean.min() +
                     scores_cv.scores_std[idx_min])) if j
        ])
        C_chosen = grid_C[idx_chosen]
        if verbose:
            print("C_best :", "%.4g" % C_best)
            print("C_chosen :", "%.4g" % C_chosen)
    regr = linear_model.LinearRegression(penalty='binarsity',
                                         blocks_start=blocks_start,
                                         blocks_length=blocks_length,
                                         warm_start=True)
    regr.C = C_chosen if C is None else C
    if verbose:
        print("regr.C :", "%.4g" % regr.C)
    regr.fit(X_bin, y)
    coeffs = regr.weights
    # computations of the cut-points: jumps between constant groups of
    # within-block coefficients mark the estimated cut-points
    all_groups = list()
    cut_points_estimates = {}
    for j, start in enumerate(blocks_start):
        coeffs_j = coeffs[start:start + blocks_length[j]]
        all_zeros = not np.any(coeffs_j)
        if all_zeros:
            # fully-penalized block: single group, no cut-point detected
            cut_points_estimate_j = np.array([-np.inf, np.inf])
            groups_j = np.array(blocks_length[j] * [0])
        else:
            groups_j = get_groups(coeffs_j)
            jump_j = np.where(groups_j[1:] - groups_j[:-1] != 0)[0] + 1
            if jump_j.size == 0:
                cut_points_estimate_j = np.array([-np.inf, np.inf])
            else:
                cut_points_estimate_j = boundaries[features_names[j]][jump_j]
                # pad with +/-inf so the bins cover the whole real line
                if cut_points_estimate_j[0] != -np.inf:
                    cut_points_estimate_j = np.insert(cut_points_estimate_j,
                                                      0, -np.inf)
                if cut_points_estimate_j[-1] != np.inf:
                    cut_points_estimate_j = np.append(cut_points_estimate_j,
                                                      np.inf)
        cut_points_estimates[features_names[j]] = cut_points_estimate_j
        if j > 0:
            # shift labels so groups stay unique across feature blocks
            groups_j += max(all_groups) + 1
        all_groups += list(groups_j)
    if verbose:
        print("cutpoints :")
        for j in range(len(cut_points_estimates)):
            print(features_names[j], [
                "%.4f" % cut_points_estimates[features_names[j]][i]
                for i in range(len(cut_points_estimates[features_names[j]]))
            ])
    # creation of final binarized X data for the computed cutpoints
    binarizer2 = FeaturesBinarizer(method='given',
                                   bins_boundaries=cut_points_estimates)
    X_bin2 = binarizer2.fit_transform(X)
    X_bin2 = np.array(X_bin2.todense())
    blocks_start2 = binarizer2.blocks_start
    blocks_length2 = binarizer2.blocks_length
    X_bin2_train, X_bin2_test, y_train, y_test = train_test_split(
        X_bin2, y, test_size=0.2)
    # final re-fit: C=1e10 makes the penalty effectively inactive
    regr3 = linear_model.LinearRegression(penalty='binarsity',
                                          blocks_start=blocks_start2,
                                          blocks_length=blocks_length2,
                                          warm_start=True)
    regr3.C = 1e10
    regr3.fit(X_bin2_train, y_train)
    if verbose:
        print(
            "R² score of final predictor on train data (80% of total data) :",
            "%.4g" % regr3.score(X_bin2_train, y_train))
        print("R² score of final predictor on test data (20% of total data) :",
              "%.4g" % regr3.score(X_bin2_test, y_test))
    final_coeffs = {"intercept": regr3.intercept, "weights": regr3.weights}
    t1 = time.perf_counter()
    if verbose:
        print("time elapsed for binarsity regression step:",
              "%.4g" % (t1 - t0), "s")
    return cut_points_estimates, final_coeffs, blocks_start, all_groups, \
        coeffs, regr.C
sparsity = .2 simu = SimuCoxRegWithCutPoints(n_samples=n_samples, n_features=n_features, n_cut_points=n_cut_points, seed=seed, verbose=False, shape=2, scale=.1, cov_corr=cov_corr, sparsity=sparsity) X, Y, delta, cut_points, beta_star, S = simu.simulate() # binarize data n_cuts = 50 binarizer = FeaturesBinarizer(n_cuts=n_cuts) X_bin = binarizer.fit_transform(X) blocks_start = binarizer.blocks_start blocks_length = binarizer.blocks_length boundaries = binarizer.boundaries tic = time() solver = 'agd' learner = CoxRegression(penalty='binarsity', tol=1e-5, solver=solver, verbose=False, max_iter=100, step=0.3, blocks_start=blocks_start, blocks_length=blocks_length,