def test_use_sample_weights(self):
    """Fitting with sample weights must not lower the weighted micro-F1 score."""
    features, labels = self.multinomial[1]

    # Drop every class-0 row except the last three.
    zero_rows = np.where(labels == 0)[0]
    keep_mask = np.ones(len(labels), dtype=bool)
    keep_mask[zero_rows[:-3]] = False
    labels = labels[keep_mask]
    features = features[keep_mask, :]

    weights = class_weight.compute_sample_weight('balanced', labels)
    weights[0] = 0.

    def weighted_f1(model):
        # Both models are scored with the same sample weights.
        return f1_score(labels, model.predict(features),
                        sample_weight=weights, average='micro')

    plain_model = LogitNet(random_state=2, scoring='f1_micro').fit(features, labels)
    plain_score = weighted_f1(plain_model)

    weighted_model = LogitNet(random_state=2, scoring='f1_micro').fit(
        features, labels, sample_weight=weights)
    weighted_score = weighted_f1(weighted_model)

    self.assertTrue(weighted_score >= plain_score)
def test_one_row_predict(self):
    """Predicting on a single row must produce exactly one output row."""
    model = LogitNet(random_state=42)
    datasets = itertools.chain(self.binomial, self.multinomial)
    for features, labels in datasets:
        model.fit(features, labels)
        single_row = features[0].reshape((1, -1))
        prediction = model.predict(single_row)
        assert prediction.shape == (1,)
def test_random_state_cv(self):
    """The fitted model's ``_cv`` object must carry the constructor's random_state."""
    random_state = 133
    m = LogitNet(random_state=random_state)
    x, y = self.binomial[0]
    m.fit(x, y)
    # The former debug print of dir(m._cv) was removed; it only added noise
    # to the test output and asserted nothing.
    assert m._cv.random_state == random_state
def test_one_row_predict(self):
    """A one-row input to ``predict`` must give a result of shape ``(1,)``."""
    estimator = LogitNet(random_state=42)
    for data, target in itertools.chain(self.binomial, self.multinomial):
        estimator.fit(data, target)
        first_row = data[0].reshape((1, -1))
        out = estimator.predict(first_row)
        assert out.shape == (1,)
def train_glmnet(train, test, save_path_pred, save_path_model, save_path_json, n_cores=5):
    """Fit a LogitNet model on `train`, score it on `test` and save the results.

    Parameters
    ----------
    train, test
        Indexable pairs; element 0 is the feature matrix (converted to CSC
        sparse format here) and element 1 holds the labels.
    save_path_pred
        Destination of the CSV with columns ``y_true`` and ``y_pred``.
    save_path_model
        Destination of the pickled fitted model.
    save_path_json
        Destination of the JSON with the ``auprc`` and ``auc`` metrics.
    n_cores : int
        Passed to ``LogitNet`` as ``n_jobs``.
    """
    ln = LogitNet(alpha=0.5, n_splits=10, n_jobs=n_cores)

    # Only the feature matrices are used in sparse form; the labels are
    # passed to fit/metrics as given, so no sparse copy of them is built.
    train_x = csc_matrix(train[0])
    test_x = csc_matrix(test[0])

    print("train the model")
    ln.fit(train_x, train[1])

    print("get predictions")
    # Column 1 of predict_proba is used as the score for the metrics below.
    y_pred = ln.predict_proba(test_x)[:, 1]
    auprc = cem.auprc(test[1], y_pred)
    auc = cem.auc(test[1], y_pred)

    print("save csv")
    dt = pd.DataFrame({"y_true": test[1], "y_pred": y_pred})
    dt.to_csv(save_path_pred)

    print("save json")
    write_json({"auprc": auprc, "auc": auc}, save_path_json)

    print("save model")
    # Context manager guarantees the file is flushed and closed.
    with open(save_path_model, "wb") as model_file:
        pickle.dump(ln, model_file)
def worker(
    boot_inds,
    X,
    y,
    X_noise=0.01,
    alpha=0.9,
    lambda_path=np.geomspace(1e-3, 1e-06, num=100),
):
    """Fit a LogitNet path on one bootstrap sample and report which coefficients are nonzero.

    The rows selected by `boot_inds` are perturbed with Gaussian noise and
    rescaled twice before fitting without an intercept.

    Returns
    -------
    dict
        ``"beta"``: boolean array, True where a path coefficient is nonzero;
        ``"lambda_path"``: the fitted model's ``lambda_path_``.
    """
    rows = X[boot_inds, :]
    targets = y[boot_inds]

    # Draw order matters for reproducibility under a fixed seed: the tiny
    # jitter is sampled first, the main noise second.
    tiny_jitter = np.random.normal(scale=X_noise * 1e-6, size=rows.shape)
    pre_scaled = scale(rows + tiny_jitter)
    main_noise = np.random.normal(scale=X_noise, size=rows.shape)
    design = scale(pre_scaled + main_noise)

    model = LogitNet(
        alpha=alpha,
        lambda_path=lambda_path,
        fit_intercept=False,
    )
    model.fit(design, targets)

    fitted_lambdas = model.lambda_path_
    path_coefs = model.coef_path_.squeeze()
    return {
        "beta": path_coefs != 0,
        "lambda_path": fitted_lambdas,
    }
def test_one_row_predict_proba(self):
    """``predict_proba`` on one row must return a 2D array of shape (1, n_classes)."""
    model = LogitNet(random_state=42)
    for features, labels in itertools.chain(self.binomial, self.multinomial):
        model.fit(features, labels)
        first_row = features[0].reshape((1, -1))
        proba = model.predict_proba(first_row)
        n_classes = len(np.unique(labels))
        assert proba.shape == (1, n_classes)
def update(self, batch, batch_index):
    """Updates the inference state with a new batch and performs LFIRE.

    Fits a LogitNet classifier separating the batch's summary values from
    ``self.marginal``, evaluates the linear model at ``self.observed`` and
    stores the resulting posterior value and model parameters in
    ``self.state`` under `batch_index`.

    Parameters
    ----------
    batch: dict
        Looked up by each name in ``self.summary_names`` and
        ``self.parameter_names``.
    batch_index: int
        Index used for ``self.params_grid`` and for the state arrays.

    """
    super(LFIRE, self).update(batch, batch_index)

    # Parse likelihood values: one column per summary name.
    likelihood = np.column_stack(
        [batch[summary_name] for summary_name in self.summary_names])

    # Create training data: likelihood rows are labelled +1, marginal rows -1.
    X = np.vstack((likelihood, self.marginal))
    y = np.concatenate((np.ones(likelihood.shape[0]),
                        -1 * np.ones(self.marginal.shape[0])))

    # Logistic regression
    m = LogitNet(**self.logreg_config)
    m.fit(X, y)

    # Likelihood value: exponential of the linear predictor at the observation.
    log_likelihood_value = m.intercept_ + np.sum(
        np.multiply(m.coef_, self.observed))
    likelihood_value = np.exp(log_likelihood_value)

    # Joint prior value
    parameter_values = [
        batch[parameter_name] for parameter_name in self.parameter_names
    ]
    joint_prior_value = self.joint_prior.pdf(parameter_values)

    # Posterior value
    posterior_value = joint_prior_value * likelihood_value

    # Replace an infinite posterior value with zero and record the parameters.
    # NOTE(review): np.isinf does not catch NaN; confirm whether NaN should
    # be handled here as well.
    if np.isinf(posterior_value):
        params = self.params_grid[batch_index]
        # Adjacent f-strings keep the message on one line; the previous
        # backslash continuation leaked source indentation into the text.
        warnings.warn(
            f'Posterior value is not finite for parameters '
            f'{self.parameter_names} = {params} and thus will be replaced with zero!',
            RuntimeWarning)
        posterior_value = 0
        for i, parameter_name in enumerate(self.parameter_names):
            self.state['infinity'][parameter_name] += [params[i]]

    # Update state dictionary
    self.state['posterior'][batch_index] = posterior_value
    self.state['lambda'][batch_index] = m.lambda_best_
    self.state['coef'][batch_index, :] = m.coef_
    self.state['intercept'][batch_index] = m.intercept_
    for parameter_name in self.parameter_names:
        self.state[parameter_name][batch_index] = batch[parameter_name]
def test_one_row_predict_proba_with_lambda(self):
    """``predict_proba`` on one row with several lambdas must return 3D output."""
    lambdas = [0.01, 0.02, 0.04, 0.1]
    model = LogitNet(random_state=42)
    for features, labels in itertools.chain(self.binomial, self.multinomial):
        model.fit(features, labels)
        first_row = features[0].reshape((1, -1))
        proba = model.predict_proba(first_row, lamb=lambdas)
        expected_shape = (1, len(np.unique(labels)), len(lambdas))
        assert proba.shape == expected_shape
def test_single_class_exception(self):
    """Fitting on labels that are all ones must raise ValueError with the expected message."""
    features, labels = self.binomial[0]
    single_class = np.ones_like(labels)
    model = LogitNet()

    with self.assertRaises(ValueError) as caught:
        model.fit(features, single_class)

    message = str(caught.exception)
    self.assertEqual("Training data need to contain at least 2 classes.", message)
def test_cv_scoring_multinomial(self):
    """Scoring methods outside ``self.multinomial_scoring`` must raise ValueError on fit."""
    features, labels = self.multinomial[0]
    for method in self.scoring:
        model = LogitNet(scoring=method, random_state=488881)

        if method not in self.multinomial_scoring:
            with self.assertRaises(ValueError):
                model.fit(features, labels)
            continue

        model = model.fit(features, labels)
        # 0.65 is handed to the helper as its third argument
        # (presumably the minimum acceptable score).
        check_accuracy(labels, model.predict(features), 0.65, scoring=method)
def test_n_splits(self):
    """``n_splits`` strictly between 0 and 3 must be rejected; other values must fit."""
    x, y = self.binomial[0]
    for n in self.n_splits:
        m = LogitNet(n_splits=n, random_state=46657)
        if 0 < n < 3:
            # assertRaisesRegex replaces the deprecated assertRaisesRegexp
            # alias, which was removed in Python 3.12.
            with self.assertRaisesRegex(ValueError,
                                        "n_splits must be at least 3"):
                m = m.fit(x, y)
        else:
            m = m.fit(x, y)
            sanity_check_logistic(m, x)
def test_n_folds(self):
    """``n_folds`` strictly between 0 and 3 must be rejected; other values must fit."""
    x, y = self.binomial[0]
    for n in self.n_folds:
        m = LogitNet(n_folds=n, random_state=46657)
        if 0 < n < 3:
            # assertRaisesRegex replaces the deprecated assertRaisesRegexp
            # alias, which was removed in Python 3.12.
            with self.assertRaisesRegex(ValueError,
                                        "n_folds must be at least 3"):
                m = m.fit(x, y)
        else:
            m = m.fit(x, y)
            sanity_check_logistic(m, x)
def test_max_features(self):
    """No row of ``coef_`` may have more nonzero entries than ``max_features``."""
    limit = 5
    model = LogitNet(random_state=1, max_features=limit)
    features, labels = self.multinomial[3]
    model = model.fit(features, labels)

    nonzero_per_row = np.count_nonzero(model.coef_, axis=1)
    within_limit = np.all(nonzero_per_row <= limit)
    self.assertTrue(within_limit)
def likelihood(self, y_obs, y_sim):
    """Approximate the likelihood of `y_obs` with a penalised logistic regression.

    Parameters
    ----------
    y_obs : list
        Observed data set.
    y_sim : list
        Simulated data set.

    Returns
    -------
    The exponential of the negated, summed linear predictor of the fitted
    model evaluated at the observed summary statistics.

    Raises
    ------
    TypeError
        If either argument is not a list.
    """
    for data, message in ((y_obs, 'Observed data is not of allowed types'),
                          (y_sim, 'simulated data is not of allowed types')):
        if not isinstance(data, list):
            raise TypeError(message)

    # Recompute the observed statistics only when they are missing or the
    # observed data differs from the cached data set.
    if self.stat_obs is None or self.data_set != y_obs:
        self.stat_obs = self.statistics_calc.statistics(y_obs)
        self.data_set = y_obs

    simulated_stats = self.statistics_calc.statistics(y_sim)

    # Simulated rows are labelled 0, reference rows 1.
    labels = np.append(np.zeros(self.n_simulate), np.ones(self.n_simulate))
    design = np.array(
        np.concatenate((simulated_stats, self.ref_data_stat), axis=0))

    classifier = LogitNet(alpha=1, n_splits=self.n_folds,
                          max_iter=self.max_iter, random_state=self.seed)
    classifier = classifier.fit(design, labels)

    weighted_stats = np.multiply(classifier.coef_, self.stat_obs)
    linear_predictor = classifier.intercept_ + np.sum(weighted_stats, axis=1)
    return np.exp(-np.sum(linear_predictor, axis=0))
def distance(self, d1, d2):
    """Calculates the distance between two datasets.

    A LogitNet classifier is trained to separate the summary statistics of
    the two data sets; the distance is ``2 * (score - 0.5)``, where the score
    is the mean cross-validation score at ``lambda_max_``.

    Parameters
    ----------
    d1, d2: list
        A list, containing a list describing the data set
    """
    for data in (d1, d2):
        if not isinstance(data, list):
            raise TypeError('Data is not of allowed types')

    # Statistics of d1 are cached and reused while d1 stays the same.
    if self.s1 is None or self.data_set != d1:
        self.s1 = self.statistics_calc.statistics(d1)
        self.data_set = d1
    s2 = self.statistics_calc.statistics(d2)

    # Stack both statistic sets; rows from d1 get label 0, rows from d2 label 1.
    features = np.concatenate((self.s1, s2), axis=0)
    zeros_column = np.zeros(shape=(len(self.s1), 1))
    ones_column = np.ones(shape=(len(s2), 1))
    labels = np.concatenate((zeros_column, ones_column), axis=0).ravel()

    classifier = LogitNet(alpha=1, n_splits=10)
    classifier = classifier.fit(features, labels)

    # Position of lambda_max_ on the fitted lambda path.
    lambda_max_position = np.where(
        classifier.lambda_path_ == classifier.lambda_max_)[0][0]
    score_at_lambda_max = classifier.cv_mean_score_[lambda_max_position]
    return 2.0 * (score_at_lambda_max - 0.5)
def test_coef_limits(self):
    """With limits [-1, 0], every fitted coefficient must lie in that interval."""
    features, labels = self.binomial[0]
    n_features = features.shape[1]

    model = LogitNet(
        lower_limits=np.repeat(-1, n_features),
        upper_limits=0,
        random_state=69265,
        alpha=0,
    )
    model = model.fit(features, labels)

    assert np.all(model.coef_ >= -1)
    assert np.all(model.coef_ <= 0)
def test_with_pandas_df(self):
    """LogitNet must accept a pandas DataFrame / Series as training input."""
    x, y = make_classification(random_state=1105)
    frame = pd.DataFrame(x)
    frame['y'] = y

    model = LogitNet(n_splits=3, random_state=123)
    predictors = frame.drop(['y'], axis=1)
    model = model.fit(predictors, frame.y)
    sanity_check_logistic(model, x)
def test_with_pandas_df(self):
    """Fitting on a pandas DataFrame / Series must work (``n_folds`` variant)."""
    x, y = make_classification(random_state=1105)
    table = pd.DataFrame(x)
    table['y'] = y

    estimator = LogitNet(n_folds=3, random_state=123)
    feature_columns = table.drop(['y'], axis=1)
    estimator = estimator.fit(feature_columns, table.y)
    sanity_check_logistic(estimator, x)
def test_predict_without_cv(self):
    """With ``n_folds=0``, ``predict`` without an explicit lambda must raise ValueError."""
    features, labels = self.binomial[0]
    model = LogitNet(n_folds=0, random_state=399001).fit(features, labels)

    # No lambda is passed, so prediction has to be refused.
    with self.assertRaises(ValueError):
        model.predict(features)
def test_predict_without_cv(self):
    """With ``n_splits=0``, ``predict`` without an explicit lambda must raise ValueError."""
    features, labels = self.binomial[0]
    model = LogitNet(n_splits=0, random_state=399001).fit(features, labels)

    # No lambda is passed, so prediction has to be refused.
    with self.assertRaises(ValueError):
        model.predict(features)
def test_lambda_clip_warning(self):
    """Predicting with a lambda outside the fitted path must emit a RuntimeWarning."""
    features, labels = self.binomial[0]
    model = LogitNet(n_folds=0, random_state=1729).fit(features, labels)

    beyond_first = model.lambda_path_[0] + 1
    with self.assertWarns(RuntimeWarning):
        model.predict(features, lamb=beyond_first)

    beyond_last = model.lambda_path_[-1] - 1
    with self.assertWarns(RuntimeWarning):
        model.predict(features, lamb=beyond_last)
def test_lambda_clip_warning(self):
    """A lambda past either end of ``lambda_path_`` must trigger a RuntimeWarning."""
    features, labels = self.binomial[0]
    model = LogitNet(n_splits=0, random_state=1729).fit(features, labels)

    above_path_start = model.lambda_path_[0] + 1
    with self.assertWarns(RuntimeWarning):
        model.predict(features, lamb=above_path_start)

    below_path_end = model.lambda_path_[-1] - 1
    with self.assertWarns(RuntimeWarning):
        model.predict(features, lamb=below_path_end)
def test_coef_limits(self):
    """With limits [0, 1], every fitted coefficient must lie in that interval."""
    x, y = self.binomial[0]
    lower_limits = 0
    upper_limits = np.repeat(1, x.shape[1])
    m = LogitNet(lower_limits=lower_limits, upper_limits=upper_limits,
                 random_state=69265)
    m = m.fit(x, y)
    # The comparison must sit inside np.all: `np.all(coef) >= 0` compares a
    # single boolean with 0 and is always true, so it checked nothing.
    assert np.all(m.coef_ >= 0)
    assert np.all(m.coef_ <= 1)
def test_coef_limits(self):
    """Coefficients must respect ``lower_limits=-1`` and ``upper_limits=0``."""
    data, target = self.binomial[0]
    per_feature_lower = np.repeat(-1, data.shape[1])

    estimator = LogitNet(lower_limits=per_feature_lower, upper_limits=0,
                         random_state=69265, alpha=0)
    estimator = estimator.fit(data, target)

    assert np.all(estimator.coef_ >= -1)
    assert np.all(estimator.coef_ <= 0)
def test_with_defaults(self):
    """Default-configured LogitNet must fit every binomial and multinomial data set."""
    model = LogitNet(random_state=29341)
    datasets = itertools.chain(self.binomial, self.multinomial)
    for features, labels in datasets:
        model = model.fit(features, labels)
        sanity_check_logistic(model, features)

        # The index of the selected lambda must not exceed that of lambda_max.
        assert model.lambda_best_inx_ <= model.lambda_max_inx_

        # Predicting along the full path yields one slice per lambda.
        path_predictions = model.predict(features, lamb=model.lambda_path_)
        assert path_predictions.shape[-1] == model.lambda_path_.size
def test_with_defaults(self):
    """Default-configured LogitNet must fit every binomial and multinomial data set."""
    m = LogitNet(random_state=29341)
    for x, y in itertools.chain(self.binomial, self.multinomial):
        m = m.fit(x, y)
        sanity_check_logistic(m, x)

        # Plain asserts replace nose's ok_/eq_ helpers: nose is unmaintained
        # and the checks themselves are unchanged.
        # check selection of lambda_best
        assert m.lambda_best_inx_ <= m.lambda_max_inx_

        # check full path predict
        p = m.predict(x, lamb=m.lambda_path_)
        assert p.shape[-1] == m.lambda_path_.size
def test_relative_penalties(self):
    """Unpenalising the LASSO-selected coefficients must not shrink them in magnitude."""
    features, labels = self.binomial[0]
    n_features = features.shape[1]

    # Baseline fit without relative penalties.
    lasso = LogitNet(alpha=1)
    lasso.fit(features, labels)

    # Penalty factor 0 for coefficients the baseline kept, 1 for the rest.
    selected = np.nonzero(lasso.coef_[0])
    penalty = np.repeat(1, n_features)
    penalty[selected] = 0

    # Refit with the selected coefficients unpenalised.
    relaxed = LogitNet(alpha=1)
    relaxed.fit(features, labels, relative_penalties=penalty)

    baseline_magnitude = np.abs(lasso.coef_[0])
    relaxed_magnitude = np.abs(relaxed.coef_[0])
    assert np.all(baseline_magnitude <= relaxed_magnitude)
def test_use_sample_weights(self):
    """A model fitted with sample weights must score at least as well under those weights."""
    data, target = self.multinomial[1]

    # Remove all class-0 rows except the last three.
    class_zero_rows = np.where(target == 0)[0]
    retained = np.ones(len(target), dtype=bool)
    retained[class_zero_rows[:-3]] = False
    target = target[retained]
    data = data[retained, :]

    weights = class_weight.compute_sample_weight('balanced', target)
    weights[0] = 0.

    baseline = LogitNet(random_state=2, scoring='f1_micro')
    baseline = baseline.fit(data, target)
    baseline_score = f1_score(target, baseline.predict(data),
                              sample_weight=weights, average='micro')

    reweighted = LogitNet(random_state=2, scoring='f1_micro')
    reweighted = reweighted.fit(data, target, sample_weight=weights)
    reweighted_score = f1_score(target, reweighted.predict(data),
                                sample_weight=weights, average='micro')

    self.assertTrue(reweighted_score >= baseline_score)
def test_relative_penalties(self):
    """Coefficients freed from the penalty must be at least as large in absolute value."""
    data, target = self.binomial[0]
    width = data.shape[1]

    # First model: ordinary LASSO fit, no relative penalties.
    penalised = LogitNet(alpha=1)
    penalised.fit(data, target)

    # Zero penalty factor on the coefficients the first model left nonzero.
    kept = np.nonzero(penalised.coef_[0])
    factors = np.repeat(1, width)
    factors[kept] = 0

    # Second model: same data, selected coefficients unpenalised.
    unpenalised = LogitNet(alpha=1)
    unpenalised.fit(data, target, relative_penalties=factors)

    assert np.all(np.abs(penalised.coef_[0]) <= np.abs(unpenalised.coef_[0]))
def fit_glm(args, X, y):
    """Fit a LogitNet model, save it, and save cross-validated scores.

    The model is fitted on all of `X`, `y` and pickled to
    ``MODEL_DIR / glm_<dataset>.pkl``. A second model fixed at the first
    model's ``lambda_best_`` produces 5-fold cross-validated probabilities,
    whose column 1 is saved to ``MODEL_DIR / glm_<dataset>.npy``. AUC and
    average precision are printed for both.
    """
    def report(scores):
        # Compute both metrics before printing either.
        auc = roc_auc_score(y, scores)
        apr = average_precision_score(y, scores)
        print('\tAUC ', np.round(auc, 4))
        print('\tAPR ', np.round(apr, 4))

    print('GLM')

    # Fit on the full data set and persist the model.
    np.random.seed(1000)
    full_model = LogitNet(alpha=0.5, n_lambda=20, n_jobs=5)
    full_model.fit(X, y)
    with open(MODEL_DIR / f'glm_{args.dataset}.pkl', 'wb') as f:
        pickle.dump(full_model, f)

    print('In-sample: ')
    in_sample = full_model.predict_proba(X)
    report(in_sample[:, 1])

    print('Out-of-sample: ')
    print(full_model.lambda_best_)

    # In-dataset CV predictions with lambda fixed to the value chosen above.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1111)
    np.random.seed(1000)
    fixed_lambda_model = LogitNet(alpha=0.5, n_lambda=1,
                                  lambda_path=[full_model.lambda_best_])
    cv_scores = cross_val_predict(fixed_lambda_model, X, y, cv=folds,
                                  method='predict_proba', n_jobs=-1)
    report(cv_scores[:, 1])
    np.save(MODEL_DIR / f'glm_{args.dataset}.npy', cv_scores[:, 1])
def distance(self, d1, d2):
    """Return a classifier-based distance between data sets `d1` and `d2`.

    A LogitNet classifier is trained to separate the summary statistics of
    the two data sets; the distance is ``2 * (score - 0.5)``, where the score
    is the mean cross-validation score at ``lambda_max_``.
    """
    stats_first = self.statistics_calc.statistics(d1)
    stats_second = self.statistics_calc.statistics(d2)

    # Rows from d1 are labelled 0 and rows from d2 are labelled 1.
    features = np.concatenate((stats_first, stats_second), axis=0)
    labels = np.concatenate(
        (np.zeros(shape=(len(stats_first), 1)),
         np.ones(shape=(len(stats_second), 1))),
        axis=0,
    ).ravel()

    classifier = LogitNet(alpha=1, n_splits=10)
    classifier = classifier.fit(features, labels)

    # Position of lambda_max_ on the fitted lambda path.
    lambda_max_position = np.where(
        classifier.lambda_path_ == classifier.lambda_max_)[0][0]
    return 2.0 * (classifier.cv_mean_score_[lambda_max_position] - 0.5)
def test_coef_interpolation(self):
    """Predictions at a lambda between two path values must differ from both neighbours."""
    features, labels = self.binomial[0]
    model = LogitNet(n_folds=0, random_state=561).fit(features, labels)

    # The midpoint of two path values is not itself on the computed path.
    lower, upper = model.lambda_path_[1], model.lambda_path_[2]
    midpoint = (lower + upper) / 2.0

    proba_lower = model.predict_proba(features, lamb=lower)
    proba_upper = model.predict_proba(features, lamb=upper)
    proba_mid = model.predict_proba(features, lamb=midpoint)

    self.assertFalse(np.allclose(proba_lower, proba_mid))
    self.assertFalse(np.allclose(proba_upper, proba_mid))
def test_coef_interpolation(self):
    """An off-path lambda must give predictions distinct from the adjacent path lambdas."""
    data, target = self.binomial[0]
    estimator = LogitNet(n_splits=0, random_state=561).fit(data, target)

    # Pick two path values and the point halfway between them.
    first = estimator.lambda_path_[1]
    second = estimator.lambda_path_[2]
    between = (first + second) / 2.0

    at_first = estimator.predict_proba(data, lamb=first)
    at_second = estimator.predict_proba(data, lamb=second)
    at_between = estimator.predict_proba(data, lamb=between)

    self.assertFalse(np.allclose(at_first, at_between))
    self.assertFalse(np.allclose(at_second, at_between))
def distance(self, d1, d2):
    """Calculates the distance between two datasets.

    Parameters
    ----------
    d1: Python list
        Contains n1 data points.
    d2: Python list
        Contains n2 data points.

    Returns
    -------
    numpy.float
        The distance between the two input data sets.

    Raises
    ------
    RuntimeError
        If the two data sets yield a different number of summary statistics.
    """
    s1, s2 = self._calculate_summary_stat(d1, d2)
    self.n_simulate = s1.shape[0]

    if not s2.shape[0] == self.n_simulate:
        raise RuntimeError(
            "The number of simulations in the two data sets should be the same in order for "
            "the classification accuracy implemented in PenLogReg to be a proper distance. Please "
            "check that `n_samples` in the `sample()` method for the sampler is equal to "
            "the number of datasets in the observations.")

    # compute distance between the statistics
    training_set_features = np.concatenate((s1, s2), axis=0)
    label_s1 = np.zeros(shape=(len(s1), 1))
    label_s2 = np.ones(shape=(len(s2), 1))
    training_set_labels = np.concatenate((label_s1, label_s2), axis=0).ravel()

    # Fold ids 0..n_folds-1, each repeated ceil(n_simulate / n_folds) times
    # and truncated to n_simulate entries. The builtin int replaces np.int,
    # which was removed from NumPy.
    groups = np.repeat(np.arange(self.n_folds),
                       int(np.ceil(self.n_simulate / self.n_folds)))
    groups = groups[:self.n_simulate].tolist()
    groups += groups  # duplicate it as groups need to be defined for both datasets

    m = LogitNet(alpha=1, n_splits=self.n_folds)  # note we are not using random seed here!
    m = m.fit(training_set_features, training_set_labels, groups=groups)

    # Mean CV score at lambda_max_, rescaled by 2 * (score - 0.5).
    distance = 2.0 * (m.cv_mean_score_[np.where(
        m.lambda_path_ == m.lambda_max_)[0][0]] - 0.5)
    return distance
def loglikelihood(self, y_obs, y_sim):
    """Approximate the log-likelihood of `y_obs` with a penalised logistic regression.

    Parameters
    ----------
    y_obs : list
        Observed data set.
    y_sim : list
        Simulated data set.

    Returns
    -------
    The negated sum of the fitted model's linear predictor evaluated at the
    observed summary statistics.

    Raises
    ------
    TypeError
        If either argument is not a list.
    RuntimeError
        If the simulated statistics do not contain ``self.n_simulate`` rows.
    """
    if not isinstance(y_obs, list):
        raise TypeError('Observed data is not of allowed types')

    if not isinstance(y_sim, list):
        raise TypeError('simulated data is not of allowed types')

    # Check whether y_obs is same as the stored dataset.
    if self.data_set is not None:
        # check that the observations have the same length; if not, they can't be the same:
        if len(y_obs) != len(self.data_set):
            self.dataSame = False
        elif len(np.array(y_obs[0]).reshape(-1, )) == 1:
            self.dataSame = self.data_set == y_obs
        else:  # otherwise it fails when y_obs[0] is array
            self.dataSame = all(
                (np.array(self.data_set[i]) == np.array(y_obs[i])).all()
                for i in range(len(y_obs)))

    if self.stat_obs is None or self.dataSame is False:
        self.stat_obs = self.statistics_calc.statistics(y_obs)
        self.data_set = y_obs

    # Extract summary statistics from the simulated data
    stat_sim = self.statistics_calc.statistics(y_sim)
    if not stat_sim.shape[0] == self.n_simulate:
        # A trailing space was added after "method" so the concatenated
        # message no longer reads "methodfor".
        raise RuntimeError("The number of samples in the reference data set is not the same as the number of "
                           "samples in the generated data. Please check that `n_samples` in the `sample()` method "
                           "for the sampler is equal to `n_simulate` in PenLogReg.")

    # Compute the approximate likelihood for the y_obs given theta
    y = np.append(np.zeros(self.n_simulate), np.ones(self.n_simulate))
    X = np.array(np.concatenate((stat_sim, self.ref_data_stat), axis=0))

    # define here groups for cross-validation; the builtin int replaces
    # np.int, which was removed from NumPy.
    groups = np.repeat(np.arange(self.n_folds),
                       int(np.ceil(self.n_simulate / self.n_folds)))
    groups = groups[:self.n_simulate].tolist()
    groups += groups  # duplicate it as groups need to be defined for both datasets

    m = LogitNet(alpha=1, n_splits=self.n_folds, max_iter=self.max_iter,
                 random_state=self.seed, scoring="log_loss")
    m = m.fit(X, y, groups=groups)

    result = -np.sum((m.intercept_ + np.sum(np.multiply(m.coef_, self.stat_obs), axis=1)), axis=0)
    return result
def LFIRE(X, Y, n_m=n_m, n_theta=n_theta, random_state=random_state):
    """LFIRE method as a distance node.

    Reads the current parameters and marginal samples from module-level
    globals, fits a LogitNet classifier and appends the fitted quantities to
    the module-level result lists.

    Parameters
    ----------
    X: np.ndarray
        Simulated summary statistics
    Y: np.ndarray
        Observed summary statistics
    n_m: int
        Number of simulations from marginal
    n_theta: int
        Number of simulations from likelihood
    random_state: np.random
        Random state for random generators

    Output
    ------
    res: np.ndarray
        2D array holding ``-0.5 * exp(log posterior value)``
    """
    global global_params, global_marginal
    global global_param_arr, global_lambda_arr
    global global_intercept_arr, global_coef_arr

    rng = random_state or np.random
    classifier = LogitNet(alpha=1, n_splits=10, n_jobs=-1, verbose=0)

    current_params = global_params
    marginal_samples = global_marginal

    # Training data: X plus n_theta - 1 fresh likelihood draws are labelled +1,
    # the marginal samples -1.
    likelihood_draws = sample_from_likelihood(
        current_params, batch_size=n_theta - 1, random_state=rng)
    design = np.concatenate((X, likelihood_draws, marginal_samples), axis=0)
    labels = np.concatenate((np.ones(n_theta), -1 * np.ones(n_m)))

    classifier.fit(design, labels)

    # (Unnormalized) log posterior value
    log_posterior = classifier.intercept_ + np.sum(
        np.multiply(classifier.coef_, Y))

    # Record parameters and fitted model quantities for later inspection.
    global_param_arr.append([current_params[key][0]
                             for key in ('p_1', 'p_2', 'p_3')])
    global_lambda_arr.append(classifier.lambda_best_[0])
    global_intercept_arr.append(classifier.intercept_)
    global_coef_arr.append(classifier.coef_.ravel())

    # Negative posterior value
    return np.atleast_2d(-0.5 * np.exp(log_posterior))
##############################################################################
# Replicating figure 1 - Done!
##############################################################################

# Coefficient vector of each repeated fit, one entry per iteration.
coefs = []

for i in tqdm(range(N_ITERATIONS)):
    # Fit LogitNet on the training set and keep the first row of coef_.
    lr = LogitNet(
        alpha=ALPHA,
        n_lambda=N_LAMBDA,
        standardize=False,
        cut_point=CUT_POINT,
        max_iter=MAX_ITER,
    )
    lr = lr.fit(X, y)
    coefs.append(list(lr.coef_[0]))

coefs = np.array(coefs)

# Fraction of iterations in which each coefficient was nonzero.
survived = 1 * (abs(coefs) > 0)
survival_rate = np.sum(survived, axis=0) / float(N_ITERATIONS)

# Zero out coefficients whose survival rate does not exceed the cutoff.
mask = 1 * (survival_rate > SURVIVAL_RATE_CUTOFF)
coefs_updated = coefs * mask

variable_names = ['Male'] + list(data.columns)

# Mean and 2.5% / 97.5% percentiles across iterations.
coefs_mean = np.mean(coefs_updated, axis=0)
coefs_q025 = np.percentile(coefs_updated, q=2.5, axis=0)
coefs_q975 = np.percentile(coefs_updated, q=97.5, axis=0)

betas = pd.DataFrame({'mean': coefs_mean})
betas['lb'] = coefs_q025
class LogisticRegression(Classifier):
    """Logistic regression classifier for ratio estimation."""

    def __init__(self, config=None, parallel_cv=True, class_min=0.005):
        """Build the underlying LogitNet model from `config` or the default config."""
        self.config = self._resolve_config(config, parallel_cv)
        self.model = LogitNet(**self.config)
        self.class_min = class_min
        self.parameter_names = ['lambda', 'intercept', 'coef']
        self.store = {}

    def fit(self, X, y, index=0):
        """Fit the model and store its parameters under `index`."""
        self.model.fit(X, y)
        # Stored layout: [lambda_best, intercept, coef...]
        fitted_parts = (
            np.atleast_1d(self.model.lambda_best_),
            np.atleast_1d(self.model.intercept_),
            np.squeeze(self.model.coef_),
        )
        self.store[index] = np.concatenate(fitted_parts)

    def get(self, index=0):
        """Return the parameters stored under `index` as a dict."""
        stored = self.store[index]
        return {
            'lambda': stored[0],
            'intercept': stored[1],
            'coef': stored[2:],
        }

    def set(self, params_dict, index=0):
        """Store the parameters from `params_dict` under `index`."""
        pieces = [np.atleast_1d(params_dict[name])
                  for name in self.parameter_names]
        self.store[index] = np.concatenate(pieces)

    def predict_log_likelihood_ratio(self, X, index=0):
        """Return the stored linear model's value at `X`, bounded below via ``class_min``."""
        # Skip the stored lambda; what remains is the intercept, then the coefficients.
        linear_params = self.store[index][1:]
        intercept, coef = linear_params[0], linear_params[1:]
        log_ratio = intercept + np.sum(np.multiply(coef, X))
        lower_bound = np.log(self.class_min / (1 - self.class_min))
        return np.maximum(log_ratio, lower_bound)

    @property
    def parallel_cv(self):
        """Whether the configured ``n_jobs`` is greater than one."""
        return self.config['n_jobs'] > 1

    def _get_default_config(self, parallel_cv):
        """Return the default LogitNet configuration."""
        n_jobs = cpu_count() if parallel_cv else 1
        return {
            'alpha': 1,
            'n_splits': 10,
            'n_jobs': n_jobs,
            'cut_point': 0
        }

    def _resolve_config(self, config, parallel_cv):
        """Return `config` if it is a dict, otherwise the default configuration."""
        if isinstance(config, dict):
            return config
        return self._get_default_config(parallel_cv)
y_preds = []
y_tests = []

# Common false-positive-rate grid onto which every split's ROC curve is interpolated.
fpr_interp = np.linspace(0, 1, 100)
tpr_interps = []
aocs = []

for i in range(n_splits):
    print(i)
    model = LogitNet(fit_intercept=False, n_jobs=cpus)

    # Train / test partitions for this split.
    X_train = person_biomarker[train_idxs[i], :]
    y_train = sample_data.phenotype[train_idxs[i]]
    X_test = person_biomarker[test_idxs[i], :]
    y_test = sample_data.phenotype[test_idxs[i]]

    model.fit(X_train.todense(), 1 * y_train)
    y_pred = model.predict_proba(X_test)[:, 1]

    # ROC curve of this split, resampled on the common grid.
    fpr, tpr, _ = sklearn.metrics.roc_curve(y_test, y_pred)
    interp_func = scipy.interpolate.interp1d(fpr, tpr)
    tpr_interp = interp_func(fpr_interp)
    tpr_interps.append(list(tpr_interp))

    aoc = sklearn.metrics.roc_auc_score(y_test, y_pred)
    aocs.append(aoc)

# Long-format table of (fpr, tpr) pairs across all splits.
tpr_df = pd.DataFrame(tpr_interps)
tpr_df.columns = fpr_interp
tpr_df = tpr_df.melt()
tpr_df.columns = ['fpr', 'tpr']
tpr_df.to_csv(
    BIOMARKER_DIR
    + 'results/ML_phenotype_prediction/fpr_vs_tpr_%s_%s.tsv' % (biomarker, dataset),
    sep='\t',
)
def test_alphas(self):
    """Fit once per value in ``self.alphas`` and run the accuracy check on each fit."""
    features, labels = self.binomial[0]
    for alpha in self.alphas:
        model = LogitNet(alpha=alpha, random_state=41041).fit(features, labels)
        predictions = model.predict(features)
        # 0.85 is handed to the helper as its third argument
        # (presumably the minimum acceptable score).
        check_accuracy(labels, predictions, 0.85, alpha=alpha)
def test_cv_scoring(self):
    """Fit once per method in ``self.scoring`` and run the accuracy check on each fit."""
    features, labels = self.binomial[0]
    for method in self.scoring:
        model = LogitNet(scoring=method, random_state=52633).fit(features, labels)
        predictions = model.predict(features)
        # 0.85 is handed to the helper as its third argument
        # (presumably the minimum acceptable score).
        check_accuracy(labels, predictions, 0.85, scoring=method)