Example #1
    def test_use_sample_weights(self):
        x, y = self.multinomial[1]
        class_0_idx = np.where(y == 0)
        to_drop = class_0_idx[0][:-3]
        to_keep = np.ones(len(y), dtype=bool)
        to_keep[to_drop] = False
        y = y[to_keep]
        x = x[to_keep, :]
        sample_weight = class_weight.compute_sample_weight('balanced', y)
        sample_weight[0] = 0.

        unweighted = LogitNet(random_state=2, scoring='f1_micro')
        unweighted = unweighted.fit(x, y)
        unweighted_acc = f1_score(y,
                                  unweighted.predict(x),
                                  sample_weight=sample_weight,
                                  average='micro')

        weighted = LogitNet(random_state=2, scoring='f1_micro')
        weighted = weighted.fit(x, y, sample_weight=sample_weight)
        weighted_acc = f1_score(y,
                                weighted.predict(x),
                                sample_weight=sample_weight,
                                average='micro')

        self.assertTrue(weighted_acc >= unweighted_acc)
Example #2
 def test_one_row_predict(self):
     # Verify that predicting on one row gives only one row of output
     m = LogitNet(random_state=42)
     for X, y in itertools.chain(self.binomial, self.multinomial):
         m.fit(X, y)
         p = m.predict(X[0].reshape((1, -1)))
         assert p.shape == (1, )
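A minimal, self-contained version of the one-row pattern above; the synthetic data from make_classification is an assumption, not part of the original test:

# Sketch only: assumes glmnet's LogitNet and scikit-learn are installed.
from glmnet import LogitNet
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
m = LogitNet(random_state=42).fit(X, y)
# a single row must be reshaped to 2D; the prediction then has exactly one entry
p = m.predict(X[0].reshape((1, -1)))
assert p.shape == (1,)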
Example #3
 def test_random_state_cv(self):
     random_state = 133
     m = LogitNet(random_state=random_state)
     x, y = self.binomial[0]
     m.fit(x, y)
     print(dir(m._cv))
     assert m._cv.random_state == random_state
Example #4
 def test_one_row_predict(self):
     # Verify that predicting on one row gives only one row of output
     m = LogitNet(random_state=42)
     for X, y in itertools.chain(self.binomial, self.multinomial):
         m.fit(X, y)
         p = m.predict(X[0].reshape((1, -1)))
         assert p.shape == (1,)
Example #5
def train_glmnet(train,
                 test,
                 save_path_pred,
                 save_path_model,
                 save_path_json,
                 n_cores=5):
    ln = LogitNet(alpha=0.5, n_splits=10, n_jobs=n_cores)
    # to sparse
    train_sparse = (csc_matrix(train[0]),
                    csc_matrix(train[1].astype(np.float64).reshape((-1, 1))))
    test_sparse = (csc_matrix(test[0]),
                   csc_matrix(test[1].astype(np.float64).reshape((-1, 1))))

    print("train the model")
    ln.fit(train_sparse[0], train[1])

    print("get predictions")
    y_pred = ln.predict_proba(test_sparse[0])[:, 1]
    auprc = cem.auprc(test[1], y_pred)
    auc = cem.auc(test[1], y_pred)

    # csv
    print("save csv")
    dt = pd.DataFrame({"y_true": test[1], "y_pred": y_pred})
    dt.to_csv(save_path_pred)

    # json
    print("save json")
    write_json({"auprc": auprc, "auc": auc}, save_path_json)
    # model
    print("save model")
    pickle.dump(ln, open(save_path_model, "wb"))
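The conversion above suggests LogitNet accepts scipy sparse matrices for the features; a minimal sketch of that pattern, with synthetic data and an arbitrary n_splits standing in for the original setup:

# Sketch: fitting LogitNet on a sparse feature matrix, as train_glmnet does.
from glmnet import LogitNet
from scipy.sparse import csc_matrix
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=30, random_state=0)
ln = LogitNet(alpha=0.5, n_splits=3)
ln.fit(csc_matrix(X), y)                       # sparse features, dense labels
proba = ln.predict_proba(csc_matrix(X))[:, 1]  # column 1 = P(y == 1)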
Example #6
 def test_random_state_cv(self):
     random_state = 133
     m = LogitNet(random_state=random_state)
     x, y = self.binomial[0]
     m.fit(x, y)
     print(dir(m._cv))
     assert m._cv.random_state == random_state
Example #7
def worker(
        boot_inds,
        X,
        y,
        X_noise=0.01,
        alpha=0.9,
        lambda_path=np.geomspace(1e-3, 1e-06, num=100),
):

    X_boot = X[boot_inds, :]
    y_boot = y[boot_inds]

    X_boot = scale(
        scale(X_boot +
              np.random.normal(scale=X_noise * 1e-6, size=X_boot.shape)) +
        np.random.normal(scale=X_noise, size=X_boot.shape))

    m = LogitNet(
        alpha=alpha,
        lambda_path=lambda_path,
        fit_intercept=False,
    )
    m.fit(X_boot, y_boot)

    lambdas_enet = m.lambda_path_
    coefs_enet = m.coef_path_.squeeze()

    return {
        "beta": coefs_enet != 0,
        "lambda_path": lambdas_enet,
    }
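A hedged driver loop for worker above: each call refits the elastic net on one bootstrap resample, and averaging the returned masks gives per-feature selection frequencies along the lambda path (the data, the resample count, and the RNG are assumptions):

# Sketch of a stability-selection style driver for `worker`.
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=25, random_state=0)
rng = np.random.default_rng(0)
results = []
for _ in range(10):  # the number of bootstrap resamples is arbitrary here
    boot_inds = rng.integers(0, X.shape[0], size=X.shape[0])
    results.append(worker(boot_inds, X, y))
# fraction of resamples in which each coefficient is nonzero, per lambda
selection_freq = np.mean([r["beta"] for r in results], axis=0)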
Example #8
 def test_one_row_predict_proba(self):
     # Verify that predict_proba on one row gives 2D output
     m = LogitNet(random_state=42)
     for X, y in itertools.chain(self.binomial, self.multinomial):
         m.fit(X, y)
         p = m.predict_proba(X[0].reshape((1, -1)))
         assert p.shape == (1, len(np.unique(y)))
Example #9
 def test_one_row_predict_proba(self):
     # Verify that predict_proba on one row gives 2D output
     m = LogitNet(random_state=42)
     for X, y in itertools.chain(self.binomial, self.multinomial):
         m.fit(X, y)
         p = m.predict_proba(X[0].reshape((1, -1)))
         assert p.shape == (1, len(np.unique(y)))
Example #10
    def update(self, batch, batch_index):
        """Updates the inference state with a new batch and performs LFIRE.

        Parameters
        ----------
        batch: dict
        batch_index: int

        """
        # TODO: beautify this
        super(LFIRE, self).update(batch, batch_index)

        # Parse likelihood values
        likelihood = [
            batch[summary_name] for summary_name in self.summary_names
        ]
        likelihood = np.column_stack(likelihood)

        # Create training data
        X = np.vstack((likelihood, self.marginal))
        y = np.concatenate((np.ones(likelihood.shape[0]),
                            -1 * np.ones(self.marginal.shape[0])))

        # Logistic regression
        m = LogitNet(**self.logreg_config)
        m.fit(X, y)

        # Likelihood value
        log_likelihood_value = m.intercept_ + np.sum(
            np.multiply(m.coef_, self.observed))
        likelihood_value = np.exp(log_likelihood_value)

        # Joint prior value
        parameter_values = [
            batch[parameter_name] for parameter_name in self.parameter_names
        ]
        joint_prior_value = self.joint_prior.pdf(parameter_values)

        # Posterior value
        posterior_value = joint_prior_value * likelihood_value

        # Check if posterior value is non-finite
        if np.isinf(posterior_value):
            params = self.params_grid[batch_index]
            warnings.warn(
                'Posterior value is not finite for parameters '
                f'{self.parameter_names} = {params} and thus will be '
                'replaced with zero!', RuntimeWarning)
            posterior_value = 0
            for i, parameter_name in enumerate(self.parameter_names):
                self.state['infinity'][parameter_name] += [params[i]]

        # Update state dictionary
        self.state['posterior'][batch_index] = posterior_value
        self.state['lambda'][batch_index] = m.lambda_best_
        self.state['coef'][batch_index, :] = m.coef_
        self.state['intercept'][batch_index] = m.intercept_
        for parameter_name in self.parameter_names:
            self.state[parameter_name][batch_index] = batch[parameter_name]
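The read-out above treats the fitted logistic regression as a density-ratio estimator: the log-likelihood value at the observed summaries is intercept_ + coef_ · observed, which is then exponentiated. A self-contained sketch of that step, with two-sample synthetic data standing in for the simulator output:

# Sketch: likelihood-ratio read-out from LogitNet, mirroring update() above.
import numpy as np
from glmnet import LogitNet

rng = np.random.default_rng(1)
from_likelihood = rng.normal(0.5, 1.0, size=(200, 3))  # stand-in summaries
from_marginal = rng.normal(0.0, 1.0, size=(200, 3))
X = np.vstack((from_likelihood, from_marginal))
y = np.concatenate((np.ones(200), -np.ones(200)))      # +1 likelihood, -1 marginal

m = LogitNet(n_splits=3).fit(X, y)
observed = rng.normal(0.5, 1.0, size=(1, 3))           # stand-in observed summaries
log_ratio = m.intercept_ + np.sum(np.multiply(m.coef_, observed))
likelihood_value = np.exp(log_ratio)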
Example #11
 def test_one_row_predict_proba_with_lambda(self):
     # One row to predict_proba along with lambdas should give 3D output
     m = LogitNet(random_state=42)
     lamb = [0.01, 0.02, 0.04, 0.1]
     for X, y in itertools.chain(self.binomial, self.multinomial):
         m.fit(X, y)
         p = m.predict_proba(X[0].reshape((1, -1)), lamb=lamb)
         assert p.shape == (1, len(np.unique(y)), len(lamb))
Example #12
 def test_one_row_predict_proba_with_lambda(self):
     # One row to predict_proba along with lambdas should give 3D output
     m = LogitNet(random_state=42)
     lamb = [0.01, 0.02, 0.04, 0.1]
     for X, y in itertools.chain(self.binomial, self.multinomial):
         m.fit(X, y)
         p = m.predict_proba(X[0].reshape((1, -1)), lamb=lamb)
         assert p.shape == (1, len(np.unique(y)), len(lamb))
Example #13
    def test_single_class_exception(self):
        x, y = self.binomial[0]
        y = np.ones_like(y)
        m = LogitNet()

        with self.assertRaises(ValueError) as e:
            m.fit(x, y)

        self.assertEqual("Training data need to contain at least 2 classes.",
                         str(e.exception))
Example #14
    def test_single_class_exception(self):
        x, y = self.binomial[0]
        y = np.ones_like(y)
        m = LogitNet()

        with self.assertRaises(ValueError) as e:
            m.fit(x, y)

        self.assertEqual("Training data need to contain at least 2 classes.",
                         str(e.exception))
Example #15
    def test_cv_scoring_multinomial(self):
        x, y = self.multinomial[0]
        for method in self.scoring:
            m = LogitNet(scoring=method, random_state=488881)

            if method in self.multinomial_scoring:
                m = m.fit(x, y)
                check_accuracy(y, m.predict(x), 0.65, scoring=method)
            else:
                with self.assertRaises(ValueError):
                    m.fit(x, y)
Example #16
 def test_n_splits(self):
     x, y = self.binomial[0]
     for n in self.n_splits:
         m = LogitNet(n_splits=n, random_state=46657)
         if n > 0 and n < 3:
             with self.assertRaisesRegex(ValueError,
                                         "n_splits must be at least 3"):
                 m = m.fit(x, y)
         else:
             m = m.fit(x, y)
             sanity_check_logistic(m, x)
Example #17
    def test_cv_scoring_multinomial(self):
        x, y = self.multinomial[0]
        for method in self.scoring:
            m = LogitNet(scoring=method, random_state=488881)

            if method in self.multinomial_scoring:
                m = m.fit(x, y)
                check_accuracy(y, m.predict(x), 0.65, scoring=method)
            else:
                with self.assertRaises(ValueError):
                    m.fit(x, y)
Example #18
 def test_n_folds(self):
     x, y = self.binomial[0]
     for n in self.n_folds:
         m = LogitNet(n_folds=n, random_state=46657)
         if n > 0 and n < 3:
             with self.assertRaisesRegex(ValueError,
                                         "n_folds must be at least 3"):
                 m = m.fit(x, y)
         else:
             m = m.fit(x, y)
             sanity_check_logistic(m, x)
Example #19
 def test_max_features(self):
     max_features = 5
     m = LogitNet(random_state=1, max_features=max_features)
     x, y = self.multinomial[3]
     m = m.fit(x, y)
     num_features = np.count_nonzero(m.coef_, axis=1)
     self.assertTrue(np.all(num_features <= max_features))
Example #20
    def likelihood(self, y_obs, y_sim):
        if not isinstance(y_obs, list):
            raise TypeError('Observed data is not of allowed types')

        if not isinstance(y_sim, list):
            raise TypeError('simulated data is not of allowed types')

        # Extract summary statistics from the observed data
        if (self.stat_obs is None or self.data_set != y_obs):
            self.stat_obs = self.statistics_calc.statistics(y_obs)
            self.data_set = y_obs

        # Extract summary statistics from the simulated data
        stat_sim = self.statistics_calc.statistics(y_sim)

        # Compute the approximate likelihood for the y_obs given theta
        y = np.append(np.zeros(self.n_simulate), np.ones(self.n_simulate))
        X = np.array(np.concatenate((stat_sim, self.ref_data_stat), axis=0))
        m = LogitNet(alpha=1,
                     n_splits=self.n_folds,
                     max_iter=self.max_iter,
                     random_state=self.seed)
        m = m.fit(X, y)
        result = np.exp(-np.sum(
            (m.intercept_ +
             np.sum(np.multiply(m.coef_, self.stat_obs), axis=1)),
            axis=0))

        return result
Example #21
    def distance(self, d1, d2):
        """Calculates the distance between two datasets.

        Parameters
        ----------
        d1, d2: list
            A list, containing a list describing the data set
        """
        if not isinstance(d1, list):
            raise TypeError('Data is not of allowed types')
        if not isinstance(d2, list):
            raise TypeError('Data is not of allowed types')

        # Extract summary statistics from the dataset
        if (self.s1 is None or self.data_set != d1):
            self.s1 = self.statistics_calc.statistics(d1)
            self.data_set = d1
        s2 = self.statistics_calc.statistics(d2)

        # compute distance between the statistics
        training_set_features = np.concatenate((self.s1, s2), axis=0)
        label_s1 = np.zeros(shape=(len(self.s1), 1))
        label_s2 = np.ones(shape=(len(s2), 1))
        training_set_labels = np.concatenate((label_s1, label_s2),
                                             axis=0).ravel()

        m = LogitNet(alpha=1, n_splits=10)
        m = m.fit(training_set_features, training_set_labels)
        distance = 2.0 * (m.cv_mean_score_[np.where(
            m.lambda_path_ == m.lambda_max_)[0][0]] - 0.5)

        return distance
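The distance above rescales the cross-validated accuracy at lambda_max_ so that chance-level accuracy (indistinguishable samples) maps to 0 and perfect separation maps to 1. A sketch of the same read-out on a freshly fitted model, with synthetic data as a stand-in:

# Sketch: classification-accuracy distance, as computed above.
import numpy as np
from glmnet import LogitNet
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
m = LogitNet(alpha=1, n_splits=10).fit(X, y)
idx = np.where(m.lambda_path_ == m.lambda_max_)[0][0]  # position of lambda_max_
distance = 2.0 * (m.cv_mean_score_[idx] - 0.5)         # 0 at chance accuracy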
Example #22
 def test_max_features(self):
     max_features = 5
     m = LogitNet(random_state=1, max_features=max_features)
     x, y = self.multinomial[3]
     m = m.fit(x, y)
     num_features = np.count_nonzero(m.coef_, axis=1)
     self.assertTrue(np.all(num_features <= max_features))
Example #23
 def test_coef_limits(self):
     x, y = self.binomial[0]
     lower_limits = np.repeat(-1, x.shape[1])
     upper_limits = 0
     m = LogitNet(lower_limits=lower_limits, upper_limits=upper_limits, random_state=69265, alpha=0)
     m = m.fit(x, y)
     assert(np.all(m.coef_ >= -1))
     assert(np.all(m.coef_ <= 0))
Example #24
    def test_with_pandas_df(self):
        x, y = make_classification(random_state=1105)
        df = pd.DataFrame(x)
        df['y'] = y

        m = LogitNet(n_splits=3, random_state=123)
        m = m.fit(df.drop(['y'], axis=1), df.y)
        sanity_check_logistic(m, x)
Example #25
    def test_with_pandas_df(self):
        x, y = make_classification(random_state=1105)
        df = pd.DataFrame(x)
        df['y'] = y

        m = LogitNet(n_folds=3, random_state=123)
        m = m.fit(df.drop(['y'], axis=1), df.y)
        sanity_check_logistic(m, x)
Example #26
    def test_predict_without_cv(self):
        x, y = self.binomial[0]
        m = LogitNet(n_folds=0, random_state=399001)
        m = m.fit(x, y)

        # should not make prediction unless value is passed for lambda
        with self.assertRaises(ValueError):
            m.predict(x)
Example #27
    def test_predict_without_cv(self):
        x, y = self.binomial[0]
        m = LogitNet(n_splits=0, random_state=399001)
        m = m.fit(x, y)

        # should not make prediction unless value is passed for lambda
        with self.assertRaises(ValueError):
            m.predict(x)
Example #28
    def test_lambda_clip_warning(self):
        x, y = self.binomial[0]
        m = LogitNet(n_folds=0, random_state=1729)
        m = m.fit(x, y)

        with self.assertWarns(RuntimeWarning):
            m.predict(x, lamb=m.lambda_path_[0] + 1)

        with self.assertWarns(RuntimeWarning):
            m.predict(x, lamb=m.lambda_path_[-1] - 1)
Example #29
    def test_lambda_clip_warning(self):
        x, y = self.binomial[0]
        m = LogitNet(n_splits=0, random_state=1729)
        m = m.fit(x, y)

        with self.assertWarns(RuntimeWarning):
            m.predict(x, lamb=m.lambda_path_[0] + 1)

        with self.assertWarns(RuntimeWarning):
            m.predict(x, lamb=m.lambda_path_[-1] - 1)
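Both versions of the test rely on out-of-range lambdas being clipped to the fitted path with a RuntimeWarning rather than raising; a sketch that triggers the warning, with synthetic data as a stand-in:

# Sketch: requesting a lambda outside the fitted path warns and clips.
import warnings
from glmnet import LogitNet
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
m = LogitNet(n_splits=0, random_state=1729).fit(X, y)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    m.predict(X, lamb=m.lambda_path_[0] + 1)  # above the largest fitted lambda
print(caught[0].category)  # expected: RuntimeWarning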
Example #30
 def test_coef_limits(self):
     x, y = self.binomial[0]
     lower_limits = 0
     upper_limits = np.repeat(1, x.shape[1])
     m = LogitNet(lower_limits=lower_limits,
                  upper_limits=upper_limits,
                  random_state=69265)
     m = m.fit(x, y)
     # the box constraints must hold elementwise across all coefficients
     assert np.all(m.coef_ >= 0)
     assert np.all(m.coef_ <= 1)
Example #31
 def test_coef_limits(self):
     x, y = self.binomial[0]
     lower_limits = np.repeat(-1, x.shape[1])
     upper_limits = 0
     m = LogitNet(lower_limits=lower_limits,
                  upper_limits=upper_limits,
                  random_state=69265,
                  alpha=0)
     m = m.fit(x, y)
     assert (np.all(m.coef_ >= -1))
     assert (np.all(m.coef_ <= 0))
Example #32
    def test_with_defaults(self):
        m = LogitNet(random_state=29341)
        for x, y in itertools.chain(self.binomial, self.multinomial):
            m = m.fit(x, y)
            sanity_check_logistic(m, x)

            # check selection of lambda_best
            assert m.lambda_best_inx_ <= m.lambda_max_inx_

            # check full path predict
            p = m.predict(x, lamb=m.lambda_path_)
            assert p.shape[-1] == m.lambda_path_.size
Example #33
    def test_with_defaults(self):
        m = LogitNet(random_state=29341)
        for x, y in itertools.chain(self.binomial, self.multinomial):
            m = m.fit(x, y)
            sanity_check_logistic(m, x)

            # check selection of lambda_best
            ok_(m.lambda_best_inx_ <= m.lambda_max_inx_)

            # check full path predict
            p = m.predict(x, lamb=m.lambda_path_)
            eq_(p.shape[-1], m.lambda_path_.size)
Example #34
    def test_relative_penalties(self):
        x, y = self.binomial[0]
        p = x.shape[1]

        # m1 no relative penalties applied
        m1 = LogitNet(alpha=1)
        m1.fit(x, y)

        # find the nonzero indices from LASSO
        nonzero = np.nonzero(m1.coef_[0])

        # unpenalize those nonzero coefs
        penalty = np.repeat(1, p)
        penalty[nonzero] = 0

        # refit the model with the unpenalized coefs
        m2 = LogitNet(alpha=1)
        m2.fit(x, y, relative_penalties=penalty)

        # verify that the unpenalized coef ests exceed the penalized ones
        # in absolute value
        assert(np.all(np.abs(m1.coef_[0]) <= np.abs(m2.coef_[0])))
Example #35
    def test_use_sample_weights(self):
        x, y = self.multinomial[1]
        class_0_idx = np.where(y == 0)
        to_drop = class_0_idx[0][:-3]
        to_keep = np.ones(len(y), dtype=bool)
        to_keep[to_drop] = False
        y = y[to_keep]
        x = x[to_keep, :]
        sample_weight = class_weight.compute_sample_weight('balanced', y)
        sample_weight[0] = 0.

        unweighted = LogitNet(random_state=2, scoring='f1_micro')
        unweighted = unweighted.fit(x, y)
        unweighted_acc = f1_score(y, unweighted.predict(x), sample_weight=sample_weight,
                                  average='micro')

        weighted = LogitNet(random_state=2, scoring='f1_micro')
        weighted = weighted.fit(x, y, sample_weight=sample_weight)
        weighted_acc = f1_score(y, weighted.predict(x), sample_weight=sample_weight,
                                average='micro')

        self.assertTrue(weighted_acc >= unweighted_acc)
Example #36
    def test_relative_penalties(self):
        x, y = self.binomial[0]
        p = x.shape[1]

        # m1 no relative penalties applied
        m1 = LogitNet(alpha=1)
        m1.fit(x, y)

        # find the nonzero indices from LASSO
        nonzero = np.nonzero(m1.coef_[0])

        # unpenalize those nonzero coefs
        penalty = np.repeat(1, p)
        penalty[nonzero] = 0

        # refit the model with the unpenalized coefs
        m2 = LogitNet(alpha=1)
        m2.fit(x, y, relative_penalties=penalty)

        # verify that the unpenalized coef ests exceed the penalized ones
        # in absolute value
        assert (np.all(np.abs(m1.coef_[0]) <= np.abs(m2.coef_[0])))
Example #37
def fit_glm(args, X, y):
    print('GLM')

    # fit on full dataset and save model
    np.random.seed(1000)
    glm = LogitNet(alpha=0.5, n_lambda=20, n_jobs=5)
    glm.fit(X, y)

    with open(MODEL_DIR / f'glm_{args.dataset}.pkl', 'wb') as f:
        pickle.dump(glm, f)

    print('In-sample: ')
    tmp = glm.predict_proba(X)
    AUC = roc_auc_score(y, tmp[:, 1])
    APR = average_precision_score(y, tmp[:, 1])
    print('\tAUC ', np.round(AUC, 4))
    print('\tAPR ', np.round(APR, 4))

    print('Out-of-sample: ')
    print(glm.lambda_best_)

    # generate in-dataset CV predictions
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1111)
    np.random.seed(1000)
    glm = LogitNet(alpha=0.5, n_lambda=1, lambda_path=[glm.lambda_best_])
    cv_scores = cross_val_predict(glm,
                                  X,
                                  y,
                                  cv=kf,
                                  method='predict_proba',
                                  n_jobs=-1)
    AUC = roc_auc_score(y, cv_scores[:, 1])
    APR = average_precision_score(y, cv_scores[:, 1])
    print('\tAUC ', np.round(AUC, 4))
    print('\tAPR ', np.round(APR, 4))

    np.save(MODEL_DIR / f'glm_{args.dataset}.npy', cv_scores[:, 1])
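Because LogitNet exposes the scikit-learn estimator interface, fit_glm can hand it straight to cross_val_predict, as above. A minimal standalone sketch of that pattern, with synthetic data as a stand-in:

# Sketch: LogitNet inside scikit-learn cross-validation utilities.
from glmnet import LogitNet
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X, y = make_classification(n_samples=300, n_features=15, random_state=0)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1111)
glm = LogitNet(alpha=0.5, n_lambda=20)
cv_scores = cross_val_predict(glm, X, y, cv=kf, method='predict_proba')
print(roc_auc_score(y, cv_scores[:, 1]))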
Example #38
    def distance(self, d1, d2):
        # Extract summary statistics from the dataset
        s1 = self.statistics_calc.statistics(d1)
        s2 = self.statistics_calc.statistics(d2)

        # compute distance between the statistics
        training_set_features = np.concatenate((s1, s2), axis=0)
        label_s1 = np.zeros(shape=(len(s1), 1))
        label_s2 = np.ones(shape=(len(s2), 1))
        training_set_labels = np.concatenate((label_s1, label_s2),
                                             axis=0).ravel()

        m = LogitNet(alpha=1, n_splits=10)
        m = m.fit(training_set_features, training_set_labels)
        distance = 2.0 * (m.cv_mean_score_[np.where(
            m.lambda_path_ == m.lambda_max_)[0][0]] - 0.5)

        return distance
Example #39
    def test_coef_interpolation(self):
        x, y = self.binomial[0]
        m = LogitNet(n_folds=0, random_state=561)
        m = m.fit(x, y)

        # predict for a value of lambda between two values on the computed path
        lamb_lo = m.lambda_path_[1]
        lamb_hi = m.lambda_path_[2]

        # a value not equal to one on the computed path
        lamb_mid = (lamb_lo + lamb_hi) / 2.0

        pred_lo = m.predict_proba(x, lamb=lamb_lo)
        pred_hi = m.predict_proba(x, lamb=lamb_hi)
        pred_mid = m.predict_proba(x, lamb=lamb_mid)

        self.assertFalse(np.allclose(pred_lo, pred_mid))
        self.assertFalse(np.allclose(pred_hi, pred_mid))
Example #40
    def test_coef_interpolation(self):
        x, y = self.binomial[0]
        m = LogitNet(n_splits=0, random_state=561)
        m = m.fit(x, y)

        # predict for a value of lambda between two values on the computed path
        lamb_lo = m.lambda_path_[1]
        lamb_hi = m.lambda_path_[2]

        # a value not equal to one on the computed path
        lamb_mid = (lamb_lo + lamb_hi) / 2.0

        pred_lo = m.predict_proba(x, lamb=lamb_lo)
        pred_hi = m.predict_proba(x, lamb=lamb_hi)
        pred_mid = m.predict_proba(x, lamb=lamb_mid)

        self.assertFalse(np.allclose(pred_lo, pred_mid))
        self.assertFalse(np.allclose(pred_hi, pred_mid))
Example #41
    def distance(self, d1, d2):
        """Calculates the distance between two datasets.

        Parameters
        ----------
        d1: Python list
            Contains n1 data points.
        d2: Python list
            Contains n2 data points.

        Returns
        -------
        numpy.float
            The distance between the two input data sets.
        """
        s1, s2 = self._calculate_summary_stat(d1, d2)
        self.n_simulate = s1.shape[0]

        if not s2.shape[0] == self.n_simulate:
            raise RuntimeError(
                "The number of simulations in the two data sets should be the same in order for "
                "the classification accuracy implemented in PenLogReg to be a proper distance. Please "
                "check that `n_samples` in the `sample()` method for the sampler is equal to "
                "the number of datasets in the observations.")

        # compute distance between the statistics
        training_set_features = np.concatenate((s1, s2), axis=0)
        label_s1 = np.zeros(shape=(len(s1), 1))
        label_s2 = np.ones(shape=(len(s2), 1))
        training_set_labels = np.concatenate((label_s1, label_s2),
                                             axis=0).ravel()

        groups = np.repeat(np.arange(self.n_folds),
                           int(np.ceil(self.n_simulate / self.n_folds)))
        groups = groups[:self.n_simulate].tolist()
        groups += groups  # duplicate it as groups need to be defined for both datasets
        m = LogitNet(
            alpha=1,
            n_splits=self.n_folds)  # note we are not using random seed here!
        m = m.fit(training_set_features, training_set_labels, groups=groups)
        distance = 2.0 * (m.cv_mean_score_[np.where(
            m.lambda_path_ == m.lambda_max_)[0][0]] - 0.5)

        return distance
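The groups argument above pins each simulated point to a specific CV fold so the two concatenated datasets share fold assignments. A minimal sketch of passing explicit groups to LogitNet.fit (the data and fold count are assumptions; the fold labels here are not aligned with any simulation structure):

# Sketch: explicit CV groups with LogitNet.fit, as in the distance above.
import numpy as np
from glmnet import LogitNet
from sklearn.datasets import make_classification

n_folds, n_half = 5, 100
X, y = make_classification(n_samples=2 * n_half, n_features=10, random_state=0)
groups = np.repeat(np.arange(n_folds), int(np.ceil(n_half / n_folds)))[:n_half]
groups = np.concatenate((groups, groups))  # one fold label per sample
m = LogitNet(alpha=1, n_splits=n_folds).fit(X, y, groups=groups)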
Example #42
File: approx_lhd.py Project: JBris/abcpy
    def loglikelihood(self, y_obs, y_sim):
        if not isinstance(y_obs, list):
            raise TypeError('Observed data is not of allowed types')

        if not isinstance(y_sim, list):
            raise TypeError('simulated data is not of allowed types')

        # Check whether y_obs is same as the stored dataset.
        if self.data_set is not None:
            # check that the observations have the same length; if not, they can't be the same:
            if len(y_obs) != len(self.data_set):
                self.dataSame = False
            elif len(np.array(y_obs[0]).reshape(-1, )) == 1:
                self.dataSame = self.data_set == y_obs
            else:  # otherwise it fails when y_obs[0] is array
                self.dataSame = all(
                    [(np.array(self.data_set[i]) == np.array(y_obs[i])).all() for i in range(len(y_obs))])

        if self.stat_obs is None or self.dataSame is False:
            self.stat_obs = self.statistics_calc.statistics(y_obs)
            self.data_set = y_obs

        # Extract summary statistics from the simulated data
        stat_sim = self.statistics_calc.statistics(y_sim)
        if not stat_sim.shape[0] == self.n_simulate:
            raise RuntimeError("The number of samples in the reference data set is not the same as the number of "
                               "samples in the generated data. Please check that `n_samples` in the `sample()` method"
                               "for the sampler is equal to `n_simulate` in PenLogReg.")

        # Compute the approximate likelihood for the y_obs given theta
        y = np.append(np.zeros(self.n_simulate), np.ones(self.n_simulate))
        X = np.array(np.concatenate((stat_sim, self.ref_data_stat), axis=0))
        # define here groups for cross-validation:
        groups = np.repeat(np.arange(self.n_folds), int(np.ceil(self.n_simulate / self.n_folds)))
        groups = groups[:self.n_simulate].tolist()
        groups += groups  # duplicate it as groups need to be defined for both datasets
        m = LogitNet(alpha=1, n_splits=self.n_folds, max_iter=self.max_iter, random_state=self.seed, scoring="log_loss")
        m = m.fit(X, y, groups=groups)
        result = -np.sum((m.intercept_ + np.sum(np.multiply(m.coef_, self.stat_obs), axis=1)), axis=0)

        return result
Example #43
def LFIRE(X, Y, n_m=n_m, n_theta=n_theta, random_state=random_state):
    """ LFIRE method as a distance node.
    
    Parameters
    ----------
    X: np.ndarray
        Simulated summary statistics
    Y: np.ndarray
        Observed summary statistics
    n_m: int
        Number of simulations from marginal
    n_theta: int
        Number of simulations from likelihood
    random_state: np.random
        Random state for random generators
        
    Output
    ------
    res: np.ndarray
        Negative value of the posterior function
    
    """
    
    random_state = random_state or np.random
    
    logreg = LogitNet(alpha=1, n_splits=10,
                      n_jobs=-1, verbose=0)
    
    # Global variables
    global global_params
    params = global_params
    
    global global_marginal
    marginal = global_marginal
    
    # Generate training data
    X_sim = sample_from_likelihood(params,
                                   batch_size=n_theta-1,
                                   random_state=random_state)
    
    X_ = np.concatenate((X, X_sim, marginal), axis=0)
    Y_ = np.concatenate((np.ones(n_theta), -1*np.ones(n_m)))
    
    # Fit the model
    logreg.fit(X_, Y_)
    
    # (Unnormalized) log posterior value
    res = logreg.intercept_ + np.sum(np.multiply(logreg.coef_, Y))
    
    # Store results
    global global_param_arr
    global global_lambda_arr
    global global_intercept_arr
    global global_coef_arr
    
    global_param_arr.append([params['p_1'][0], params['p_2'][0], params['p_3'][0]])
    global_lambda_arr.append(logreg.lambda_best_[0])
    global_intercept_arr.append(logreg.intercept_)
    global_coef_arr.append(logreg.coef_.ravel())
    
    # Negative posterior value
    return np.atleast_2d(-0.5 * np.exp(res))
Example #44
##############################################################################
# Replicating figure 1 - Done!
##############################################################################
# Create temporary containers
coefs = []

# Loop over number of iterations
for i in tqdm(range(N_ITERATIONS)):
    # Fit LogisticNet with the training set
    lr = LogitNet(alpha=ALPHA,
                  n_lambda=N_LAMBDA,
                  standardize=False,
                  cut_point=CUT_POINT,
                  max_iter=MAX_ITER)
    lr = lr.fit(X, y)

    # Extract and save coefficients
    coefs.append(list(lr.coef_[0]))

coefs = np.array(coefs)
survived = 1 * (abs(coefs) > 0)
survival_rate = np.sum(survived, axis=0) / float(N_ITERATIONS)
mask = 1 * (survival_rate > SURVIVAL_RATE_CUTOFF)
coefs_updated = coefs * mask
variable_names = ['Male'] + list(data.columns)
coefs_q025 = np.percentile(coefs_updated, q=2.5, axis=0)
coefs_mean = np.mean(coefs_updated, axis=0)
coefs_q975 = np.percentile(coefs_updated, q=97.5, axis=0)
betas = pd.DataFrame({'mean': coefs_mean})
betas['lb'] = coefs_q025
Example #45
File: classifier.py Project: elfi-dev/zoo
class LogisticRegression(Classifier):
    """Logistic regression classifier for ratio estimation."""
    def __init__(self, config=None, parallel_cv=True, class_min=0.005):
        """Initializes logistic regression classifier."""
        self.config = self._resolve_config(config, parallel_cv)
        self.model = LogitNet(**self.config)
        self.class_min = class_min
        self.parameter_names = ['lambda', 'intercept', 'coef']
        self.store = {}

    def fit(self, X, y, index=0):
        """Fits logistic regression classifier."""
        self.model.fit(X, y)
        # selected lambda:
        lambda_best = np.atleast_1d(self.model.lambda_best_)
        # fitted linear model parameters:
        p_0 = np.atleast_1d(self.model.intercept_)
        p_1 = np.squeeze(self.model.coef_)
        # store as array:
        self.store[index] = np.concatenate((lambda_best, p_0, p_1))

    def get(self, index=0):
        """Returns stored model parameters."""
        params = {}
        params['lambda'] = self.store[index][0]
        params['intercept'] = self.store[index][1]
        params['coef'] = self.store[index][2:]
        return params

    def set(self, params_dict, index=0):
        """Loads model."""
        params = [
            np.atleast_1d(params_dict[param]) for param in self.parameter_names
        ]
        self.store[index] = np.concatenate(params)

    def predict_log_likelihood_ratio(self, X, index=0):
        """Predicts the log-likelihood ratio."""
        params = self.store[index][1:]
        log_ratio = params[0] + np.sum(np.multiply(params[1:], X))
        return np.maximum(log_ratio,
                          np.log(self.class_min / (1 - self.class_min)))

    @property
    def parallel_cv(self):
        """Returns does classifier run cross-validation in parallel."""
        return self.config['n_jobs'] > 1

    def _get_default_config(self, parallel_cv):
        """Returns a default config for the logistic regression."""
        return {
            'alpha': 1,
            'n_splits': 10,
            'n_jobs': cpu_count() if parallel_cv else 1,
            'cut_point': 0
        }

    def _resolve_config(self, config, parallel_cv):
        """Resolves a config for logistic regression."""
        if not isinstance(config, dict):
            config = self._get_default_config(parallel_cv)
        return config
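A hypothetical round trip through the wrapper above: fit, export the stored parameters, and load them into a fresh instance (the Classifier base class and cpu_count import come from elfi-dev/zoo; the data here is a synthetic stand-in):

# Hypothetical usage sketch for the LogisticRegression wrapper.
import numpy as np

clf = LogisticRegression(parallel_cv=False)
X = np.random.normal(size=(100, 4))
y = np.concatenate((np.ones(50), np.zeros(50)))
clf.fit(X, y, index=0)

params = clf.get(index=0)  # {'lambda': ..., 'intercept': ..., 'coef': ...}
clf2 = LogisticRegression(parallel_cv=False)
clf2.set(params, index=0)
log_ratio = clf2.predict_log_likelihood_ratio(X[0], index=0)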
Example #46
y_preds = []
y_tests = []
fpr_interp = np.linspace(0, 1, 100)
tpr_interps = []
aocs = []
for i in range(n_splits):
    print(i)
    model = LogitNet(
        fit_intercept=False,
        n_jobs=cpus)  #(cv=n_cv, n_jobs=min(cpus,n_cv), selection='random')
    X_train = person_biomarker[train_idxs[i], :]
    y_train = sample_data.phenotype[train_idxs[i]]
    X_test = person_biomarker[test_idxs[i], :]
    y_test = sample_data.phenotype[test_idxs[i]]
    model.fit(X_train.todense(), 1 * y_train)
    y_pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = sklearn.metrics.roc_curve(y_test, y_pred)
    interp_func = scipy.interpolate.interp1d(fpr, tpr)
    tpr_interp = interp_func(fpr_interp)
    tpr_interps = tpr_interps + [list(tpr_interp)]
    aoc = sklearn.metrics.roc_auc_score(y_test, y_pred)
    aocs = aocs + [aoc]
tpr_df = pd.DataFrame(tpr_interps)
tpr_df.columns = fpr_interp
tpr_df = tpr_df.melt()
tpr_df.columns = ['fpr', 'tpr']
tpr_df.to_csv(BIOMARKER_DIR +
              'results/ML_phenotype_prediction/fpr_vs_tpr_%s_%s.tsv' %
              (biomarker, dataset),
              sep='\t')
Example #47
 def test_alphas(self):
     x, y = self.binomial[0]
     for alpha in self.alphas:
         m = LogitNet(alpha=alpha, random_state=41041)
         m = m.fit(x, y)
         check_accuracy(y, m.predict(x), 0.85, alpha=alpha)
Example #48
 def test_cv_scoring(self):
     x, y = self.binomial[0]
     for method in self.scoring:
         m = LogitNet(scoring=method, random_state=52633)
         m = m.fit(x, y)
         check_accuracy(y, m.predict(x), 0.85, scoring=method)
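Taken together, the tests sketch the basic end-to-end pattern: pick a CV scoring method, fit, and evaluate with the matching scikit-learn metric. A minimal sketch using the f1_micro scorer that appears in Example #1, with synthetic data as a stand-in:

# Sketch: end-to-end LogitNet fit with a named CV scoring method.
from glmnet import LogitNet
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
m = LogitNet(scoring='f1_micro', random_state=52633).fit(X, y)
print(f1_score(y, m.predict(X), average='micro'))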