Exemplo n.º 1
def reconstruct_l2(f,tree,minfrac,maxfrac,alpha=1.0,suppress_warnings=True):
    # Bin the samples in ``f`` onto the leaves of ``tree``, average the values
    # that land in each leaf, and run ``sklm.lars_path`` on the rows of
    # ``tree.char_library(alpha)`` that belong to leaves with at least one sample.
    #
    # ``f`` is indexed as ``f[0,:]`` (binned against the leaf lower bounds) and
    # ``f[1,i]`` (the value averaged per leaf) -- presumably positions and
    # observations; confirm against the caller.
    # NOTE(review): ``minfrac`` and ``maxfrac`` are not used in the visible lines.
    #
    # Returns (tree_intervals, fy, alphas, active_vars, coef_path,
    # active_indices).
    tree_intervals = np.array([node.lbound for node in tree.leaves()])
    # Leaf index of each sample, found by locating f[0,i] among the lower bounds.
    indices = np.digitize(f[0,:],tree_intervals) - 1
    fy = np.zeros([tree.size])
    fcts = np.zeros(np.shape(fy))
    for (i,idx) in enumerate(indices):
        fcts[idx] += 1.0
        n = fcts[idx]
        # Incremental running mean of the values assigned to leaf ``idx``.
        fy[idx] = ((n-1)/n)*fy[idx]+(1/n)*f[1,i]
    # Only leaves that received at least one sample take part in the fit.
    active_indices = np.where(fcts>0)[0]
    #print active_indices
    cl = tree.char_library(alpha)
    # NOTE(review): both lars_path calls below are cut off in this excerpt
    # (their remaining arguments and closing parenthesis are missing), and the
    # ``else:`` that presumably separated them is absent -- restore from the
    # original source before use.
    if suppress_warnings:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', UserWarning)
            alphas,active_vars,coef_path = sklm.lars_path(cl[active_indices,:],
        alphas,active_vars,coef_path = sklm.lars_path(cl[active_indices,:],

    return tree_intervals, fy, alphas, active_vars, coef_path, active_indices
Exemplo n.º 2
def test_lasso_lars_vs_lasso_cd(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results.
    # NOTE(review): ``y`` is never assigned in this function; presumably it is
    # a module-level fixture -- confirm.
    X = 3 * diabetes.data

    alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso')
    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
    # Refit coordinate descent at every alpha of the LARS path and compare its
    # coefficients with the matching path column.
    for c, a in zip(lasso_path.T, alphas):
        # NOTE(review): the body of this ``if`` is missing from this excerpt.
        if a == 0:
        lasso_cd.alpha = a
        lasso_cd.fit(X, y)
        error = linalg.norm(c - lasso_cd.coef_)
        assert_less(error, 0.01)

    # similar test, with the classifiers
    for alpha in np.linspace(1e-2, 1 - 1e-2, 20):
        clf1 = linear_model.LassoLars(alpha=alpha, normalize=False).fit(X, y)
        clf2 = linear_model.Lasso(alpha=alpha, tol=1e-8,
                                  normalize=False).fit(X, y)
        err = linalg.norm(clf1.coef_ - clf2.coef_)
        assert_less(err, 1e-3)

    # same test, with normalized data
    X = diabetes.data
    alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso')
    # NOTE(review): the Lasso constructor call below is cut off in this excerpt
    # (remaining arguments and closing parenthesis are missing).
    lasso_cd = linear_model.Lasso(fit_intercept=False, normalize=True,
    for c, a in zip(lasso_path.T, alphas):
        if a == 0:
        lasso_cd.alpha = a
        lasso_cd.fit(X, y)
        error = linalg.norm(c - lasso_cd.coef_)
        assert_less(error, 0.01)
Exemplo n.º 3
def test_lasso_lars_vs_lasso_cd_early_stopping(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results when early stopping is used.
    # (test : before, in the middle, and in the last part of the path)
    # NOTE(review): ``X`` and ``y`` are never assigned in this function;
    # presumably they are module-level fixtures -- confirm.
    alphas_min = [10, 0.9, 1e-4]

    # For each stopping point, coordinate descent is fitted at the last alpha
    # reached by the path and compared with the final path column.
    # NOTE(review): the lars_path calls in both loops (and the second Lasso
    # constructor) are cut off in this excerpt; ``alpha_min`` is presumably
    # passed in the missing part -- restore from the original source.
    for alpha_min in alphas_min:
        alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
        lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
        lasso_cd.alpha = alphas[-1]
        lasso_cd.fit(X, y)
        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
        assert_less(error, 0.01)

    # same test, with normalization
    for alpha_min in alphas_min:
        alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
        lasso_cd = linear_model.Lasso(fit_intercept=True, normalize=True,
        lasso_cd.alpha = alphas[-1]
        lasso_cd.fit(X, y)
        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
        assert_less(error, 0.01)
Exemplo n.º 4
def test_lars_path_positive_constraint():
    # this is the main test for the positive parameter on the lars_path method
    # the estimator classes just make use of this function

    # we do the test on the diabetes dataset

    # ensure that we get negative coefficients when positive=False
    # and all positive when positive=True
    # for method 'lar' (default) and lasso

    # Once deprecation of LAR + positive option is done use these:
    # assert_raises(ValueError, linear_model.lars_path, diabetes['data'],
    #               diabetes['target'], method='lar', positive=True)

    # LAR combined with positive=True is the deprecated combination, so it
    # must emit the DeprecationWarning.
    with pytest.warns(DeprecationWarning, match="broken"):
        linear_model.lars_path(diabetes['data'], diabetes['target'],
                               return_path=True, method='lar',
                               positive=True)

    method = 'lasso'
    # Unconstrained path: at least one coefficient must go negative.
    alpha, active, coefs = \
        linear_model.lars_path(diabetes['data'], diabetes['target'],
                               return_path=True, method=method,
                               positive=False)
    assert_true(coefs.min() < 0)

    # Constrained path: no coefficient may be negative.
    alpha, active, coefs = \
        linear_model.lars_path(diabetes['data'], diabetes['target'],
                               return_path=True, method=method,
                               positive=True)
    assert_true(coefs.min() >= 0)
Exemplo n.º 5
def test_no_path():
    # With ``return_path=False`` lars_path must hand back exactly the end
    # point of the path it would otherwise return in full.
    data, target = diabetes.data, diabetes.target

    full_alphas, _, full_coefs = linear_model.lars_path(
        data, target, method="lar")
    last_alpha, _, last_coef = linear_model.lars_path(
        data, target, method="lar", return_path=False)

    assert_array_almost_equal(last_coef, full_coefs[:, -1])
    assert_true(last_alpha == full_alphas[-1])
Exemplo n.º 6
def test_no_path_precomputed():
    # With a precomputed Gram matrix, ``return_path=False`` must still yield
    # the final point of the full path.
    shared = dict(method='lar', Gram=G)
    path_alphas, _, path_coefs = linear_model.lars_path(X, y, **shared)
    end_alpha, _, end_coef = linear_model.lars_path(
        X, y, return_path=False, **shared)

    assert_array_almost_equal(end_coef, path_coefs[:, -1])
    assert end_alpha == path_alphas[-1]
Exemplo n.º 7
def test_no_path():
    # ``return_path=False`` must give the same final alpha and coefficients
    # as the last step of the full path.
    whole_path = linear_model.lars_path(X, y, method='lar')
    end_point = linear_model.lars_path(X, y, method='lar', return_path=False)

    path_alphas, _, path_coefs = whole_path
    final_alpha, _, final_coef = end_point

    assert_array_almost_equal(final_coef, path_coefs[:, -1])
    assert final_alpha == path_alphas[-1]
Exemplo n.º 8
def test_all_precomputed():
    # lars_path fed with precomputed Gram and Xy must reproduce the result it
    # gets when it derives both from X and y itself.
    gram = np.dot(X.T, X)
    x_dot_y = np.dot(X.T, y)
    for algo in ('lar', 'lasso'):
        plain = linear_model.lars_path(X, y, method=algo)
        precomputed = linear_model.lars_path(
            X, y, Gram=gram, Xy=x_dot_y, method=algo)
        # Compare alphas, active set and coefficient path pairwise.
        for reference, candidate in zip(plain, precomputed):
            assert_array_almost_equal(reference, candidate)
Exemplo n.º 9
def test_no_path_precomputed():
    # Test that the ``return_path=False`` option with Gram remains correct

    G = np.dot(diabetes.data.T, diabetes.data)

    # Full path first, then only its end point; both use the same Gram matrix.
    alphas_, active_, coef_path_ = linear_model.lars_path(diabetes.data, diabetes.target, method="lar", Gram=G)
    alpha_, active, coef = linear_model.lars_path(
        diabetes.data, diabetes.target, method="lar", Gram=G, return_path=False
    )

    assert_array_almost_equal(coef, coef_path_[:, -1])
    assert_true(alpha_ == alphas_[-1])
Exemplo n.º 10
def test_all_precomputed():
    # Test that lars_path with precomputed Gram and Xy gives the right answer
    X, y = diabetes.data, diabetes.target
    G = np.dot(X.T, X)
    Xy = np.dot(X.T, y)
    for method in "lar", "lasso":
        output = linear_model.lars_path(X, y, method=method)
        output_pre = linear_model.lars_path(X, y, Gram=G, Xy=Xy, method=method)
        # Every returned element must agree between the two calls.
        for expected, got in zip(output, output_pre):
            assert_array_almost_equal(expected, got)
Exemplo n.º 11
def test_no_path_all_precomputed():
    # With both Gram and Xy precomputed, ``return_path=False`` must still
    # return the end point of the (early-stopped) lasso path.
    design, target = 3 * diabetes.data, diabetes.target
    gram = np.dot(design.T, design)
    x_dot_y = np.dot(design.T, target)
    options = dict(method="lasso", Gram=gram, Xy=x_dot_y, alpha_min=0.9)

    path_alphas, _, path_coefs = linear_model.lars_path(
        design, target, **options)
    end_alpha, _, end_coef = linear_model.lars_path(
        design, target, return_path=False, **options)

    assert_array_almost_equal(end_coef, path_coefs[:, -1])
    assert_true(end_alpha == path_alphas[-1])
Exemplo n.º 12
def test_lasso_lars_vs_lasso_cd_positive(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results when using the positive option

    # This test is basically a copy of the above with additional positive
    # option. However for the middle part, the comparison of coefficient values
    # for a range of alphas, we had to make an adaptation. See below.

    # NOTE(review): ``y`` is never assigned in this function; presumably it is
    # a module-level fixture -- confirm.

    # not normalized data
    X = 3 * diabetes.data

    # NOTE(review): the lars_path call below is cut off in this excerpt
    # (remaining arguments and closing parenthesis are missing).
    alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True)
    for c, a in zip(lasso_path.T, alphas):
        # NOTE(review): the body of this ``if`` is missing from this excerpt.
        if a == 0:
        lasso_cd.alpha = a
        lasso_cd.fit(X, y)
        error = linalg.norm(c - lasso_cd.coef_)
        assert_less(error, 0.01)

    # The range of alphas chosen for coefficient comparison here is restricted
    # as compared with the above test without the positive option. This is due
    # to the circumstance that the Lars-Lasso algorithm does not converge to
    # the least-squares-solution for small alphas, see 'Least Angle Regression'
    # by Efron et al 2004. The coefficients are typically in congruence up to
    # the smallest alpha reached by the Lars-Lasso algorithm and start to
    # diverge thereafter.  See
    # https://gist.github.com/michigraber/7e7d7c75eca694c7a6ff

    for alpha in np.linspace(6e-1, 1 - 1e-2, 20):
        clf1 = linear_model.LassoLars(fit_intercept=False, alpha=alpha,
                                      normalize=False, positive=True).fit(X, y)
        clf2 = linear_model.Lasso(fit_intercept=False, alpha=alpha, tol=1e-8,
                                  normalize=False, positive=True).fit(X, y)
        err = linalg.norm(clf1.coef_ - clf2.coef_)
        assert_less(err, 1e-3)

    # normalized data
    X = diabetes.data
    # NOTE(review): the lars_path call below is cut off in this excerpt as well.
    alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso',
    lasso_cd = linear_model.Lasso(fit_intercept=False, normalize=True,
                                  tol=1e-8, positive=True)
    for c, a in zip(lasso_path.T[:-1], alphas[:-1]):  # don't include alpha=0
        lasso_cd.alpha = a
        lasso_cd.fit(X, y)
        error = linalg.norm(c - lasso_cd.coef_)
        assert_less(error, 0.01)
Exemplo n.º 13
    def test_Lasso_Path(self):
        """Path functions reached through ``df.lm`` must match the plain ``lm`` calls."""

        def check_first_three(expected, result):
            # Each path function's first three outputs are compared in order.
            for position in range(3):
                self.assert_numpy_array_almost_equal(expected[position],
                                                     result[position])

        diabetes = datasets.load_diabetes()
        X = diabetes.data
        y = diabetes.target
        # In-place scaling, exactly as the frame-side scaling below.
        X /= X.std(axis=0)

        df = pdml.ModelFrame(diabetes)
        df.data /= df.data.std(axis=0, ddof=False)

        self.assert_numpy_array_almost_equal(df.data.values, X)

        eps = 5e-3

        # lasso_path
        check_first_three(
            lm.lasso_path(X, y, eps, fit_intercept=False),
            df.lm.lasso_path(eps=eps, fit_intercept=False))

        # enet_path
        check_first_three(
            lm.enet_path(X, y, eps=eps, l1_ratio=0.8, fit_intercept=False),
            df.lm.enet_path(eps=eps, l1_ratio=0.8, fit_intercept=False))

        # enet_path with the positive constraint
        check_first_three(
            lm.enet_path(X, y, eps=eps, l1_ratio=0.8, positive=True,
                         fit_intercept=False),
            df.lm.enet_path(eps=eps, l1_ratio=0.8, positive=True,
                            fit_intercept=False))

        # lars_path in lasso mode
        check_first_three(
            lm.lars_path(X, y, method='lasso', verbose=True),
            df.lm.lars_path(method='lasso', verbose=True))
Exemplo n.º 14
def estPath(values):
    """Estimate the LASSO regularization path with ``lars_path``.

    values: dict holding the design matrix under "x" and the target under "y"

    Returns the tuple (alphas, coefs):
       alphas: regularization lambdas
       coefs: coef matrix for features and alphas
    """
    X, y = values["x"], values["y"]
    alphas, _, coefs = linear_model.lars_path(X, y, method='lasso', verbose=True)
    # The debug prints and plotting that used to follow this return could never
    # run, so they have been dropped.
    return alphas, coefs
Exemplo n.º 15
def test_lasso_lars_vs_lasso_cd_ill_conditioned():
    # Test lasso lars on a very ill-conditioned design, and check that
    # it does not blow up, and stays somewhat close to a solution given
    # by the coordinate descent solver
    # Also test that lasso_path (using lars_path output style) gives
    # the same result as lars_path and previous lasso output style
    # under these conditions.
    rng = np.random.RandomState(42)

    # Generate data
    n, m = 70, 100
    k = 5
    X = rng.randn(n, m)
    # Sparse ground truth: only the first k weights are non-zero, each with a
    # random sign and a magnitude in [1, 2).
    w = np.zeros((m, 1))
    i = np.arange(0, m)
    supp = i[:k]
    w[supp] = np.sign(rng.randn(k, 1)) * (rng.rand(k, 1) + 1)
    y = np.dot(X, w)
    # Add uniform noise scaled by sigma, then flatten y to one dimension.
    sigma = 0.2
    y += sigma * rng.rand(*y.shape)
    y = y.squeeze()
    lars_alphas, _, lars_coef = linear_model.lars_path(X, y, method='lasso')

    # NOTE(review): the lasso_path call below is cut off in this excerpt
    # (remaining arguments and closing parenthesis are missing).
    _, lasso_coef2, _ = linear_model.lasso_path(X, y,

    assert_array_almost_equal(lars_coef, lasso_coef2, decimal=1)
Exemplo n.º 16
def test_simple():
    # Principle of Lars is to keep covariances tied and decreasing

    # also test verbose output
    from io import StringIO
    import sys
    old_stdout = sys.stdout
    try:
        # Swallow the verbose output produced by lars_path.
        sys.stdout = StringIO()

        _, _, coef_path_ = linear_model.lars_path(
            X, y, method='lar', verbose=10)

        sys.stdout = old_stdout

        for i, coef_ in enumerate(coef_path_.T):
            res = y - np.dot(X, coef_)
            cov = np.dot(X.T, res)
            C = np.max(abs(cov))
            eps = 1e-3
            # Number of features whose covariance is tied with the maximum.
            ocur = len(cov[C - eps < abs(cov)])
            if i < X.shape[1]:
                assert ocur == i + 1
            else:
                # no more than max_pred variables can go into the active set
                assert ocur == X.shape[1]
    finally:
        # Restore stdout even when lars_path or an assertion raises.
        sys.stdout = old_stdout
Exemplo n.º 17
def test_lasso_lars_vs_lasso_cd_ill_conditioned():
    # Test lasso lars on a very ill-conditioned design, and check that
    # it does not blow up, and stays somewhat close to a solution given
    # by the coordinate descent solver
    rng = np.random.RandomState(42)

    # Generate data
    n, m = 80, 100
    k = 5
    X = rng.randn(n, m)
    # Sparse ground truth: only the first k weights are non-zero, each with a
    # random sign and a magnitude in [1, 2).
    w = np.zeros((m, 1))
    i = np.arange(0, m)
    supp = i[:k]
    w[supp] = np.sign(rng.randn(k, 1)) * (rng.rand(k, 1) + 1)
    y = np.dot(X, w)
    # Add uniform noise scaled by sigma, then flatten y to one dimension.
    sigma = 0.2
    y += sigma * rng.rand(*y.shape)
    y = y.squeeze()

    # Record every UserWarning so the test can check that lars_path warned.
    # NOTE(review): the lars_path call below is cut off in this excerpt
    # (remaining arguments and closing parenthesis are missing).
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        lars_alphas, _, lars_coef = linear_model.lars_path(X, y,
    assert_true(len(warning_list) > 0)
    assert_true(('Dropping a regressor' in warning_list[0].message.args[0])
                or ('Early stopping' in warning_list[0].message.args[0]))

    # One column of coordinate-descent coefficients per alpha of the LARS path.
    # NOTE(review): the ``for`` header below is cut off in this excerpt too.
    lasso_coef = np.zeros((w.shape[0], len(lars_alphas)))
    for i, model in enumerate(linear_model.lasso_path(X, y, alphas=lars_alphas,
        lasso_coef[:, i] = model.coef_
    np.testing.assert_array_almost_equal(lars_coef, lasso_coef, decimal=1)
def test_lasso_path_return_models_vs_new_return_gives_same_coefficients():
    # Test that lasso_path with lars_path style output gives the
    # same result

    # Some toy data
    X = np.array([[1, 2, 3.1], [2.3, 5.4, 4.3]]).T
    y = np.array([1, 2, 3.1])
    alphas = [5., 1., .5]
    # Compute the lasso_path
    # NOTE(review): the list comprehension below is cut off in this excerpt
    # (remaining arguments and closing brackets are missing).
    f = ignore_warnings
    coef_path = [e.coef_ for e in f(lasso_path)(X, y, alphas=alphas,

    # Use lars_path and lasso_path(new output) with 1D linear interpolation
    # to compute the same path
    alphas_lars, _, coef_path_lars = lars_path(X, y, method='lasso')
    # Both alphas and coefficient columns are reversed before being handed to
    # interp1d.
    coef_path_cont_lars = interpolate.interp1d(alphas_lars[::-1],
                                               coef_path_lars[:, ::-1])
    # NOTE(review): the lasso_path call below is cut off in this excerpt.
    alphas_lasso2, coef_path_lasso2, _ = lasso_path(X, y, alphas=alphas,
    coef_path_cont_lasso = interpolate.interp1d(alphas_lasso2[::-1],
                                                coef_path_lasso2[:, ::-1])

    # NOTE(review): the line below is the tail of an assertion whose beginning
    # is missing from this excerpt.
                                         np.asarray(coef_path).T, decimal=1)
Exemplo n.º 19
def test_simple():
    # Principle of Lars is to keep covariances tied and decreasing

    # also test verbose output
    from sklearn.externals.six.moves import cStringIO as StringIO
    import sys
    old_stdout = sys.stdout
    try:
        # Swallow the verbose output produced by lars_path.
        sys.stdout = StringIO()

        alphas_, active, coef_path_ = linear_model.lars_path(
            diabetes.data, diabetes.target, method="lar", verbose=10)

        sys.stdout = old_stdout

        # NOTE(review): ``X`` and ``y`` are not assigned in this function;
        # presumably module-level aliases of the diabetes data -- confirm.
        for (i, coef_) in enumerate(coef_path_.T):
            res = y - np.dot(X, coef_)
            cov = np.dot(X.T, res)
            C = np.max(abs(cov))
            eps = 1e-3
            # Number of features whose covariance is tied with the maximum.
            ocur = len(cov[C - eps < abs(cov)])
            if i < X.shape[1]:
                assert_true(ocur == i + 1)
            else:
                # no more than max_pred variables can go into the active set
                assert_true(ocur == X.shape[1])
    finally:
        # Restore stdout even when lars_path or an assertion raises.
        sys.stdout = old_stdout
Exemplo n.º 20
def test_singular_matrix():
    # Test when input is a singular matrix
    X1 = np.array([[1, 1.], [1., 1.]])
    y1 = np.array([1, 1])
    alphas, active, coef_path = linear_model.lars_path(X1, y1)
    # This variant expects three path steps, the last two being identical.
    assert_array_almost_equal(coef_path.T, [[0, 0], [1, 0], [1, 0]])
Exemplo n.º 21
def test_lars_path_gram_equivalent(method, return_path):
    # NOTE(review): only two argument fragments of this test's body are present
    # in this excerpt; the calls they belong to and the assertions are missing.
    # Restore the body from the original source before use.
            Xy=Xy, Gram=G, n_samples=n_samples, method=method,
            X, y, Gram=G, method=method,
Exemplo n.º 22
def test_collinearity():
    """Check that lars_path is robust to collinearity in input"""
    # The first two columns of X are identical.
    X = np.array([[3.0, 3.0, 1.0], [2.0, 2.0, 0.0], [1.0, 1.0, 0]])
    y = np.array([1.0, 0.0, 0])

    _, _, coef_path_ = linear_model.lars_path(X, y, alpha_min=0.01)
    assert_true(not np.isnan(coef_path_).any())
    residual = np.dot(X, coef_path_[:, -1]) - y
    assert_less((residual ** 2).sum(), 1.0)  # just make sure it's bounded

    # An all-zero target must produce an all-zero coefficient path.
    n_samples = 10
    X = np.random.rand(n_samples, 5)
    y = np.zeros(n_samples)
    _, _, coef_path_ = linear_model.lars_path(
        X, y, Gram="auto", copy_X=False, copy_Gram=False, alpha_min=0.0, method="lasso", verbose=0, max_iter=500
    )
    assert_array_almost_equal(coef_path_, np.zeros_like(coef_path_))
Exemplo n.º 23
def test_lasso_gives_lstsq_solution():
    # Test that Lars Lasso gives least square solution at the end
    # of the path
    alphas_, active, coef_path_ = linear_model.lars_path(X, y, method="lasso")
    coef_lstsq = np.linalg.lstsq(X, y)[0]
    # The last column of the path is compared with the least-squares fit.
    assert_array_almost_equal(coef_lstsq, coef_path_[:, -1])
Exemplo n.º 24
    def regularization_path(self):
        """Run ``lm.lars_path`` on a linear system built from this object's matrices.

        The design matrix and target are assembled from ``self.C1``,
        ``self.C2``, ``self.b``, ``self.beta`` and ``self.Phi.view``.

        Returns the result of ``zip(alphas, coefs.T)``: one
        ``(alpha, coefficient vector)`` pair per step of the path.
        """
        size = len(self.C1)
        identity = np.eye(size)

        # Pseudo-inverse of the beta-regularized, negated C1.
        sigma = np.linalg.pinv(-self.C1 + self.beta * identity)

        design = np.dot(self.Phi.view, identity + np.dot(sigma, -self.C2))
        target = np.dot(self.Phi.view, np.dot(sigma, self.b))

        path_alphas, _, path_coefs = lm.lars_path(design, target, eps=1e-6)
        return zip(path_alphas, path_coefs.T)
Exemplo n.º 25
def test_collinearity():
    """lars_path must cope with collinear input columns."""
    # The first two columns of the design are identical.
    design = np.array([
        [3., 3., 1.],
        [2., 2., 0.],
        [1., 1., 0],
    ])
    target = np.array([1., 0., 0])

    _, _, path = linear_model.lars_path(design, target)

    contains_nan = np.isnan(path).any()
    assert (not contains_nan)
    # The end of the path must reproduce the target.
    fitted = np.dot(design, path[:, -1])
    assert_array_almost_equal(fitted, target)
Exemplo n.º 26
 def sets(self, x, y, n_keep):
     """Pick column names of ``x`` along the LARS path fitted on ``x`` and ``y``.

     Walks the coefficient path step by step and returns the names of the
     columns whose coefficient exceeds 1e-9 at the first step where at
     least ``n_keep`` columns qualify. If no step reaches ``n_keep``, the
     columns selected at the last step are returned.

     NOTE(review): only coefficients above +1e-9 count, so features with a
     negative coefficient are never selected -- confirm this is intended.
     """
     _, _, coef_path = linear_model.lars_path(x.values, y.values)
     # Debug output kept from the original; the call form works on Python 2 and 3.
     print(coef_path)
     cols = []  # defined up front so an empty path cannot leave it unbound
     for coefs in coef_path.T:
         cols = [name for name, coef in zip(x.columns, coefs) if coef > 1e-9]
         if len(cols) >= n_keep:
             return cols
     return cols
Exemplo n.º 27
def test_lars_path_positive_constraint():
    # this is the main test for the positive parameter on the lars_path method
    # the estimator classes just make use of this function

    # we do the test on the diabetes dataset

    # ensure that we get negative coefficients when positive=False
    # and all positive when positive=True
    # for method 'lar' (default) and lasso
    for method in ["lar", "lasso"]:
        # Unconstrained path: at least one coefficient must go negative.
        alpha, active, coefs = linear_model.lars_path(
            diabetes["data"], diabetes["target"], return_path=True, method=method, positive=False
        )
        assert_true(coefs.min() < 0)

        # Constrained path: no coefficient may be negative.
        alpha, active, coefs = linear_model.lars_path(
            diabetes["data"], diabetes["target"], return_path=True, method=method, positive=True
        )
        assert_true(coefs.min() >= 0)
Exemplo n.º 28
def test_collinearity():
    """lars_path must stay finite and bounded on collinear input."""
    # The first two columns of the design are identical.
    design = np.array([
        [3., 3., 1.],
        [2., 2., 0.],
        [1., 1., 0],
    ])
    target = np.array([1., 0., 0])

    _, _, path = linear_model.lars_path(design, target, alpha_min=0.01)

    contains_nan = np.isnan(path).any()
    assert_true(not contains_nan)

    # Only boundedness of the squared error is checked, not an exact fit.
    misfit = np.dot(design, path[:, -1]) - target
    squared_error = (misfit ** 2).sum()
    assert_less(squared_error, 1.)
Exemplo n.º 29
def regularized_model_features(X, y):
    """Plot the LASSO path of ``X.values`` against ``y`` computed by ``lars_path``.

    The x-axis is the L1 norm of the coefficients at each step, divided by
    its value at the last step; one curve is drawn per coefficient.
    Nothing is returned.
    """
    _, _, path = lars_path(X.values, y, method='lasso', verbose=True)

    # L1 norm of the coefficient vector at each step, scaled by the final one.
    l1_norms = np.sum(np.abs(path.T), axis=1)
    l1_norms /= l1_norms[-1]

    plt.plot(l1_norms, path.T)
    # The limits are queried as in the original but not used afterwards.
    _lower, _upper = plt.ylim()
    plt.xlabel('|coef| / max|coef|')
    plt.title('LASSO Path')
Exemplo n.º 30
def test_singular_matrix():
    # Singular input: lars_path needs its "drop for good strategy" here to
    # give a good answer, and it must warn when it uses it.
    singular_X = np.array([[1, 1.], [1., 1.]])
    ones_y = np.array([1, 1])

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", UserWarning)
        alphas, active, coef_path = linear_model.lars_path(singular_X, ones_y)

    assert_true(len(caught) > 0)
    first_message = caught[0].message.args[0]
    assert_true('Dropping a regressor' in first_message)

    expected_steps = [[0, 0], [1, 0]]
    assert_array_almost_equal(coef_path.T, expected_steps)
Exemplo n.º 31
def test_X_none_gram_not_none():
    # Passing a Gram value while X is None must be rejected with this message.
    expected_message = "X cannot be None if Gram is not None"
    with pytest.raises(ValueError, match=expected_message):
        lars_path(X=None, y=[1], Gram='not None')
Exemplo n.º 32
def test_x_none_gram_none_raises_value_error():
    # lars_path given neither X nor Gram must raise, even when Xy is supplied.
    x_dot_y = np.dot(X.T, y)
    with pytest.raises(ValueError):
        linear_model.lars_path(None, y, Gram=None, Xy=x_dot_y)
Exemplo n.º 33
def test_lasso_gives_lstsq_solution():
    # The last point of the Lars Lasso path must coincide with the ordinary
    # least-squares solution.
    path_coefs = linear_model.lars_path(X, y, method="lasso")[2]
    end_of_path = path_coefs[:, -1]
    least_squares = np.linalg.lstsq(X, y)[0]
    assert_array_almost_equal(least_squares, end_of_path)
Exemplo n.º 34
def test_lasso_lars_vs_lasso_cd_positive(verbose=False):
    # Test that LassoLars and Lasso using coordinate descent give the
    # same results when using the positive option

    # This test is basically a copy of the above with additional positive
    # option. However for the middle part, the comparison of coefficient values
    # for a range of alphas, we had to make an adaptation. See below.

    # NOTE(review): ``y`` is never assigned in this function; presumably it is
    # a module-level fixture -- confirm.

    # not normalized data
    X = 3 * diabetes.data

    # NOTE(review): the lars_path call below is cut off in this excerpt
    # (remaining arguments and closing parenthesis are missing).
    alphas, _, lasso_path = linear_model.lars_path(X,
    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True)
    for c, a in zip(lasso_path.T, alphas):
        # NOTE(review): the body of this ``if`` is missing from this excerpt.
        if a == 0:
        lasso_cd.alpha = a
        lasso_cd.fit(X, y)
        error = linalg.norm(c - lasso_cd.coef_)
        assert_less(error, 0.01)

    # The range of alphas chosen for coefficient comparison here is restricted
    # as compared with the above test without the positive option. This is due
    # to the circumstance that the Lars-Lasso algorithm does not converge to
    # the least-squares-solution for small alphas, see 'Least Angle Regression'
    # by Efron et al 2004. The coefficients are typically in congruence up to
    # the smallest alpha reached by the Lars-Lasso algorithm and start to
    # diverge thereafter.  See
    # https://gist.github.com/michigraber/7e7d7c75eca694c7a6ff

    # NOTE(review): ``alpha`` is not passed to either estimator in the visible
    # lines of this loop; argument lines may have been dropped -- confirm.
    for alpha in np.linspace(6e-1, 1 - 1e-2, 20):
        clf1 = linear_model.LassoLars(fit_intercept=False,
                                      positive=True).fit(X, y)
        clf2 = linear_model.Lasso(fit_intercept=False,
                                  positive=True).fit(X, y)
        err = linalg.norm(clf1.coef_ - clf2.coef_)
        assert_less(err, 1e-3)

    # normalized data
    X = diabetes.data
    # NOTE(review): the lars_path call and the Lasso constructor below are both
    # cut off in this excerpt.
    alphas, _, lasso_path = linear_model.lars_path(X,
    lasso_cd = linear_model.Lasso(fit_intercept=False,
    for c, a in zip(lasso_path.T[:-1], alphas[:-1]):  # don't include alpha=0
        lasso_cd.alpha = a
        lasso_cd.fit(X, y)
        error = linalg.norm(c - lasso_cd.coef_)
        assert_less(error, 0.01)
Exemplo n.º 35
def test_singular_matrix():
    # lars_path on a singular (rank-one) design must give a two-step path.
    singular_X = np.array([[1, 1.], [1., 1.]])
    ones_y = np.array([1, 1])
    _, _, path = linear_model.lars_path(singular_X, ones_y)
    expected_steps = [[0, 0], [1, 0]]
    assert_array_almost_equal(path.T, expected_steps)
Exemplo n.º 36
    def solve(self, fraction_evaluated, dim):
        # Estimate one value per feature (``phi``, length self.M) for output
        # dimension ``dim``: optionally select a subset of features with an
        # L1-regularized fit, then solve a weighted least-squares problem on
        # that subset, with one feature eliminated so that the values sum to
        # link.f(fx[dim]) - link.f(fnull[dim]).
        #
        # Returns (phi, np.ones(len(phi))), or (zeros, ones) of length self.M
        # when no feature is selected.
        #
        # NOTE(review): several statements in this excerpt are cut off (the
        # log.info call before ``w_sqrt_aug``, the ``mask_aug`` expression, the
        # final log.debug call) and the ``else:`` introducing the fixed-alpha
        # branch is missing -- restore from the original source before use.
        eyAdj = self.linkfv(self.ey[:, dim]) - self.link.f(self.fnull[dim])
        # Row sums of the mask matrix.
        s = np.sum(self.maskMatrix, 1)

        # do feature selection if we have not well enumerated the space
        nonzero_inds = np.arange(self.M)
        log.debug("fraction_evaluated = {0}".format(fraction_evaluated))
        # if self.l1_reg == "auto":
        #     warnings.warn(
        #         "l1_reg=\"auto\" is deprecated and in the next version (v0.29) the behavior will change from a " \
        #         "conditional use of AIC to simply \"num_features(10)\"!"
        #     )
        # Selection runs when l1_reg is set explicitly, or when it is "auto"
        # and fewer than 20% of the space has been evaluated.
        if (self.l1_reg not in [
                "auto", False, 0
        ]) or (fraction_evaluated < 0.2 and self.l1_reg == "auto"):
            w_aug = np.hstack(
                (self.kernelWeights * (self.M - s), self.kernelWeights * s))
            log.info("np.sum(w_aug) = {0}".format(np.sum(w_aug)))
            log.info("np.sum(self.kernelWeights) = {0}".format(
            w_sqrt_aug = np.sqrt(w_aug)
            eyAdj_aug = np.hstack(
                (eyAdj, eyAdj -
                 (self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim]))))
            eyAdj_aug *= w_sqrt_aug
            mask_aug = np.transpose(
                w_sqrt_aug *
                    (self.maskMatrix, self.maskMatrix - 1))))
            #var_norms = np.array([np.linalg.norm(mask_aug[:, i]) for i in range(mask_aug.shape[1])])

            # select a fixed number of top features
            if isinstance(self.l1_reg,
                          str) and self.l1_reg.startswith("num_features("):
                # The count is parsed from between "num_features(" and ")".
                r = int(self.l1_reg[len("num_features("):-1])
                nonzero_inds = lars_path(mask_aug, eyAdj_aug, max_iter=r)[1]

            # use an adaptive regularization method
            elif self.l1_reg == "auto" or self.l1_reg == "bic" or self.l1_reg == "aic":
                c = "aic" if self.l1_reg == "auto" else self.l1_reg
                nonzero_inds = np.nonzero(
                    LassoLarsIC(criterion=c).fit(mask_aug, eyAdj_aug).coef_)[0]

            # use a fixed regularization coefficient
                nonzero_inds = np.nonzero(
                    Lasso(alpha=self.l1_reg).fit(mask_aug, eyAdj_aug).coef_)[0]

        if len(nonzero_inds) == 0:
            return np.zeros(self.M), np.ones(self.M)

        # eliminate one variable with the constraint that all features sum to the output
        eyAdj2 = eyAdj - self.maskMatrix[:, nonzero_inds[-1]] * (
            self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim]))
        etmp = np.transpose(
            np.transpose(self.maskMatrix[:, nonzero_inds[:-1]]) -
            self.maskMatrix[:, nonzero_inds[-1]])
        log.debug("etmp[:4,:] {0}".format(etmp[:4, :]))

        # solve a weighted least squares equation to estimate phi
        tmp = np.transpose(
            np.transpose(etmp) * np.transpose(self.kernelWeights))
        tmp2 = np.linalg.inv(np.dot(np.transpose(tmp), etmp))
        w = np.dot(tmp2, np.dot(np.transpose(tmp), eyAdj2))
        log.debug("np.sum(w) = {0}".format(np.sum(w)))
        log.debug("self.link(self.fx) - self.link(self.fnull) = {0}".format(
            self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim])))
        log.debug("self.fx = {0}".format(self.fx[dim]))
        log.debug("self.link(self.fx) = {0}".format(self.link.f(self.fx[dim])))
        log.debug("self.fnull = {0}".format(self.fnull[dim]))
        log.debug("self.link(self.fnull) = {0}".format(
        phi = np.zeros(self.M)
        phi[nonzero_inds[:-1]] = w
        # The eliminated feature receives whatever is left of the total.
        phi[nonzero_inds[-1]] = (self.link.f(self.fx[dim]) -
                                 self.link.f(self.fnull[dim])) - sum(w)
        log.info("phi = {0}".format(phi))

        # clean up any rounding errors
        for i in range(self.M):
            if np.abs(phi[i]) < 1e-10:
                phi[i] = 0

        return phi, np.ones(len(phi))
Exemplo n.º 37
def run_simple_model(train_x, train_y, dev_x, dev_y, test_x, test_y, model_type, out_dir=None, class_weight=None):
    """Train one scikit-learn model selected by name and return its test accuracy.

    The model is built with default hyper-parameters (plus ``class_weight``
    where the estimator accepts it), fitted on the training split, and its
    predictions on the train and test splits are rounded to the nearest
    integer before being scored with ``accuracy_score``.

    Args:
        train_x, train_y: training features and labels.
        dev_x, dev_y: accepted for interface compatibility; not used here.
        test_x, test_y: test features and labels.
        model_type: name of a ``sklearn.linear_model`` estimator class, or
            ``'LinearSVC'`` / ``'SVC'``.
        out_dir: when given, the rounded test predictions are written to
            ``<out_dir>/preds/best_test_pred.txt`` (the directory is not
            created here). When ``None`` nothing is written.
        class_weight: forwarded to the classifiers that take ``class_weight``.

    Returns:
        Accuracy of the rounded predictions on the test split.

    Raises:
        NotImplementedError: if ``model_type`` is unknown, or if it names a
            solver function (e.g. ``'lars_path'``) instead of an estimator.
    """
    from sklearn import linear_model, svm

    # Estimators constructed with default arguments only.
    plain_estimators = frozenset([
        'ARDRegression', 'BayesianRidge', 'ElasticNet', 'ElasticNetCV',
        'HuberRegressor', 'Lars', 'LarsCV', 'Lasso', 'LassoCV', 'LassoLars',
        'LassoLarsCV', 'LassoLarsIC', 'LinearRegression', 'MultiTaskLasso',
        'MultiTaskElasticNet', 'MultiTaskLassoCV', 'MultiTaskElasticNetCV',
        'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV',
        'PassiveAggressiveRegressor', 'RandomizedLasso',
        'RandomizedLogisticRegression', 'RANSACRegressor', 'Ridge', 'RidgeCV',
        'SGDRegressor', 'TheilSenRegressor',
    ])
    # Estimators that additionally receive ``class_weight``.
    weighted_estimators = frozenset([
        'LogisticRegression', 'LogisticRegressionCV',
        'PassiveAggressiveClassifier', 'Perceptron', 'RidgeClassifier',
        'RidgeClassifierCV', 'SGDClassifier',
    ])
    # Plain solver functions: they need the data as call arguments and return
    # arrays, so they can never be used through fit()/predict().
    solver_functions = frozenset([
        'lars_path', 'lasso_path', 'lasso_stability_path',
        'logistic_regression_path', 'orthogonal_mp', 'orthogonal_mp_gram',
    ])

    startTrainTime = time()
    logger.info("Start training...")
    if model_type in plain_estimators:
        estimator = getattr(linear_model, model_type)()
    elif model_type in weighted_estimators:
        estimator = getattr(linear_model, model_type)(class_weight=class_weight)
    elif model_type == 'LinearSVC':
        estimator = svm.LinearSVC(class_weight=class_weight)
    elif model_type == 'SVC':
        estimator = svm.SVC(class_weight=class_weight, degree=3)
    elif model_type in solver_functions:
        raise NotImplementedError(
            "'%s' is a solver function, not an estimator with fit/predict" % model_type)
    else:
        raise NotImplementedError('Model not implemented')
    model = estimator.fit(train_x, train_y)

    logger.info("Finished training.")
    endTrainTime = time()
    trainTime = endTrainTime - startTrainTime
    logger.info("Training time : %d seconds", trainTime)

    logger.info("Start predicting train set...")
    train_pred_y = model.predict(train_x)
    logger.info("Finished predicting train set.")
    logger.info("Start predicting test set...")
    test_pred_y = model.predict(test_x)
    logger.info("Finished predicting test set.")
    endTestTime = time()
    # Covers prediction on both the train and the test split.
    testTime = endTestTime - endTrainTime
    logger.info("Testing time : %d seconds", testTime)

    # Regressors emit real values; round so they can be scored as class labels.
    train_pred_y = np.round(train_pred_y)
    test_pred_y = np.round(test_pred_y)

    # Saving is optional: with the default out_dir=None there is no target path.
    if out_dir is not None:
        np.savetxt(out_dir + '/preds/best_test_pred' + '.txt', test_pred_y, fmt='%i')

    test_accuracy = accuracy_score(test_y, test_pred_y)
    logger.info('[TRAIN] Acc: %.3f' % (accuracy_score(train_y, train_pred_y)))
    logger.info('[TEST]  Acc: %.3f' % test_accuracy)

    return test_accuracy
Exemplo n.º 38
def compute_bench(samples_range, features_range):
    """Time ``lars_path`` against ``orthogonal_mp`` over a grid of problem sizes.

    For every (n_samples, n_features) pair a sparse coded signal is generated
    and both solvers are timed with and without a precomputed Gram matrix.

    Args:
        samples_range: sequence of sample counts to benchmark.
        features_range: sequence of feature counts to benchmark.

    Returns:
        dict mapping a plot title to an array of shape
        ``(len(features_range), len(samples_range))`` holding the ratio
        time(LARS) / time(OMP), one entry with and one without Gram.
    """
    it = 0

    results = dict()
    lars = np.empty((len(features_range), len(samples_range)))
    lars_gram = lars.copy()
    omp = lars.copy()
    omp_gram = lars.copy()

    max_it = len(samples_range) * len(features_range)
    for i_s, n_samples in enumerate(samples_range):
        for i_f, n_features in enumerate(features_range):
            it += 1
            # Integer division: this value is used as a coefficient count and
            # as an iteration limit, both of which must be integers.
            n_informative = n_features // 10
            print('Iteration %03d of %03d' % (it, max_it))
            dataset_kwargs = {
                'n_samples': 1,
                'n_components': n_features,
                'n_features': n_samples,
                'n_nonzero_coefs': n_informative,
                'random_state': 0
            }
            print("n_samples: %d" % n_samples)
            print("n_features: %d" % n_features)
            y, X, _ = make_sparse_coded_signal(**dataset_kwargs)
            X = np.asfortranarray(X)

            print("benchmarking lars_path (with Gram):", end='')
            tstart = time()
            G = np.dot(X.T, X)  # precomputed Gram matrix
            Xy = np.dot(X.T, y)
            lars_path(X, y, Xy=Xy, Gram=G, max_iter=n_informative)
            delta = time() - tstart
            print("%0.3fs" % delta)
            lars_gram[i_f, i_s] = delta

            print("benchmarking lars_path (without Gram):", end='')
            tstart = time()
            lars_path(X, y, Gram=None, max_iter=n_informative)
            delta = time() - tstart
            print("%0.3fs" % delta)
            lars[i_f, i_s] = delta

            # NOTE(review): the second argument line of both orthogonal_mp
            # calls was missing from this copy; n_nonzero_coefs=n_informative
            # mirrors the limit given to lars_path above -- confirm.
            print("benchmarking orthogonal_mp (with Gram):", end='')
            tstart = time()
            orthogonal_mp(X, y, precompute=True,
                          n_nonzero_coefs=n_informative)
            delta = time() - tstart
            print("%0.3fs" % delta)
            omp_gram[i_f, i_s] = delta

            print("benchmarking orthogonal_mp (without Gram):", end='')
            tstart = time()
            orthogonal_mp(X, y, precompute=False,
                          n_nonzero_coefs=n_informative)
            delta = time() - tstart
            print("%0.3fs" % delta)
            omp[i_f, i_s] = delta

    results['time(LARS) / time(OMP)\n (w/ Gram)'] = (lars_gram / omp_gram)
    results['time(LARS) / time(OMP)\n (w/o Gram)'] = (lars / omp)
    return results
Exemplo n.º 39
#meresultc = memodelc.fit()

# Utilization table; used here only to look up each provider's type by NPI.
db = pd.read_csv("2015_utilization_reduced.csv.gz")

# Merge provider type (first recorded type per NPI) into the analysis frame.
du = db.groupby("npi")["provider_type"].agg("first")
du = pd.DataFrame(du).reset_index()
dr = pd.merge(dr, du, left_on="npi", right_on="npi")

# Use lars to consider provider effects.
# return_type="dataframe" is required because the column names of the design
# matrix are read from x.columns below.
# NOTE(review): the tail of this call was missing from this copy -- confirm
# that no further arguments were passed.
y, x = patsy.dmatrices("log_op_z ~ 0 + log_nonop_z + C(provider_type)", data=dr,
                       return_type="dataframe")
xa = np.asarray(x)
ya = np.asarray(y)[:, 0]
xnames = x.columns.tolist()
alphas, active, coefs = linear_model.lars_path(xa, ya, method='lars', verbose=True)

# Display the correlation between the response and the fit at each of the
# first few steps of the lars path. The path may have fewer than 20 steps,
# so the upper bound is clamped to the number of computed columns.
for k in range(1, min(20, coefs.shape[1])):
    f = np.dot(xa, coefs[:, k])
    print(np.corrcoef(ya, f)[0, 1])

# Fixed effects model for provider type
# NOTE(review): the tail of this call was missing from this copy; data=dr
# matches the other formula-based models in this script -- confirm.
pfemodelz = sm.OLS.from_formula("log_op_z ~ log_nonop_z + C(provider_type)",
                                data=dr)
pferesultz = pfemodelz.fit()

# Basic mixed model (random intercepts by provider)
pmemodelz = sm.MixedLM.from_formula("log_op_z ~ log_nonop_z", groups="provider_type", data=dr)
Exemplo n.º 40
def compute_bench(samples_range, features_range):
    """Time ``lars_path`` and ``lasso_path`` over a grid of problem sizes.

    For every (n_samples, n_features) pair a regression problem is generated
    and both path solvers are timed with and without a precomputed Gram matrix.

    Args:
        samples_range: iterable of sample counts to benchmark.
        features_range: iterable of feature counts to benchmark.

    Returns:
        defaultdict mapping a solver label to the list of measured durations
        in seconds, in iteration order (samples outer, features inner).
    """
    it = 0

    results = defaultdict(list)

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('Iteration %03d of %03d' % (it, max_it))
            dataset_kwargs = {
                'n_samples': n_samples,
                'n_features': n_features,
                # Integer division: this is a feature count, so it must be an int.
                'n_informative': n_features // 10,
                'effective_rank': min(n_samples, n_features) / 10,
                #'effective_rank': None,
                'bias': 0.0,
            }
            print("n_samples: %d" % n_samples)
            print("n_features: %d" % n_features)
            X, y = make_regression(**dataset_kwargs)

            print("benchmarking lars_path (with Gram):", end='')
            tstart = time()
            G = np.dot(X.T, X)  # precomputed Gram matrix
            Xy = np.dot(X.T, y)
            lars_path(X, y, Xy=Xy, Gram=G, method='lasso')
            delta = time() - tstart
            print("%0.3fs" % delta)
            results['lars_path (with Gram)'].append(delta)

            print("benchmarking lars_path (without Gram):", end='')
            tstart = time()
            lars_path(X, y, method='lasso')
            delta = time() - tstart
            print("%0.3fs" % delta)
            results['lars_path (without Gram)'].append(delta)

            print("benchmarking lasso_path (with Gram):", end='')
            tstart = time()
            lasso_path(X, y, precompute=True)
            delta = time() - tstart
            print("%0.3fs" % delta)
            results['lasso_path (with Gram)'].append(delta)

            print("benchmarking lasso_path (without Gram):", end='')
            tstart = time()
            lasso_path(X, y, precompute=False)
            delta = time() - tstart
            print("%0.3fs" % delta)
            results['lasso_path (without Gram)'].append(delta)

    return results
Exemplo n.º 41
def graphical_lasso(TS, alpha=0.01, max_iter=100, convg_threshold=0.001):
    """Estimate a sparse inverse covariance matrix with the graphical lasso.

    The covariance estimate is refined one variable at a time: for each
    variable a lasso-type regression on the remaining variables is solved with
    ``lars_path`` and the corresponding row/column of the covariance and
    precision matrices is updated. Sweeps over all variables are repeated until
    the duality gap is small enough or ``max_iter`` sweeps have been done.

    Parameters
    ----------
    TS (np.ndarray): Array consisting of $L$ observations from $N$ sensors.
    It is transposed on entry, and the second axis of the transposed array is
    used as the number of variables.

    alpha (float, default=0.01): Coefficient of penalization, higher values
    means more sparseness. Values below 1e-15 skip the penalized fit.

    max_iter (int, default=100): Maximum number of sweeps over the variables.

    convg_threshold (float, default=0.001): Stop the algorithm when the
    duality gap is below a certain threshold.

    Returns
    -------
    covariance_ (np.ndarray): Estimated covariance matrix.

    precision_ (np.ndarray): Estimated (sparse) inverse covariance matrix.

    """
    TS = TS.T

    if alpha < 1e-15:
        # No penalty: return the plain estimate and its pseudo-inverse.
        # The precision is the inverse of the covariance, not of the raw data.
        covariance_ = cov_estimator(TS)
        precision_ = np.linalg.pinv(covariance_)
        return covariance_, precision_
    n_features = TS.shape[1]

    mle_estimate_ = cov_estimator(TS)
    covariance_ = mle_estimate_.copy()
    precision_ = np.linalg.pinv(mle_estimate_)
    indices = np.arange(n_features)
    for _sweep in range(max_iter):
        for n in range(n_features):
            others = indices != n  # every variable except the current one
            sub_estimate = covariance_[others].T[others]
            row = mle_estimate_[n, others]
            # solve the lasso problem
            # NOTE(review): the argument list of this call was partly missing
            # from this copy; row / Xy / Gram / copy_Gram / method were
            # reconstructed from the surrounding variables -- confirm.
            _, _, coefs_ = lars_path(sub_estimate,
                                     row,
                                     Xy=row,
                                     Gram=sub_estimate,
                                     alpha_min=alpha / (n_features - 1.),
                                     copy_Gram=True,
                                     method='lars')
            coefs_ = coefs_[:, -1]  # keep only the last point of the path
            # update the precision matrix.
            precision_[n, n] = 1. / (covariance_[n, n] -
                                     np.dot(covariance_[others, n], coefs_))
            precision_[others, n] = -precision_[n, n] * coefs_
            precision_[n, others] = -precision_[n, n] * coefs_
            temp_coefs = np.dot(sub_estimate, coefs_)
            covariance_[n, others] = temp_coefs
            covariance_[others, n] = temp_coefs

        if np.abs(_dual_gap(mle_estimate_, precision_,
                            alpha)) < convg_threshold:
            break
    else:
        # this triggers if no break command occurs
        # NOTE(review): the call wrapping this message was missing from this
        # copy; print() is used so that no extra import is needed -- confirm.
        print(
            "The algorithm did not converge. Try increasing the max number of iterations."
        )

    return covariance_, precision_
Exemplo n.º 42
# In[98]:

# Target is column 2 of ``dataset``; features are its first 13 columns.
# NOTE(review): the feature slice [:, :13] includes column 2, i.e. the target
# itself -- confirm this is intended.
Y = dataset[:, 2]
X = dataset[:, :13]
# Replace missing feature values by 0 before fitting.
X[np.isnan(X)] = 0

# In[99]:

print("Regularization path using lars_path")

# In[100]:

alphas1, active1, coefs1 = lars_path(X, Y, method='lasso', verbose=True)

# In[101]:

# This banner announces the lasso_path computation that follows.
print("Regularization path using lasso_path")

# In[102]:

# Ratio alpha_min / alpha_max that sets the length of the lasso path.
eps = 5e-6

# In[103]:

# eps is passed by keyword: recent scikit-learn releases make it keyword-only.
alphas2, coefs2, _ = lasso_path(X, Y, eps=eps)

# In[104]:
Exemplo n.º 43
# Linear Regression, Ridge or Poly
# Split the data into 3 portions: 60% for training, 20% for validation (used to select the model), 20% for final testing evaluation.
# (0.25 of the remaining 80% gives the 20% validation share.)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.25, random_state=42)

# Features selection with LASSO
# Scale the variables. The scaler has to be fitted before it can transform;
# it is fitted on the training portion only.
std = StandardScaler()
X_tr = std.fit_transform(X_train.values)

# Finding the lars paths
print("Computing regularization path using the LARS ...")
alphas, _, coefs = lars_path(X_tr, y_train.values, method='lasso')

# plotting the LARS path
# x axis: L1 norm of the coefficients at each step, relative to the last step
xx = np.sum(np.abs(coefs.T), axis=1)
xx /= xx[-1]

plt.plot(xx, coefs.T)
ymin, ymax = plt.ylim()
# dashed vertical line at every step of the path
plt.vlines(xx, ymin, ymax, linestyle='dashed')
plt.xlabel('|coef| / max|coef|')
plt.title('LASSO Path')
Exemplo n.º 44
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import datasets

# Diabetes regression data: feature matrix X and target vector y.
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

print("Computing regularization path using the LARS ...")
# NOTE(review): the tail of this call was missing from this copy; the
# arguments after X were reconstructed (target y, lasso variant to match the
# plot title below) -- confirm.
alphas, active, coefs = linear_model.lars_path(X,
                                               y,
                                               method='lasso',
                                               verbose=True)
# Indices of the active variables at the end of the path, shifted to 1-based.
print(np.add(active, 1))
# x axis: L1 norm of the coefficients at each step, relative to the last step
xx = np.sum(np.abs(coefs.T), axis=1)
xx /= xx[-1]

plt.plot(xx, coefs.T)
ymin, ymax = plt.ylim()
# dashed vertical line at every step of the path
plt.vlines(xx, ymin, ymax, linestyle='dashed')
plt.xlabel('|coef| / max|coef|')
plt.title('LASSO Path')
Exemplo n.º 45
# Report the coefficients and intercept of ``clf``, which is defined and
# fitted before this excerpt.
print("Weights", clf.coef_)
#plt.plot(test_range,[f(x) for x in test_range], label="True plot")
#plt.plot(sample_range,data[1],linestyle=" ", marker="o")
#plt.plot(test_range,[y(x) for x in test_range], label="True plot")

print("intercept:", clf.intercept_)

#Calculation of design matrix
# Row i holds the powers sample_range[i]**0 .. sample_range[i]**m.
# NOTE(review): this statement is cut off after the trailing comma -- the
# remaining argument(s) of np.array(...) and the closing parenthesis are
# missing from this copy and must be restored from the original script.
dm2 = np.array([[sample_range[i]**j for j in range(m + 1)] for i in range(n)],

eps = 5e-3
# NOTE(review): this call is truncated as well -- the target vector and any
# keyword arguments after ``dm2`` are missing; restore them before running.
alphas_lasso, _, coefs = linear_model.lars_path(dm2,
# x axis: L1 norm of the coefficients at each step, relative to the last step
xx = np.sum(np.abs(coefs.T), axis=1)
xx /= xx[-1]
neg_log_alphas_lasso = -np.log10(alphas_lasso)
# One curve per weight w0 .. w(m-1).
# NOTE(review): the design matrix has m + 1 columns, so the last weight is
# not plotted here -- confirm whether that is intended.
for i in range(m):
    plt.plot(xx, coefs[i], label="w" + str(i))

legend = plt.legend(loc='upper center', shadow=True, fontsize='x-large')
#plt.plot(xx, coefs.T)
ymin, ymax = plt.ylim()
# dashed vertical line at every step of the path
plt.vlines(xx, ymin, ymax, linestyle='dashed')
plt.xlabel('|coef| / |max_likelihood(coef)|')
Exemplo n.º 46
# Author: Fabian Pedregosa <*****@*****.**>
#         Alexandre Gramfort <*****@*****.**>
# License: BSD 3 clause
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import datasets

# Diabetes regression data: feature matrix X and target vector y.
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

print("Computing regularization path using the LARS ...")
# Lasso variant of the LARS path; only the alphas and coefficients are kept.
alphas, _, coefs = linear_model.lars_path(
    X,
    y,
    method='lasso',
    verbose=True,
)

# x axis: L1 norm of the coefficients at each step, relative to the last step
xx = np.abs(coefs.T).sum(axis=1)
xx /= xx[-1]

# One curve per feature, plus a dashed vertical line at every step of the path.
plt.plot(xx, coefs.T)
ymin, ymax = plt.ylim()
plt.vlines(xx, ymin, ymax, linestyle='dashed')
plt.xlabel('|coef| / max|coef| (Alpha)')
plt.title('LASSO Path')

# ### Advantages of LASSO: