# Shared imports assumed by all of the examples below. The helpers come from
# the selective-inference "learning" package; exact module paths may differ
# across versions of that package.
import functools

import numpy as np
import pandas as pd
from scipy.stats import norm as ndist
from sklearn.linear_model import lasso_path

import regreg.api as rr

from selection.tests.instance import gaussian_instance
from selection.learning.core import (normal_sampler, split_sampler, keras_fit,
                                     logit_fit, random_forest_fit)
from selection.learning.utils import (full_model_inference,
                                      naive_full_model_inference,
                                      liu_inference, BHfilter)
from selection.learning.Rutils import lasso_glmnet

Example #1
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTX, XTXi, lam, sampler):

        p = XTX.shape[0]
        success = np.zeros(p)

        loss = rr.quadratic_loss((p, ), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)

        scale = 0.5
        noisy_S = sampler(scale=scale)
        loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)
        problem = rr.simple_problem(loss, pen)
        soln = problem.solve(max_its=100, tol=1.e-10)
        success += soln != 0
        return set(np.nonzero(success)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={
                                    'epochs': 20,
                                    'sizes': [100] * 5,
                                    'dropout': 0.,
                                    'activation': 'relu'
                                })
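
Each simulate call returns a pandas DataFrame of selective inference results, or None when nothing is selected (the later examples check exactly this and merge on a 'variable' column). A minimal driver sketch, hypothetical rather than from the source, that pools many replications:

# Hypothetical driver, not part of the source: pool replications of the
# example above. full_model_inference is assumed to return a DataFrame
# or None when no variables are selected.
import pandas as pd

dfs = []
for _ in range(50):
    df = simulate(B=3000)
    if df is not None:
        dfs.append(df)

if dfs:
    results = pd.concat(dfs)
    print(results.describe())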
Example #2

def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTX, XTXi, dispersion, lam, sampler):

        # select by two-sided Z-tests on the least-squares solution,
        # thresholded with Benjamini-Hochberg (lam is unused in this variant)
        scale = 0.
        noisy_S = sampler(scale=scale)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        pval = ndist.cdf(solnZ)
        pval = 2 * np.minimum(pval, 1 - pval)
        return set(BHfilter(pval, q=0.2))

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi,
                                            dispersion, lam)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={
                                    'epochs': 5,
                                    'sizes': [200] * 10,
                                    'dropout': 0.,
                                    'activation': 'relu'
                                })
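
The BHfilter call above is assumed to implement the standard Benjamini-Hochberg step-up rule on the two-sided p-values; a self-contained sketch of that rule (not the package's implementation):

import numpy as np

def bh_filter_sketch(pvals, q=0.2):
    # Benjamini-Hochberg step-up: reject the k smallest p-values, where k
    # is the largest index with p_(k) <= q * k / m.
    pvals = np.asarray(pvals)
    m = len(pvals)
    order = np.argsort(pvals)
    passed = pvals[order] <= q * np.arange(1, m + 1) / m
    if not passed.any():
        return np.array([], dtype=int)
    k = np.nonzero(passed)[0].max()
    return order[:k + 1]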
Example #3
def simulate(n=200, p=50, s=5, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, dispersion, lam, sampler):

        # select variables whose randomized Z-score clears a fixed
        # threshold (lam is unused in this variant)
        scale = 0.5
        noisy_S = sampler(scale=scale)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        return set(np.nonzero(np.fabs(solnZ) > 2.1)[0])

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi,
                                            dispersion, lam)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(5, 7),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={
                                    'epochs': 30,
                                    'sizes': [100, 100],
                                    'activation': 'relu'
                                })
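
Example #3 passes success_params=(5, 7) where earlier examples use (1, 1). This is assumed to mean "keep a variable only if the randomized selector picks it in at least 5 of 7 reruns"; a sketch of that voting wrapper (names hypothetical):

from collections import Counter

def repeated_selection_sketch(selection_algorithm, sampler,
                              min_success=5, ntries=7):
    # Assumed meaning of success_params=(min_success, ntries): rerun the
    # randomized selector and keep variables chosen often enough.
    counts = Counter()
    for _ in range(ntries):
        counts.update(selection_algorithm(sampler))
    return {j for j, c in counts.items() if c >= min_success}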
Example #4

def simulate(n=1000, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=5000):

    # description of statistical problem

    np.random.seed(seed)
    X, y, truth = gaussian_instance(n=n,
                                    p=p, 
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5, 
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False,
                                    center=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        n, p = X.shape

        rho = 0.8
        S = sampler(scale=0.) # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X
        Xnew = rho * X + np.sqrt(1 - rho**2) * np.random.standard_normal(X.shape)

        X_full = np.hstack([X, Xnew])
        beta_full = np.linalg.pinv(X_full).dot(ynew)
        winners = np.fabs(beta_full)[:p] > np.fabs(beta_full)[p:]
        return set(np.nonzero(winners)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)


    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(8, 10),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs':20, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'})
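
The reconstruction step ynew = X.dot(XTXi).dot(S) + resid used above deserves a sanity check: with scale=0. the sampler returns S = X.T.dot(y) exactly, so ynew recovers y whenever n > p and X has full column rank, since X(X'X)^{-1}X'y is the projection of y onto the column space of X. A self-contained check:

import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 5))   # n > p, full column rank almost surely
y = rng.standard_normal(50)
XTXi = np.linalg.inv(X.T.dot(X))
S = X.T.dot(y)                     # what sampler(scale=0.) returns
resid = y - X.dot(XTXi.dot(S))     # residual from projecting y onto col(X)
assert np.allclose(X.dot(XTXi).dot(S) + resid, y)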
Example #5

def simulate(n=200, p=100, s=10, signal=(1.5, 2), sigma=2, alpha=0.1, B=3000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n>p and non-degen X
        G = lasso_glmnet(X, ynew, *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={
                                    'epochs': 10,
                                    'sizes': [100] * 5,
                                    'dropout': 0.,
                                    'activation': 'relu'
                                })
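
lasso_glmnet (used above through G.select()) is assumed to wrap R's cross-validated glmnet via rpy2, with the first element of select() being the set of selected indices. Where R is unavailable, a rough pure-Python stand-in for the selection step can be sketched with scikit-learn (a substitute, not the package's method):

import numpy as np
from sklearn.linear_model import LassoCV

def lasso_cv_select_sketch(X, y, seed=0):
    # Stand-in for lasso_glmnet(...).select(): cross-validated lasso,
    # returning the indices of nonzero coefficients at the CV-chosen
    # penalty. Unlike glmnet, no 1-SE rule is applied here.
    G = LassoCV(cv=5, random_state=seed).fit(X, y)
    return set(np.nonzero(G.coef_ != 0)[0])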
Example #6
def simulate(s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=5000, seed=0):

    # description of statistical problem

    # X_full (a fixed design matrix) and boot_design (bool) are assumed to
    # be defined at module level, e.g. a real-data design loaded beforehand
    n, p = X_full.shape

    if boot_design:
        idx = np.random.choice(np.arange(n), n, replace=True)
        X = X_full[idx] # bootstrap X to make it really an IID sample, i.e. don't condition on X throughout
        X += 0.1 * np.std(X) * np.random.standard_normal(X.shape) # to make non-degenerate
    else:
        X = X_full.copy()

    X = X - np.mean(X, 0)[None, :]
    X = X / np.std(X, 0)[None, :]

    n, p = X.shape
    truth = np.zeros(p)
    truth[:s] = np.linspace(signal[0], signal[1], s)
    np.random.shuffle(truth)
    truth /= np.sqrt(n)
    truth *= sigma

    y = X.dot(truth) + sigma * np.random.standard_normal(n)

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n-p)
                         
    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, lam, sampler):

        p = XTX.shape[0]
        success = np.zeros(p)

        loss = rr.quadratic_loss((p,), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)

        scale = 0.
        noisy_S = sampler(scale=scale)
        loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)
        problem = rr.simple_problem(loss, pen)
        soln = problem.solve(max_its=100, tol=1.e-10)
        success += soln != 0
        return set(np.nonzero(success)[0])

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm

    df = full_model_inference(X,
                              y,
                              truth,
                              selection_algorithm,
                              splitting_sampler,
                              success_params=(1, 1),
                              B=B,
                              fit_probability=keras_fit,
                              fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'})

    if False:  # disabled: change to `df is not None` to also run the liu_inference comparison
        liu_df = liu_inference(X,
                               y,
                               lam,
                               dispersion,
                               truth,
                               alpha=alpha)

        return pd.merge(df, liu_df, on='variable')
    else:
        return df
Example #7
def simulate(n=1000, p=100, s=20, signal=(2, 4), sigma=2, alpha=0.1, B=2000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.1,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=True)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, sampler):

        min_success = 6
        ntries = 10

        def _alpha_grid(X, y, center, XTX):
            n, p = X.shape
            alphas, coefs, _ = lasso_path(X, y, Xy=center, precompute=XTX)
            nselected = np.count_nonzero(coefs, axis=0)
            return alphas[nselected < np.sqrt(0.8 * p)]

        alpha_grid = _alpha_grid(X, y, sampler(scale=0.), XTX)
        success = np.zeros((p, alpha_grid.shape[0]))

        for _ in range(ntries):
            scale = 1.  # corresponds to sub-samples of 50%
            noisy_S = sampler(scale=scale)
            # lasso_path works from (Gram, Xy), so passing a noisy X'y
            # randomizes the selection without touching X or y
            _, coefs, _ = lasso_path(X, y, Xy=noisy_S, precompute=XTX, alphas=alpha_grid)
            success += np.abs(np.sign(coefs))

        selected = np.any(success > min_success, axis=1)
        return set(np.nonzero(selected)[0])

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n-p)

    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi)

    # run selection algorithm


    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                splitting_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'})
Example #8
def simulate(s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000, seed=0):

    # description of statistical problem

    n, p = X_full.shape  # X_full and boot_design: module-level globals (see Example #6)

    if boot_design:
        idx = np.random.choice(np.arange(n), n, replace=True)
        X = X_full[
            idx]  # bootstrap X to make it really an IID sample, i.e. don't condition on X throughout
        X += 0.1 * np.std(X) * np.random.standard_normal(
            X.shape)  # to make non-degenerate
    else:
        X = X_full.copy()

    X = X - np.mean(X, 0)[None, :]
    X = X / np.std(X, 0)[None, :]

    n, p = X.shape
    truth = np.zeros(p)
    truth[:s] = np.linspace(signal[0], signal[1], s)
    np.random.shuffle(truth)
    truth /= np.sqrt(n)
    truth *= sigma

    y = X.dot(truth) + sigma * np.random.standard_normal(n)

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    print(dispersion, sigma**2)  # sanity check: estimated vs. true noise variance
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        S = sampler(scale=0.5)  # scale=0.5: a randomized draw, not the deterministic scale=0 case
        ynew = X.dot(XTXi).dot(S) + resid  # ok for n > p and non-degenerate X
        G = lasso_glmnet(X, ynew, *[None] * 4)
        select = G.select(seed=seed)
        return set(list(select[0]))

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    df = full_model_inference(X,
                              y,
                              truth,
                              selection_algorithm,
                              splitting_sampler,
                              success_params=(6, 10),
                              B=B,
                              fit_probability=keras_fit,
                              fit_args={
                                  'epochs': 10,
                                  'sizes': [100] * 5,
                                  'dropout': 0.,
                                  'activation': 'relu'
                              })

    return df
Example #9

def simulate(s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000, seed=0):

    # description of statistical problem

    n, p = X_full.shape  # X_full and boot_design: module-level globals (see Example #6)

    if boot_design:
        idx = np.random.choice(np.arange(n), n, replace=True)
        X = X_full[
            idx]  # bootstrap X to make it really an IID sample, i.e. don't condition on X throughout
        X += 0.1 * np.std(X) * np.random.standard_normal(
            X.shape)  # to make non-degenerate
    else:
        X = X_full.copy()

    X = X - np.mean(X, 0)[None, :]
    X = X / np.std(X, 0)[None, :]

    n, p = X.shape
    truth = np.zeros(p)
    truth[:s] = np.linspace(signal[0], signal[1], s)
    np.random.shuffle(truth)
    truth /= np.sqrt(n)
    truth *= sigma

    y = X.dot(truth) + sigma * np.random.standard_normal(n)

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    print(dispersion, sigma**2)  # sanity check: estimated vs. true noise variance
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, sampler):

        min_success = 6
        ntries = 10

        def _alpha_grid(X, y, center, XTX):
            n, p = X.shape
            alphas, coefs, _ = lasso_path(X.copy(),
                                          y.copy(),
                                          Xy=center.copy(),
                                          precompute=XTX.copy())
            nselected = np.count_nonzero(coefs, axis=0)
            alphas = alphas[nselected < 20]
            return alphas

        alpha_grid = _alpha_grid(X, y, sampler.center, XTX)
        success = np.zeros((p, alpha_grid.shape[0]))

        for _ in range(ntries):
            scale = 1.  # corresponds to sub-samples of 50%
            noisy_S = sampler(scale=scale)
            _, coefs, _ = lasso_path(X,
                                     y,
                                     Xy=noisy_S,
                                     precompute=XTX,
                                     alphas=alpha_grid)
            success += np.abs(np.sign(coefs))

        selected = np.any(success > min_success, axis=1)
        return set(np.nonzero(selected)[0])

    # pass the Gram matrix XTX, not X: meta_algorithm uses it as the
    # precomputed Gram in lasso_path (the original passed X by mistake)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi)

    # run selection algorithm

    df = full_model_inference(X,
                              y,
                              truth,
                              selection_algorithm,
                              splitting_sampler,
                              success_params=(6, 10),
                              B=B,
                              fit_probability=keras_fit,
                              fit_args={
                                  'epochs': 10,
                                  'sizes': [100] * 5,
                                  'dropout': 0.,
                                  'activation': 'relu'
                              })

    return df
Example #10
def simulate(n=400,
             p=100,
             s=10,
             signal=(0.5, 1),
             sigma=2,
             alpha=0.1,
             seed=0,
             B=2000):

    # description of statistical problem

    np.random.seed(seed)
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False,
                                    center=False)[:3]

    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(X, XTXi, resid, sampler):

        n, p = X.shape
        idx = np.random.choice(np.arange(n), 200, replace=False)

        S = sampler(scale=0.)  # deterministic with scale=0
        ynew = X.dot(XTXi).dot(S) + resid  # will be ok for n>p and non-degen X

        G = lasso_glmnet(X[idx], ynew[idx], *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm

    df = full_model_inference(X,
                              y,
                              truth,
                              selection_algorithm,
                              smooth_sampler,
                              success_params=(1, 1),
                              B=B,
                              fit_probability=keras_fit,
                              fit_args={
                                  'epochs': 20,
                                  'sizes': [100] * 5,
                                  'dropout': 0.,
                                  'activation': 'relu'
                              })

    if df is not None:

        observed_set = list(df['variable'])
        true_target = truth[observed_set]

        np.random.seed(seed)
        X2, _, _ = gaussian_instance(n=n,
                                     p=p,
                                     s=s,
                                     equicorrelated=False,
                                     rho=0.5,
                                     sigma=sigma,
                                     signal=signal,
                                     random_signs=True,
                                     center=False,
                                     scale=False)[:3]
        stage_1 = np.random.choice(np.arange(n), 200, replace=False)
        stage_2 = sorted(set(range(n)).difference(stage_1))
        X2 = X2[stage_2]
        y2 = X2.dot(truth) + sigma * np.random.standard_normal(X2.shape[0])

        XTXi_2 = np.linalg.inv(X2.T.dot(X2))
        resid2 = y2 - X2.dot(XTXi_2.dot(X2.T.dot(y2)))
        dispersion_2 = np.linalg.norm(resid2)**2 / (X2.shape[0] - X2.shape[1])

        naive_df = naive_full_model_inference(X2,
                                              y2,
                                              dispersion_2,
                                              observed_set,
                                              alpha=alpha)

        df = pd.merge(df, naive_df, on='variable')

    return df
Example #11
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000):

    # NOTE: the original listing was truncated; the setup below is an
    # assumed reconstruction mirroring the companion example that follows
    # (same instance, splitting sampler and lasso meta-algorithm)

    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * XTX
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, lam, sampler):

        p = XTX.shape[0]

        loss = rr.quadratic_loss((p,), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)

        noisy_S = sampler(scale=0.)
        loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)
        problem = rr.simple_problem(loss, pen)
        soln = problem.solve(max_its=100, tol=1.e-10)
        return set(np.nonzero(soln != 0)[0])

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm
    df = full_model_inference(X,
                              y,
                              truth,
                              selection_algorithm,
                              splitting_sampler,
                              success_params=(1, 1),
                              B=B,
                              fit_probability=logit_fit,
                              fit_args={'df':20})

    if df is not None:

        liu_df = liu_inference(X,
                               y,
                               lam,
                               dispersion,
                               truth,
                               alpha=alpha)

        return pd.merge(df, liu_df, on='variable')
Example #12

def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000):

    # description of statistical problem

    X, y, truth = gaussian_instance(n=n,
                                    p=p, 
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5, 
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n-p)
                         
    S = X.T.dot(y)
    covS = dispersion * XTX
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, lam, sampler):

        p = XTX.shape[0]
        success = np.zeros(p)

        loss = rr.quadratic_loss((p,), Q=XTX)
        pen = rr.l1norm(p, lagrange=lam)

        scale = 0.
        noisy_S = sampler(scale=scale)
        loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)
        problem = rr.simple_problem(loss, pen)
        soln = problem.solve(max_its=100, tol=1.e-10)
        success += soln != 0
        return set(np.nonzero(success)[0])

    lam = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam)

    # run selection algorithm

    df = full_model_inference(X,
                              y,
                              truth,
                              selection_algorithm,
                              splitting_sampler,
                              success_params=(1, 1),
                              B=B,
                              fit_probability=random_forest_fit,
                              fit_args={'ntrees':5000})

    liu_df = liu_inference(X,
                           y,
                           lam,
                           dispersion,
                           truth,
                           alpha=alpha)

    return pd.merge(df, liu_df, on='variable')