def test_f_regression_select():
    print("==> a lot of features")
    X, y = make_regression(n_samples=20000, n_features=200, n_informative=150,
                           shuffle=False, random_state=0)
    idx_sel = f_regression_select(X, y, verbose=2)
    print("==> few ones")
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, noise=0.5,
                           shuffle=False, random_state=0)
    idx_sel = f_regression_select(X, y, verbose=1)
    print("tests ok")
Example #2
def test_mbsgd_regressor_default(datatype, nrows,
                                 column_info):
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_mbsgd_regressor = cumlMBSGRegressor()
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test).to_array()

    skl_sgd_regressor = SGDRegressor()
    skl_sgd_regressor.fit(X_train, y_train)
    skl_pred = skl_sgd_regressor.predict(X_test)

    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)
    skl_r2 = r2_score(skl_pred, y_test, convert_dtype=datatype)
    try:
        assert abs(cu_r2 - skl_r2) <= 0.02
    except AssertionError:
        pytest.xfail("failed due to AssertionError error, "
                     "fix will be merged soon")
def test_mbsgd_regressor(datatype, lrate, input_type, penalty, nrows, ncols):

    train_rows = int(nrows * 0.8)
    X, y = make_regression(n_samples=nrows, n_features=ncols, random_state=0)
    X_test = np.array(X[train_rows:, :], dtype=datatype)
    X_train = np.array(X[:train_rows, :], dtype=datatype)
    y_train = np.array(y[:train_rows, ], dtype=datatype)
    y_test = np.array(y[train_rows:, ], dtype=datatype)

    cu_mbsgd_regressor = cumlMBSGRegressor(learning_rate=lrate,
                                           eta0=0.005,
                                           epochs=100,
                                           fit_intercept=True,
                                           batch_size=2,
                                           tol=0.0,
                                           penalty=penalty)

    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test).to_array()

    skl_sgd_regressor = SGDRegressor(learning_rate=lrate,
                                     eta0=0.005,
                                     max_iter=100,
                                     fit_intercept=True,
                                     tol=0.0,
                                     penalty=penalty,
                                     random_state=0)

    skl_sgd_regressor.fit(X_train, y_train)
    skl_pred = skl_sgd_regressor.predict(X_test)

    cu_r2 = r2_score(cu_pred, y_test)
    skl_r2 = r2_score(skl_pred, y_test)
    assert abs(cu_r2 - skl_r2) <= 0.02
Example #4
def generate_datasets(n_train, n_test, n_features, noise=0.1, verbose=False):
    if verbose:
        print('Generating dataset ...')

    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features,
                                 noise=noise,
                                 coef=True)
    random_seed = 0
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=n_train, test_size=n_test, random_state=random_seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = y_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print('Ok')
    return X_train, y_train, X_test, y_test
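A quick usage sketch for generate_datasets, assuming the imports used above (make_regression, train_test_split, shuffle, StandardScaler, gc) are in scope:

X_train, y_train, X_test, y_test = generate_datasets(
    n_train=1000, n_test=200, n_features=10, verbose=True)
print(X_train.shape, X_test.shape)  # (1000, 10) (200, 10)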
Example #5
def test_csr_sparse_center_data():
    # Test output format of sparse_center_data, when input is csr
    X, y = make_regression()
    X[X < 2.5] = 0.0
    csr = sparse.csr_matrix(X)
    csr_, y, _, _, _ = sparse_center_data(csr, y, True)
    assert_equal(csr_.getformat(), 'csr')
Example #6
def main(args):
    '''
    Main function.
    :param args: parsed arguments (argparse Namespace)
    :return: None
    '''
    dict_arg = vars(args)
    algo = dict_arg['algorithm']
    dark_mode = dict_arg['dark_mode']
    resolution = dict_arg['resolution']
    n_samples = dict_arg['n_samples']
    noise = dict_arg['noise']

    X, Y = make_regression(n_samples=n_samples, n_features=1, noise=noise)

    hash_function = {
        'decisiontree': DecisionTreeRegressor(),
        'adaboost': AdaBoostRegressor(),
        'randomforest': RandomForestRegressor(),
        'kneighbors': KNeighborsRegressor(),
        'extratrees': ExtraTreesRegressor(),
        'svr': SVR(kernel='linear'),
        'mlp': MLPRegressor()
    }

    reg = hash_function[algo]

    reg.fit(X, Y)

    X_reshaped = np.reshape(X, (len(X)))

    x_best, y_best = give_best_fit(X_reshaped, reg, reso=resolution)

    graph(X, Y, x_best, y_best, reg, dark_mode=dark_mode)
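main() consumes an argparse namespace; a hypothetical parser that would drive it, with flag names inferred from the dict_arg lookups above (choices and defaults are assumptions):

import argparse

def build_parser():
    parser = argparse.ArgumentParser(description='Fit a 1-D regressor and plot its best fit.')
    parser.add_argument('--algorithm', default='decisiontree',
                        choices=['decisiontree', 'adaboost', 'randomforest',
                                 'kneighbors', 'extratrees', 'svr', 'mlp'])
    parser.add_argument('--dark_mode', action='store_true')
    parser.add_argument('--resolution', type=int, default=100)
    parser.add_argument('--n_samples', type=int, default=100)
    parser.add_argument('--noise', type=float, default=10.0)
    return parser

# main(build_parser().parse_args())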
Example #7
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Generate a regression dataset with the given parameters."""
    if verbose:
        print("generating dataset...")

    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features, noise=noise, coef=True)

    random_seed = 13
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=n_train, random_state=random_seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = y_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
def test_select_percentile_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the percentile heuristic
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           shuffle=False,
                           random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))
    def single_fdr(alpha, n_informative, random_state):
        X, y = make_regression(n_samples=150,
                               n_features=20,
                               n_informative=n_informative,
                               shuffle=False,
                               random_state=random_state,
                               noise=10)

        with warnings.catch_warnings(record=True):
            # Warnings can be raised when no features are selected
            # (low alpha or very noisy data)
            univariate_filter = SelectFdr(f_regression, alpha=alpha)
            X_r = univariate_filter.fit(X, y).transform(X)
            X_r2 = GenericUnivariateSelect(f_regression,
                                           mode='fdr',
                                           param=alpha).fit(X, y).transform(X)

        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        num_false_positives = np.sum(support[n_informative:] == 1)
        num_true_positives = np.sum(support[:n_informative] == 1)

        if num_false_positives == 0:
            return 0.
        false_discovery_rate = (num_false_positives /
                                (num_true_positives + num_false_positives))
        return false_discovery_rate
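In sklearn's test suite, single_fdr is averaged over many random seeds to check that the empirical false discovery rate stays near alpha; a sketch of that outer check, assuming single_fdr as defined above (the tolerance is an assumption):

def check_fdr(alpha=0.1, n_informative=5, n_runs=50):
    # FDR control holds in expectation, so compare the mean rate to alpha.
    rates = [single_fdr(alpha, n_informative, random_state=rs)
             for rs in range(n_runs)]
    assert np.mean(rates) < alpha + 0.05
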
def test_mutual_info_regression():
    X, y = make_regression(n_samples=100,
                           n_features=10,
                           n_informative=2,
                           shuffle=False,
                           random_state=0,
                           noise=10)

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_regression, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(mutual_info_regression,
                                   mode='k_best',
                                   param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_regression, percentile=20)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_regression,
                                   mode='percentile',
                                   param=20).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
Example #11
def test_csr_preprocess_data():
    # Test output format of _preprocess_data, when input is csr
    X, y = make_regression()
    X[X < 2.5] = 0.0
    csr = sparse.csr_matrix(X)
    csr_, y, _, _, _ = _preprocess_data(csr, y, True)
    assert csr_.getformat() == 'csr'
Example #12
def test_mbsgd_regressor(datatype, lrate, input_type, penalty,
                         nrows, column_info):
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_mbsgd_regressor = cumlMBSGRegressor(learning_rate=lrate, eta0=0.005,
                                           epochs=100, fit_intercept=True,
                                           batch_size=2, tol=0.0,
                                           penalty=penalty)

    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test).to_array()

    skl_sgd_regressor = SGDRegressor(learning_rate=lrate, eta0=0.005,
                                     max_iter=100, fit_intercept=True,
                                     tol=0.0, penalty=penalty,
                                     random_state=0)

    skl_sgd_regressor.fit(X_train, y_train)
    skl_pred = skl_sgd_regressor.predict(X_test)

    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)
    skl_r2 = r2_score(skl_pred, y_test, convert_dtype=datatype)
    assert abs(cu_r2 - skl_r2) <= 0.02
def create_regression():
    x, y = make_regression(
        n_samples=100,
        n_features=1,
        n_informative=1,
        random_state=0,
        noise=35
    )

    # learning rate
    alpha = 1
    # convergence criteria
    ep = 1e-12
    # max iterations
    max_iter = 20

    theta0, theta1, cost_f = gradient_descent(alpha, x, y, ep, max_iter)

    slope, intercept, r_value, p_value, slope_std_error = stats.linregress(x[:, 0], y)
    print('intercept = %s slope = %s' % (intercept, slope))

    y_predict = theta0 + theta1 * x

    pylab.plot(x, y, 'o')
    pylab.plot(x, y_predict, '-')
    pylab.show()
    print "Done."
Example #14
def test_f_regression():
    # Test whether the F test yields meaningful results
    # on a simple simulated regression problem
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           shuffle=False,
                           random_state=0)

    F, pv = f_regression(X, y)
    assert_true((F > 0).all())
    assert_true((pv > 0).all())
    assert_true((pv < 1).all())
    assert_true((pv[:5] < 0.05).all())
    assert_true((pv[5:] > 1.e-4).all())

    # with centering, compare with sparse
    F, pv = f_regression(X, y, center=True)
    F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True)
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)

    # again without centering, compare with sparse
    F, pv = f_regression(X, y, center=False)
    F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)
Example #15
def test_invalid_percentile():
    X, y = make_regression(n_samples=10, n_features=20, n_informative=2, shuffle=False, random_state=0)

    assert_raises(ValueError, SelectPercentile(percentile=-1).fit, X, y)
    assert_raises(ValueError, SelectPercentile(percentile=101).fit, X, y)
    assert_raises(ValueError, GenericUnivariateSelect(mode="percentile", param=-1).fit, X, y)
    assert_raises(ValueError, GenericUnivariateSelect(mode="percentile", param=101).fit, X, y)
Example #16
def test_mutual_info_regression():
    X, y = make_regression(n_samples=100, n_features=10, n_informative=2,
                           shuffle=False, random_state=0, noise=10)

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_regression, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        mutual_info_regression, mode='k_best', param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_regression, percentile=20)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_regression, mode='percentile',
                                   param=20).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
Example #17
def prepare_data(mydata = True):
    '''
    dim(X) -> (10,2)
    each_row(X) -> training point
    each_column(X) -> x_0, x_1

    dim(Y) -> (10,1)
    each_row(Y) -> result

    dim(theta) ->(2,1)
    theta[0][0] -> x_0
    theta[1][0] -> x_1
    '''
    if mydata:
        num_trainingpoint = 3
        X =  np.array([range(num_trainingpoint)]).T
        theta = np.array([[1],[2]])
        x0 = np.ones(shape=(num_trainingpoint,1))
        m, n = np.shape(X)
        X = np.c_[ np.ones(m), X]
        Y =  X.dot(theta)
    else:
        X, Y = make_regression(n_samples=100, n_features=1, n_informative=1, 
                                random_state=0, noise=35) 
        m, n = np.shape(X)
        X = np.c_[ np.ones(m), X] # insert column

    theta = np.ones(shape=(2,1))

    return X, Y, theta
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Generate a regression dataset with the given parameters."""
    if verbose:
        print("generating dataset...")
    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features, noise=noise, coef=True)
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]
    idx = np.arange(n_train)
    np.random.seed(13)
    np.random.shuffle(idx)
    X_train = X_train[idx]
    y_train = y_train[idx]

    std = X_train.std(axis=0)
    mean = X_train.mean(axis=0)
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    std = y_train.std(axis=0)
    mean = y_train.mean(axis=0)
    y_train = (y_train - mean) / std
    y_test = (y_test - mean) / std

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
Example #19
def test_csr_sparse_center_data():
    # Test output format of sparse_center_data, when input is csr
    X, y = make_regression()
    X[X < 2.5] = 0.0
    csr = sparse.csr_matrix(X)
    csr_, y, _, _, _ = sparse_center_data(csr, y, True)
    assert_equal(csr_.getformat(), 'csr')
Example #20
    def single_fdr(alpha, n_informative, random_state):
        X, y = make_regression(
            n_samples=150,
            n_features=20,
            n_informative=n_informative,
            shuffle=False,
            random_state=random_state,
            noise=10,
        )

        with warnings.catch_warnings(record=True):
            # Warnings can be raised when no features are selected
            # (low alpha or very noisy data)
            univariate_filter = SelectFdr(f_regression, alpha=alpha)
            X_r = univariate_filter.fit(X, y).transform(X)
            X_r2 = GenericUnivariateSelect(f_regression, mode="fdr", param=alpha).fit(X, y).transform(X)

        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        num_false_positives = np.sum(support[n_informative:] == 1)
        num_true_positives = np.sum(support[:n_informative] == 1)

        if num_false_positives == 0:
            return 0.0
        false_discovery_rate = num_false_positives / (num_true_positives + num_false_positives)
        return false_discovery_rate
Example #21
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='percentile', param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Generate a regression dataset with the given parameters."""
    if verbose:
        print("generating dataset...")
    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features,
                                 noise=noise,
                                 coef=True)
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]
    idx = np.arange(n_train)
    np.random.seed(13)
    np.random.shuffle(idx)
    X_train = X_train[idx]
    y_train = y_train[idx]

    std = X_train.std(axis=0)
    mean = X_train.mean(axis=0)
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    std = y_train.std(axis=0)
    mean = y_train.mean(axis=0)
    y_train = (y_train - mean) / std
    y_test = (y_test - mean) / std

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
Example #23
def test_regression_squared_loss():
    X, y = make_regression(n_samples=100, n_features=10, n_informative=8, random_state=0)
    reg = SGDRegressor(loss="squared", penalty="l2", learning_rate="constant", eta0=1e-2, random_state=0)

    reg.fit(X, y)
    pred = reg.predict(X)
    assert_almost_equal(np.mean((pred - y) ** 2), 4.913, 3)
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Generate a regression dataset with the given parameters."""
    if verbose:
        print("generating dataset...")

    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features,
                                 noise=noise,
                                 coef=True)

    random_seed = 13
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=n_train, random_state=random_seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = y_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
Example #25
def test_f_regression_select():
    print "==> a lot of features"
    X, y = make_regression(n_samples=20000,
                           n_features=200,
                           n_informative=150,
                           shuffle=False,
                           random_state=0)
    idx_sel = f_regression_select(X, y, verbose=2)
    print "==> few ones"
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           noise=0.5,
                           shuffle=False,
                           random_state=0)
    idx_sel = f_regression_select(X, y, verbose=1)
    print "tests ok"
Example #26
def test_regression_squared_loss():
    X, y = make_regression(n_samples=100, n_features=10, n_informative=8,
                           random_state=0)
    reg = SGDRegressor(loss="squared", penalty="l2", learning_rate="constant",
                       eta0=1e-2, random_state=0)

    reg.fit(X, y)
    pred = reg.predict(X)
    assert_almost_equal(np.mean((pred - y) ** 2), 4.913, 3)
Example #27
def test_regression_squared_loss_multiple_output():
    X, y = make_regression(n_samples=100, n_features=10, n_informative=8, random_state=0)
    reg = SGDRegressor(loss="squared", penalty="l2", learning_rate="constant", eta0=1e-2, random_state=0, max_iter=10)
    Y = np.zeros((len(y), 2))
    Y[:, 0] = y
    Y[:, 1] = y
    reg.fit(X, Y)
    pred = reg.predict(X)
    assert_almost_equal(np.mean((pred - Y) ** 2), 4.541, 3)
    def test_get_feature_coefficients(self):
        """
        test select_features_by_linear_model
        """
        X, y, _ = make_regression(n_samples=10000, n_features=100,
                                  noise=0.1, coef=True)

        fs = FeatureSelector(pd.DataFrame(X), pd.DataFrame(y))
        actual = fs.get_feature_coefficients(norm_prior=0)
        print(actual)
Example #29
def make_dataset(request):
    nrows, ncols, n_info, datatype = request.param
    X, y = make_regression(n_samples=nrows, n_informative=n_info,
                           n_features=ncols, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=10)

    return nrows, datatype, X_train, X_test, y_train, y_test
    def test_get_feature_importances(self):
        """
        test get_feature_importances
        """
        X, y, coef = make_regression(n_samples=10000,
                                     n_features=100, noise=0.1, coef=True)

        fs = FeatureSelector(pd.DataFrame(X), pd.DataFrame(y))
        actual = fs.get_feature_importances(n_estimators=10)
        print(actual)
Example #31
def test_invalid_percentile():
    X, y = make_regression(n_samples=10, n_features=20,
                           n_informative=2, shuffle=False, random_state=0)

    assert_raises(ValueError, SelectPercentile(percentile=-1).fit, X, y)
    assert_raises(ValueError, SelectPercentile(percentile=101).fit, X, y)
    assert_raises(ValueError, GenericUnivariateSelect(mode='percentile',
                                                      param=-1).fit, X, y)
    assert_raises(ValueError, GenericUnivariateSelect(mode='percentile',
                                                      param=101).fit, X, y)
Example #32
def test_regression_squared_loss_multiple_output():
    X, y = make_regression(n_samples=100, n_features=10, n_informative=8,
                           random_state=0)
    reg = SGDRegressor(loss="squared", penalty="l2", learning_rate="constant",
                       eta0=1e-2, random_state=0, max_iter=10)
    Y = np.zeros((len(y), 2))
    Y[:, 0] = y
    Y[:, 1] = y
    reg.fit(X, Y)
    pred = reg.predict(X)
    assert_almost_equal(np.mean((pred - Y) ** 2), 4.541, 3)
def main():
    
    # load the dataset to the two variables
    X, y = make_regression(n_samples=100, n_features=1, n_informative=1, random_state=0, noise=35) 
    m = np.shape(X)[0]
    X = np.c_[ np.ones(m), X]

    # get the slope
    theta = grad_desc_vector(X, y, 0.001, 1500)

    print(theta)
Example #34
def make_regression_data(num_examples=100,
                         train_test_ratio=0.5,
                         num_features=2,
                         sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):

    # use sklearn's make_regression to generate the data for us
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # create a list of dictionaries as the features
    feature_names = [
        'f{:02d}'.format(n)
        for n in range(start_feature_num, start_feature_num + num_features)
    ]
    features = [dict(zip(feature_names, row)) for row in X]

    # convert the weights array into a dictionary for convenience
    weightdict = dict(zip(feature_names, weights))

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(
        n_features=feature_bins) if use_feature_hashing else None)
    train_fs = FeatureSet('regression_train',
                          train_ids,
                          labels=train_y,
                          features=train_features,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test',
                         test_ids,
                         labels=test_y,
                         features=test_features,
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
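A hypothetical usage of make_regression_data (the weights dictionary keys follow the f{:02d} naming above):

train_fs, test_fs, weights = make_regression_data(num_examples=200,
                                                  num_features=3)
print(sorted(weights))  # ['f01', 'f02', 'f03']
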
def test_f_regression():
    """
    Test whether the F test yields meaningful results
    on a simple simulated regression problem
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    F, pv = f_regression(X, Y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.0e-4).all()
Example #36
    def regression(self):
        from sklearn.datasets.samples_generator import make_regression
        # X holds the sample features, Y the sample outputs, and coef the
        # true regression coefficients; 200 samples with 1 feature each.
        X, Y, coef = make_regression(n_samples=200,
                                     n_features=1,
                                     noise=20,
                                     coef=True)
        # plot the samples and the true regression line
        plt.scatter(X, Y, color='orange')
        plt.plot(X, X * coef, color='blue', linewidth=2)
        plt.xticks(())
        plt.yticks(())
        plt.show()
Example #37
def test_select_percentile_regression_full():
    # Test whether the relative univariate feature selection
    # selects all features when '100%' is asked.
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode="percentile", param=100).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)
Example #38
def test_linear_regression_multiple_outcome(random_state=0):
    # Test multiple-outcome linear regressions
    X, y = make_regression(random_state=random_state)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    reg = LinearRegression()
    reg.fit((X), Y)
    assert reg.coef_.shape == (2, n_features)
    Y_pred = reg.predict(X)
    reg.fit(X, y)
    y_pred = reg.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
Example #39
def test_f_regression():
    """
    Test whether the F test yields meaningful results
    on a simple simulated regression problem
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    F, pv = f_regression(X, Y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.e-4).all()
Example #40
def test_linear_regression_multiple_outcome(random_state=0):
    "Test multiple-outcome linear regressions"
    X, y = make_regression(random_state=random_state)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    clf = LinearRegression(fit_intercept=True)
    clf.fit((X), Y)
    assert_equal(clf.coef_.shape, (2, n_features))
    Y_pred = clf.predict(X)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
Example #41
File: test.py Project: dmoliveira/malss
def test_regression_big():
    X, y = make_regression(n_samples=200000,
                           n_features=10,
                           n_informative=5,
                           noise=30.0,
                           random_state=0)
    X = pd.DataFrame(X)
    y = pd.Series(y)
    cls = MALSS(X, y, 'regression', n_jobs=3)
    cls.execute()
    # cls.make_report('test_regression_big')

    assert len(cls.algorithms) == 1
    assert cls.algorithms[0].best_score is not None
Example #42
def test_linear_regression_multiple_outcome(random_state=0):
    # Test multiple-outcome linear regressions
    X, y = make_regression(random_state=random_state)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    clf = LinearRegression(fit_intercept=True)
    clf.fit((X), Y)
    assert_equal(clf.coef_.shape, (2, n_features))
    Y_pred = clf.predict(X)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
Example #43
File: test.py Project: dmoliveira/malss
def test_regression_big():
    X, y = make_regression(n_samples=200000,
                           n_features=10,
                           n_informative=5,
                           noise=30.0,
                           random_state=0)
    X = pd.DataFrame(X)
    y = pd.Series(y)
    cls = MALSS(X, y, 'regression', n_jobs=3)
    cls.execute()
    # cls.make_report('test_regression_big')

    assert len(cls.algorithms) == 1
    assert cls.algorithms[0].best_score is not None
Example #44
def test_select_kbest_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the k best heuristic
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode="k_best", param=5).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #45
def make_test_regression(n_features=30, n_informative=5, n_samples=5000):
    import pandas as pd
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           n_informative=n_informative, noise=0.5,
                           shuffle=False, random_state=None)

    if False:
        idx_sel = f_regression_select(X, y, verbose=0)
        print("f_regression_select:", len(idx_sel), idx_sel)

    predictors = ["p{}".format(i) for i in range(X.shape[1])]
    target = 'y'
    df = pd.DataFrame(np.c_[X, y], columns=predictors+[target])
    # print(df.head())
    return df, predictors, target
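Usage is straightforward; for example:

df, predictors, target = make_test_regression(n_features=10, n_informative=3,
                                              n_samples=500)
print(df.shape)        # (500, 11): 10 predictor columns plus 'y'
print(predictors[:3])  # ['p0', 'p1', 'p2']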
Example #46
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="fwe", param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    assert_less(np.sum(support[5:] == 1), 2)
Example #47
def test_select_percentile_regression_full():
    # Test whether the relative univariate feature selection
    # selects all features when '100%' is asked.
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='percentile', param=100).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)
def main():
  
    # load the dataset to the two variables
    x, y = make_regression(n_samples=100, n_features=1, n_informative=1, random_state=0, noise=35) 
    
    # criteria for the gradient descent
    learning_rate        = 0.1
    convergence_criteria = 0.01 
    
    # get the slope
    slope, intercept, iterations = grad_desc(x, y, learning_rate, convergence_criteria, 1000)

    print('slope: ' + str(slope))
    print('intercept: ' + str(intercept))
    print('number of iterations: ' + str(iterations))
Example #49
def test():

    x, y = make_regression(n_samples=100, n_features=1, n_informative=1, 
                        random_state=0, noise=35) 
    m, n = np.shape(x)
    x = np.c_[ np.ones(m), x] # insert column
    alpha = 0.01 # learning rate
    theta = gradient_descent_2(alpha, x, y, 1000)

    # plot the fitted line
    y_predict = theta[0] + theta[1] * x
    pylab.plot(x[:,1],y,'o')
    pylab.plot(x,y_predict,'k-')
    pylab.show()
    print ("Done!")
Example #50
def generate_dataset(n_train, n_test, n_features, noise=0.1):
    X, y = make_regression(n_samples=int(n_train + n_test),
                           n_features=int(n_features),
                           noise=noise, random_state=101)
    X_train = X[:n_train]
    X_test = X[n_train:]
    y_train = y[:n_train]
    y_test = y[n_train:]
    X_scaler = sklearn.preprocessing.StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)
    y_scaler = StandardScaler()
    # StandardScaler expects 2-D input, so reshape y before scaling
    y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = y_scaler.transform(y_test[:, None])[:, 0]

    return X_train, X_test, y_train, y_test
def test_select_fdr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fdr heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFdr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="fdr", param=0.01).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #52
def main():
    # generate data
    X, y = make_regression(n_samples=100,
                           n_features=1,
                           n_informative=1,
                           random_state=0,
                           noise=35)
    print("X.shape = {}  \ny.shape = {}".format(X.shape, y.shape))

    alpha = 0.003
    ep = 0.001

    theta = gradient_descent(X, y, alpha, ep)
    print('theta0 = {} \ntheta1 = {}'.format(theta[0], theta[1]))

    plot_regression_line(X, y, theta)
Example #53
File: test.py Project: shoshosho/malss
def test_regression_big():
    X, y = make_regression(n_samples=200000,
                           n_features=10,
                           n_informative=5,
                           noise=30.0,
                           random_state=0)
    X = pd.DataFrame(X)
    y = pd.Series(y)
    cls = MALSS('regression').fit(X, y, 'test_regression_big')
    cls.generate_module_sample()

    from sklearn.metrics import mean_squared_error
    pred = cls.predict(X)
    print(mean_squared_error(y, pred))

    assert len(cls.algorithms) == 1
    assert cls.algorithms[0].best_score is not None
Example #54
File: utils.py Project: MechCoder/skll
def make_regression_data(num_examples=100, train_test_ratio=0.5,
                         num_features=2, sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):

    # use sklearn's make_regression to generate the data for us
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise, random_state=random_state,
                                    coef=True)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # create a list of dictionaries as the features
    feature_names = ['f{:02d}'.format(n) for n
                     in range(start_feature_num,
                              start_feature_num + num_features)]
    features = [dict(zip(feature_names, row)) for row in X]

    # convert the weights array into a dictionary for convenience
    weightdict = dict(zip(feature_names, weights))

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(n_features=feature_bins) if
                  use_feature_hashing else None)
    train_fs = FeatureSet('regression_train', train_ids,
                          labels=train_y, features=train_features,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test', test_ids,
                         labels=test_y, features=test_features,
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
Example #55
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(X, y, True,
                                       copy=to_copy, check_input=False)

    if to_copy and is_sparse:
        assert not np.may_share_memory(X_.data, X.data)
    elif to_copy:
        assert not np.may_share_memory(X_, X)
    elif is_sparse:
        assert np.may_share_memory(X_.data, X.data)
    else:
        assert np.may_share_memory(X_, X)
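As scraped, the is_sparse and to_copy arguments have no visible source; in a runnable suite they would typically come from pytest parametrization, e.g. this assumed decorator stack:

import pytest

@pytest.mark.parametrize('is_sparse', [True, False])
@pytest.mark.parametrize('to_copy', [True, False])
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    ...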
Example #56
def test_f_regression():
    # Test whether the F test yields meaningful results
    # on a simple simulated regression problem
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    F, pv = f_regression(X, y)
    assert_true((F > 0).all())
    assert_true((pv > 0).all())
    assert_true((pv < 1).all())
    assert_true((pv[:5] < 0.05).all())
    assert_true((pv[5:] > 1.e-4).all())

    # again without centering, compare with sparse
    F, pv = f_regression(X, y, center=False)
    F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)
Example #57
def test_select_heuristics_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fpr, fdr or fwe heuristics
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ['fdr', 'fpr', 'fwe']:
        X_r2 = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
        assert_less(np.sum(support[5:] == 1), 3)
def load_data(n_samples=1024, n_features=1, n_informative=1, n_targets=1, random_state=1987, bias=13.17):
    """
    Using sklearn package to generate (X, y, theta) where theta = (theta_0, theta_1, ..., theta_{n_features})^T are parameters of the linear model

    Input: See sklearn.datasets.samples_generator.make_regression for more details
    Output: X     ~ n_samples * (n_features+1) including the additional 1 vector
            y     ~ n_samples * n_targets
            theta ~ (n_features+1) * 1
    Usage: (X, y, theta) = load_data( ... )
    """

    from sklearn.datasets.samples_generator import make_regression
    X, y, theta = make_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  n_targets=n_targets,
                                  random_state=random_state,
                                  bias=bias,
                                  coef=True,)
    theta = np.insert(theta, 0, bias)
    X = np.insert(X, 0, 1, axis=1)
    return (X, y, theta)
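A usage sketch, assuming an sklearn version that still ships sklearn.datasets.samples_generator:

X, y, theta = load_data(n_samples=4, n_features=2)
print(X.shape, y.shape, theta.shape)  # (4, 3) (4,) (3,)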
Example #59
def compute_bench(alpha, n_samples, n_features, precompute):

    lasso_results = []
    lars_lasso_results = []

    n_test_samples = 0
    it = 0

    for ns in n_samples:
        for nf in n_features:
            it += 1
            print('==================')
            print('Iteration %s of %s' % (it, max(len(n_samples),
                                                  len(n_features))))
            print('==================')
            n_informative = nf // 10
            X, Y, coef_ = make_regression(n_samples=ns, n_features=nf,
                                          n_informative=n_informative,
                                          noise=0.1, coef=True)

            X /= np.sqrt(np.sum(X ** 2, axis=0))  # Normalize data

            gc.collect()
            print "- benching Lasso"
            clf = Lasso(alpha=alpha, fit_intercept=False, precompute=precompute)
            tstart = time()
            clf.fit(X, Y)
            lasso_results.append(time() - tstart)

            gc.collect()
            print "- benching LassoLars"
            clf = LassoLars(alpha=alpha, fit_intercept=False, 
		normalize=False, precompute=precompute)
            tstart = time()
            clf.fit(X, Y)
            lars_lasso_results.append(time() - tstart)

    return lasso_results, lars_lasso_results
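A usage sketch for compute_bench, assuming the Lasso, LassoLars, make_regression, gc and time imports implied above:

lasso_times, lars_times = compute_bench(alpha=0.01,
                                        n_samples=[500, 1000],
                                        n_features=[100],
                                        precompute=True)
print(lasso_times, lars_times)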
Example #60
from sklearn.metrics import mean_squared_error
from sklearn.datasets.samples_generator import make_regression

if __name__ == "__main__":
    list_n_samples = np.linspace(100, 10000, 5).astype(int)
    list_n_features = [10, 100, 1000]
    n_test = 1000
    noise = 0.1
    alpha = 0.01
    sgd_results = np.zeros((len(list_n_samples), len(list_n_features), 2))
    elnet_results = np.zeros((len(list_n_samples), len(list_n_features), 2))
    ridge_results = np.zeros((len(list_n_samples), len(list_n_features), 2))
    for i, n_train in enumerate(list_n_samples):
        for j, n_features in enumerate(list_n_features):
            X, y, coef = make_regression(
                n_samples=n_train + n_test, n_features=n_features,
                noise=noise, coef=True)

            X_train = X[:n_train]
            y_train = y[:n_train]
            X_test = X[n_train:]
            y_test = y[n_train:]

            print("=======================")
            print("Round %d %d" % (i, j))
            print("n_features:", n_features)
            print("n_samples:", n_train)

            # Shuffle data
            idx = np.arange(n_train)
            np.random.seed(13)