Example #1
from dask_ml.datasets import make_regression
from dask_ml.model_selection import train_test_split
import xgboost as xgb
from xgboost.dask import DaskDMatrix

# CustomEarlyStopping is defined elsewhere in the source; a sketch follows this example.

def main(client):
    m = 100000
    n = 100
    X, y = make_regression(n_samples=m,
                           n_features=n,
                           chunks=200,
                           random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    dtrain = DaskDMatrix(client, X_train, y_train)
    dtest = DaskDMatrix(client, X_test, y_test)

    output = xgb.dask.train(
        client,
        {
            "verbosity": 1,
            "tree_method": "hist",
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "max_depth": 6,
            "learning_rate": 1.0,
        },
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, "train"), (dtest, "test")],
        callbacks=[
            CustomEarlyStopping(validation_set="test",
                                target_metric="rmse",
                                maximize=False,
                                seed=0)
        ],
    )
    return output
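
The `CustomEarlyStopping` callback above is defined elsewhere in the source. A minimal sketch of what it could look like, assuming xgboost's `TrainingCallback` interface and a hypothetical `rounds` patience parameter (the `seed` argument is kept only for signature parity with the call above):

import xgboost as xgb

class CustomEarlyStopping(xgb.callback.TrainingCallback):
    """Stop training once `target_metric` on `validation_set` stops improving."""

    def __init__(self, validation_set, target_metric, maximize=False,
                 seed=0, rounds=10):
        self.validation_set = validation_set
        self.target_metric = target_metric
        self.maximize = maximize
        self.seed = seed        # unused in this sketch; kept for signature parity
        self.rounds = rounds    # hypothetical patience parameter
        self.best = None
        self.stale = 0

    def after_iteration(self, model, epoch, evals_log):
        # evals_log maps eval-set name -> metric name -> list of scores
        score = evals_log[self.validation_set][self.target_metric][-1]
        improved = self.best is None or (
            score > self.best if self.maximize else score < self.best)
        if improved:
            self.best, self.stale = score, 0
        else:
            self.stale += 1
        return self.stale >= self.rounds  # returning True stops training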
Example #2
import pytest
from dask_ml.datasets import make_regression
from dask_ml.linear_model import LinearRegression

# fit_intercept is parametrized so pytest exercises both cases
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_lm(fit_intercept):
    X, y = make_regression(n_samples=100, n_features=5, chunks=50)
    lr = LinearRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.predict(X)
    if fit_intercept:
        assert lr.intercept_ is not None
Example #3
from dask_ml.datasets import make_regression

def single_chunk_regression():
    """X, y pair for regression.

    `chunks=100` matches the default ``n_samples=100``, so `X` and `y`
    each consist of a single block. Useful for testing `partial_fit`
    methods (see the sketch after this example).
    """
    X, y = make_regression(chunks=100, random_state=0)
    return X, y
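
A hedged sketch of how this fixture might be used, pairing it with `dask_ml.wrappers.Incremental` around scikit-learn's `SGDRegressor` (both real APIs; the test body itself is illustrative, not from the source):

from sklearn.linear_model import SGDRegressor
from dask_ml.wrappers import Incremental

def test_partial_fit_single_chunk():
    X, y = single_chunk_regression()
    # Incremental calls partial_fit on the wrapped estimator once per block;
    # with a single block that is exactly one call
    est = Incremental(SGDRegressor(max_iter=5, tol=None, random_state=0))
    est.fit(X, y)
    assert est.estimator_.coef_.shape == (X.shape[1],)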
Example #4
import dask_ml.datasets as dask_datasets

def make_fake_regression(ncols=10, nrows=100):
    # make_regression returns an (X, y) pair of dask arrays, not a dataframe
    X, y = dask_datasets.make_regression(
        n_samples=nrows,
        n_features=ncols,
        n_informative=10,
        n_targets=1,
        bias=0.0,
        effective_rank=None,
        tail_strength=0.5,
        noise=0.0,
        shuffle=True,
        coef=False,
        random_state=None,
        chunks=None,
    )
    return X, y
Example #5
    def _prep_data(self, reg=False):
        # requires: numpy as np; dask_ml.datasets.make_regression and make_blobs
        self.n_samples = int(1e5)
        self.chunk_size = int(1e4)
        self.n_chunks = np.ceil(self.n_samples / self.chunk_size).astype(int)  # 10 chunks

        if reg:
            self.x, self.y = make_regression(n_samples=self.n_samples,
                                             chunks=self.chunk_size,
                                             random_state=0,
                                             n_features=40)
        else:
            self.x, self.y = make_blobs(n_samples=self.n_samples,
                                        chunks=self.chunk_size,
                                        random_state=0,
                                        n_features=40,
                                        centers=2,
                                        cluster_std=100)

        return self
Example #6
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

# https://www.kaggle.com/puneetgrover/speed-up-your-algorithms-dask
# dask_kaggle_Regression

# In[1]:

from dask_ml.datasets import make_regression
import dask.dataframe as dd

X, y = make_regression(n_samples=1_000_000, chunks=50000)  # n_samples must be an int

# In[2]:

df = dd.from_dask_array(X)
df.head()

# In[3]:

from dask_ml.model_selection import train_test_split, GridSearchCV

# train_test_split returns (X_train, X_test, y_train, y_test)
xtr, xval, ytr, yval = train_test_split(X, y)

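The notebook imports GridSearchCV but the cell that uses it is not shown. A plausible continuation, sketched with scikit-learn's `ElasticNet` as an assumed underlying estimator:

from sklearn.linear_model import ElasticNet

param_grid = {'alpha': [0.01, 0.1, 1.0], 'l1_ratio': [0.1, 0.5, 0.9]}
search = GridSearchCV(ElasticNet(), param_grid, cv=3)
search.fit(xtr, ytr)
print(search.best_params_)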
Example #7
import os
from dask.distributed import Client

jobid = os.getenv('SLURM_JOBID')
client = Client(scheduler_file=f'scheduler_{jobid}.json')

print(f'Job_id:{jobid}')

# In[3]:

client

# In[4]:

from dask_ml.datasets import make_regression

X, y = make_regression(n_samples=4000000,
                       n_features=32,
                       chunks=1000,
                       n_informative=10,
                       random_state=101)

# In[5]:

from dask_ml.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# In[6]:

params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100000,
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
}
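
The notebook is truncated after this parameter cell. A hedged sketch of how training might continue, assuming xgboost's dask estimator `xgboost.dask.DaskXGBRegressor` (a real API that accepts these keys; with `n_estimators` at 100000 this would be very slow, so treat it purely as an illustration):

import xgboost as xgb

reg = xgb.dask.DaskXGBRegressor(**params)
reg.client = client  # reuse the SLURM-backed client created above
reg.fit(X_train, y_train, eval_set=[(X_test, y_test)])
preds = reg.predict(X_test)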
Example #8
from dask_ml.datasets import make_regression

def xy_regression():
    """X, y pair for regression."""
    X, y = make_regression(chunks=10, random_state=0)
    return X, y