Example #1
def predict_price(total_amount, trip_distance, passenger_count):
    # Build one dataframe from the three columns and hand it to
    # dask-xgboost so prediction runs distributed across the cluster
    X = dd.concat([total_amount, trip_distance, passenger_count],
                  axis=1).astype("float64")
    return dask_xgboost.predict(client, bst, X)
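Example #1 assumes several names defined elsewhere: `dd` (dask.dataframe), a connected `client`, and a booster `bst` trained beforehand. A minimal sketch of that surrounding setup, with hypothetical file names, might look like:

import dask.dataframe as dd
import dask_xgboost
from dask.distributed import Client

client = Client()                  # or Client('scheduler-address:8786')
taxi = dd.read_csv('taxi-*.csv')   # hypothetical input files
bst = ...                          # booster returned by dask_xgboost.train

fares = predict_price(taxi['total_amount'], taxi['trip_distance'],
                      taxi['passenger_count'])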
Example #2
def test_numpy(c, s, a, b):
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2, ))
    dbst = yield dxgb.train(c, param, dX, dy)
    dbst = yield dxgb.train(c, param, dX, dy)  # we can do this twice

    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)
    _test_container(dbst, predictions, np.array)
Example #3
def test_numpy(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2, ))
    dbst = yield dxgb.train(c, param, dX, dy)
    dbst = yield dxgb.train(c, param, dX, dy)  # we can do this twice

    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)
    _test_container(dbst, predictions, np.array)
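The `(c, s, a, b)` arguments in these yield-style tests come from `distributed`'s test harness: `gen_cluster` injects a client, the scheduler, and two workers into the coroutine. A sketch of the wiring these examples presumably sit under (the decorator arguments are an assumption):

from distributed.utils_test import gen_cluster

@gen_cluster(client=True)  # passes (client, scheduler, worker_a, worker_b)
def test_numpy(c, s, a, b):
    ...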
Example #4
def test_sparse(c, s, a, b):
    dX = da.from_array(X, chunks=(2, 2)).map_blocks(sparse.COO)
    dy = da.from_array(y, chunks=(2, ))
    dbst = yield dxgb.train(c, param, dX, dy)
    dbst = yield dxgb.train(c, param, dX, dy)  # we can do this twice

    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)

    predictions_result = yield c.compute(predictions)
    _test_container(dbst, predictions_result, sparse.COO)
Example #5
def test_sparse(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2)).map_blocks(scipy.sparse.csr_matrix)
    dy = da.from_array(y, chunks=(2, ))
    dbst = yield dxgb.train(c, param, dX, dy)
    dbst = yield dxgb.train(c, param, dX, dy)  # we can do this twice

    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)

    predictions_result = yield c.compute(predictions)
    _test_container(dbst, predictions_result, scipy.sparse.csr_matrix)
Example #6
def test_daskxgboost(startDaskClient):
    client = startDaskClient
    import dask.dataframe as dd
    df = dd.read_csv('...')   # use dask.dataframe to load and
    df_train = ...            # preprocess data (placeholders, filled
    labels_train = ...        # in by real code)

    import dask_xgboost as dxgb
    params = {'objective': 'binary:logistic'}  # use normal xgboost params
    bst = dxgb.train(client, params, df_train, labels_train)

    data_test = ...           # held-out features to score (placeholder)
    predictions = dxgb.predict(client, bst, data_test)
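A runnable variant of the sketch above, substituting a synthetic dataset from dask-ml for the elided CSV preprocessing (an illustration, not the original test):

from dask.distributed import Client
from dask_ml.datasets import make_classification
import dask_xgboost as dxgb

client = Client()
X, y = make_classification(n_samples=1000, chunks=100)
params = {'objective': 'binary:logistic'}
bst = dxgb.train(client, params, X, y)
predictions = dxgb.predict(client, bst, X)  # lazy dask array
print(predictions.compute()[:5])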
Example #7
def daskml_regressor(client, data_train, data_test, labels_train, labels_test):
    print("\n\n***** Dask ml XGBoost *****")
    start = time.time()

    from config import param_grid_xgboost

    bst = dxgb.train(client, param_grid_xgboost, data_train, labels_train)
    pdxgb_train_time = str(time.time() - start)

    predictions = dxgb.predict(client, bst, data_test).persist()

    # despite the variable name, roc_auc_score reports ROC AUC, not accuracy
    accuracy = roc_auc_score(labels_test.compute(), predictions.compute())
    print("ROC AUC:", accuracy)
    print("- Done")

    return [0, pdxgb_train_time, accuracy]
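Since `labels_test` and `predictions` are both lazy, the two separate `.compute()` calls above trigger two round trips; a single `dask.compute` (a minor rewrite, not part of the original function) materializes both at once:

import dask
labels_local, preds_local = dask.compute(labels_test, predictions)
accuracy = roc_auc_score(labels_local, preds_local)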
Example #8
def main():
    object = ps.preprocess()
    X_train, X_test, y_train, y_test = object.cleaning()
    params = {'objective': 'binary:logistic',
              'max_depth': 8, 'eta': 0.01, 'subsample': 0.5,
              'min_child_weight': 1}
    print("Start training dxgb")

    cluster = LocalCluster(n_workers=8, threads_per_worker=1)
    client = Client(cluster)
    start_time = time.time()
    bst = dxgb.train(client, params, X_train, y_train)
    end_time = time.time()
    # print the elapsed time; the original printed end_time alone, which
    # produced a raw epoch stamp (1588108665) instead of a duration
    print("Training in dXGB took %d seconds" % (end_time - start_time))
    predictions = dxgb.predict(client, bst, X_test)
    # ROC AUC = 0.6968888393419537
    print("ROC AUC score is:")
    print(roc_auc_score(y_test.compute(),
                        predictions.compute()))
    client.shutdown()
Example #9
def test_basic(c, s, a, b):
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)

    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    dbst = yield dxgb.train(c, param, ddf, dlabels)
    dbst = yield dxgb.train(c, param, ddf, dlabels)  # we can do this twice

    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)

    correct = (result > 0.5) == labels
    dcorrect = (dresult > 0.5) == labels
    assert dcorrect.sum() >= correct.sum()

    predictions = dxgb.predict(c, dbst, ddf)
    assert isinstance(predictions, dd.Series)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, pd.Series)

    assert ((predictions > 0.5) != labels).sum() < 2
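`yield c.compute(predictions)._result()` is an old `distributed` idiom; on recent releases the future returned by `Client.compute` is itself awaitable, so the equivalent (a sketch, assuming a modern distributed version) is simply:

predictions = yield c.compute(predictions)   # or: await c.compute(predictions)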
Example #10
def test_numpy(c, s, a, b):
    dtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train(param, dtrain)

    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2, ))
    dbst = yield dxgb._train(c, param, dX, dy)
    dbst = yield dxgb._train(c, param, dX, dy)  # we can do this twice

    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)

    correct = (result > 0.5) == y
    dcorrect = (dresult > 0.5) == y
    assert dcorrect.sum() >= correct.sum()

    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, np.ndarray)

    assert ((predictions > 0.5) != y).sum() < 2
Example #11
# In[6]:

params = {
    'objective': 'reg:squarederror',
    # note: 'n_estimators' belongs to the sklearn wrapper; the functional
    # train() API ignores it in favor of num_boost_round below
    'n_estimators': 100000,
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    'min_child_weight': 0.5
}

bst = dask_xgboost.train(client, params, X_train, y_train, num_boost_round=100)

# In[7]:

y_hat = dask_xgboost.predict(client, bst, X_test).persist()
y_hat

# In[8]:

r = r2_score(y_test.compute(), y_hat.compute())
mae = mean_absolute_error(y_test.compute(), y_hat.compute())
mse = mean_squared_error(y_test.compute(), y_hat.compute())
print("R^2:", r)
print("MAE:", mae)
print("MSE:", mse)

# In[9]:

from dask_ml.datasets import make_classification
Example #12
for col in vars_cat:
    dx_all[col] = preprocessing.LabelEncoder().fit_transform(dx_all[col])

X_all = dx_all[vars_cat+vars_num].to_dask_array(lengths=True)      
y_all = da.where((dx_all["dep_delayed_15min"]=="Y").to_dask_array(lengths=True),1,0)  

X_train = X_all[0:d_train.shape[0],]
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0]),]
y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])]

X_train = X_train.persist()   # persist returns a new collection; rebind it
y_train = y_train.persist()

client.has_what()


param = {'objective': 'binary:logistic', 'tree_method': 'hist', 'max_depth': 10, 'eta': 0.1}
%time md = dxgb.train(client, param, X_train, y_train, num_boost_round = 100)


y_pred = dxgb.predict(client, md, X_test)
y_pred_loc = y_pred.compute()
y_test_loc = y_test.compute()
print(metrics.roc_auc_score(y_test_loc, y_pred_loc))


## m5.4xlarge 16c (8+8HT)
## Wall time: 34.3 s
## 0.7928378346764724
Example #13
def main():
    print("Setting up data directory")
    print("-------------------------")

    #flights(args.url)
    columns = ['Year', 'Month', 'DayOfWeek', 'Distance', 'DepDelay', 'Origin']
    data_dir = 'data'
    target = 'DepDelay'
    log = ''
    results = {}

    df = get_df(columns).dropna()
    is_dask = True

    client = None
    if is_dask:
        client = Client(n_workers=20,
                        threads_per_worker=20,
                        memory_limit='1GB')

    model = GradientBoostingRegressor(random_state=18)
    params = {'max_depth': [2, 3], 'n_estimators': [1, 2, 3]}
    X_train, X_test, y_train, y_test = get_data(df.copy(),
                                                target,
                                                is_dask=False,
                                                chunksize=200)
    clf_name = type(model).__name__

    clf_cv = GridSearchCV(model,
                          param_grid=params,
                          # KFold, not StratifiedKFold: the target is
                          # continuous, so stratification would raise
                          cv=KFold(n_splits=10,
                                   shuffle=True,
                                   random_state=18),
                          scoring='neg_mean_squared_error')

    with joblib.parallel_backend("dask" if is_dask else 'loky'):
        start = time.time()
        clf_cv.fit(X_train, y_train)
        end = time.time()

    y_predict_train = clf_cv.best_estimator_.predict(X_train)
    y_predict_test = clf_cv.best_estimator_.predict(X_test)

    train_error = mean_squared_error(
        y_train,
        y_predict_train,
    )
    test_error = mean_squared_error(
        y_test,
        y_predict_test,
    )
    best_params = clf_cv.best_params_

    # note: despite the label, this run used sklearn's
    # GradientBoostingRegressor via GridSearchCV
    results['Scikit XGBoost'] = {
        'train_error': train_error,
        'test_error': test_error,
        'time': end - start
    }
    log += 'Scikit XGBoost train_error: %.2f, test_error: %.2f, took: %.2f\n' % (
        train_error, test_error, end - start)

    is_dask = True
    X_train, X_test, y_train, y_test = get_data(df.copy(),
                                                target,
                                                is_dask=is_dask,
                                                chunksize=200)
    params = {
        'objective': 'reg:squarederror',
        'max_depth': 3,
        'eta': 0.01,
        'subsample': 0.5,
        'min_child_weight': 0.2
    }

    start = time.time()
    bst = dask_xgboost.train(client,
                             params,
                             X_train,
                             y_train,
                             num_boost_round=10)
    end = time.time()

    y_train_pred = dask_xgboost.predict(client, bst, X_train).persist()
    y_test_pred = dask_xgboost.predict(client, bst, X_test).persist()

    y_train, y_train_pred = dask.compute(y_train, y_train_pred)
    y_test, y_test_pred = dask.compute(y_test, y_test_pred)

    train_error = mean_squared_error(y_train, y_train_pred)
    test_error = mean_squared_error(y_test, y_test_pred)

    log += 'Dask XGBoost train_error: %.2f, test_error: %.2f, took: %.2f' % (
        train_error, test_error, end - start)
    results['Dask XGBoost'] = {
        'train_error': train_error,
        'test_error': test_error,
        'time': end - start
    }

    with open('results.txt', 'w') as outfile:
        json.dump(results, outfile)

    print('Finished!')
Example #14
def task(df, ram_to_use, is_dask):
    client = None
    if is_dask:
        client = Client(threads_per_worker=10,
                        n_workers=10,
                        memory_limit=''.join([str(ram_to_use), 'GB']))

    # when running under dask, benchmark only Ridge; otherwise run both models
    models = [
        Ridge(random_state=42),
        GradientBoostingRegressor(random_state=42),
    ][:1 if is_dask else 2]

    params = [
        {
            "alpha": [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
        },
        {
            'max_depth': [2, 3, 4, 6],
            'n_estimators': [2, 3, 4, 5],
        },
    ][:1 if is_dask else 2]

    X_train, X_test, y_train, y_test = get_dask_data(
        df.copy(), 'DepDelay') if is_dask else get_normal_data(
            df.copy(), 'DepDelay')

    for model, param in zip(models, params):
        t_start = time.time()
        results, _, _ = run_single_model(model,
                                         param,
                                         X_train,
                                         X_test,
                                         y_train,
                                         y_test,
                                         is_dask=is_dask)
        model_name = type(model).__name__
        train_error, test_error = results[model_name]['metric'][
            'mean_squared_error']
        t_end = time.time()
        time_took = round(t_end - t_start, 3)

        dict_saver = {}
        dict_saver.update(
            {'model_name': model_name + ('_dask' if is_dask else '')})
        dict_saver.update({'train_error(MSE)': train_error})
        dict_saver.update({'test_error(MSE)': test_error})
        dict_saver.update({'time': time_took})
        save_to_file(file_to_save_path, dict_saver)

        print(model_name, ':\t took ->', time_took,
              '\t with error (train, test)', (train_error, test_error))

    if is_dask:
        params = {
            'objective': 'reg:squarederror',
            'max_depth': 4,
            'eta': 0.01,
            'subsample': 0.5,
            'min_child_weight': 0.5
        }

        t_start = time.time()
        bst = dask_xgboost.train(client,
                                 params,
                                 X_train,
                                 y_train,
                                 num_boost_round=10)
        t_end = time.time()
        time_took = round(t_end - t_start, 3)

        y_train_hat = dask_xgboost.predict(client, bst, X_train).persist()
        y_test_hat = dask_xgboost.predict(client, bst, X_test).persist()

        y_train, y_train_hat = dask.compute(y_train, y_train_hat)
        y_test, y_test_hat = dask.compute(y_test, y_test_hat)

        train_error = mean_squared_error(y_train, y_train_hat)
        test_error = mean_squared_error(y_test, y_test_hat)

        dict_saver = {}
        dict_saver.update({'model_name': 'Dask XGBoost' + '_dask'})
        dict_saver.update({'train_error(MSE)': train_error})
        dict_saver.update({'test_error(MSE)': test_error})
        dict_saver.update({'time': time_took})
        save_to_file(file_to_save_path, dict_saver)

        print('Dask XGBoost', ':\t took ->', time_took,
              '\t with error (train, test)', (train_error, test_error))