示例#1
0
def _map_to_lgb_ray_params(params: Dict[str, Any]) -> Any:
    """Translate generic scaling options into a ``lightgbm_ray.RayParams``.

    Recognised keys in ``params``:

    * ``"num_workers"`` is forwarded as ``num_actors``.
    * ``"resources_per_worker"`` is a mapping whose ``"CPU"`` and ``"GPU"``
      entries, when present, are forwarded as ``cpus_per_actor`` and
      ``gpus_per_actor``. A missing or ``None`` value is skipped.

    Every other key is ignored.

    Args:
        params: Scaling options keyed by name.

    Returns:
        A ``RayParams`` instance (not a dict) built from the recognised
        options, with ``allow_less_than_two_cpus`` set to ``True``.
    """
    # Imported lazily so lightgbm_ray is only needed when this is called.
    from lightgbm_ray import RayParams

    ray_kwargs: Dict[str, Any] = {}
    if "num_workers" in params:
        ray_kwargs["num_actors"] = params["num_workers"]

    # ``None`` would make the membership tests below raise TypeError, so
    # treat it the same as "no resource overrides".
    resources = params.get("resources_per_worker") or {}
    if "CPU" in resources:
        ray_kwargs["cpus_per_actor"] = resources["CPU"]
    if "GPU" in resources:
        ray_kwargs["gpus_per_actor"] = resources["GPU"]

    ray_params = RayParams(**ray_kwargs)
    # Set after construction, exactly as the options above may request
    # fewer than two CPUs per actor.
    ray_params.allow_less_than_two_cpus = True
    return ray_params
示例#2
0
from lightgbm_ray import RayParams

from ray.util.lightgbm.release_test_util import (
    train_ray,
    FailureState,
    FailureInjection,
    TrackingCallback,
)

if __name__ == "__main__":
    ray.init(address="auto")

    failure_state = FailureState.remote()

    ray_params = RayParams(max_actor_restarts=2,
                           num_actors=4,
                           cpus_per_actor=4,
                           gpus_per_actor=0)

    _, additional_results, _ = train_ray(
        path="/data/classification.parquet",
        num_workers=None,
        num_boost_rounds=100,
        num_files=200,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        lightgbm_params=None,
        callbacks=[
            TrackingCallback(),
            FailureInjection(id="first_fail",
                             state=failure_state,
示例#3
0
from ray.util.lightgbm.release_test_util import train_ray

if __name__ == "__main__":
    addr = os.environ.get("RAY_ADDRESS")
    job_name = os.environ.get("RAY_JOB_NAME", "train_small")
    if addr.startswith("anyscale://"):
        ray.init(address=addr, job_name=job_name)
    else:
        ray.init(address="auto")

    output = os.environ["TEST_OUTPUT_JSON"]
    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=4,
        cpus_per_actor=4,
        gpus_per_actor=0,
    )

    start = time.time()

    @ray.remote(num_cpus=0)
    def train():
        os.environ["TEST_OUTPUT_JSON"] = output
        train_ray(
            path="/data/classification.parquet",
            num_workers=4,
            num_boost_rounds=100,
            num_files=25,
            regression=False,
示例#4
0
        lightgbm_params=config,
    )


if __name__ == "__main__":
    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    ray.init(address="auto")

    ray_params = RayParams(elastic_training=False,
                           max_actor_restarts=2,
                           num_actors=32,
                           cpus_per_actor=1,
                           gpus_per_actor=0)

    start = time.time()
    analysis = tune.run(tune.with_parameters(train_wrapper,
                                             ray_params=ray_params),
                        config=search_space,
                        num_samples=4,
                        resources_per_trial=ray_params.get_tune_resources())
    taken = time.time() - start

    result = {
        "time_taken": taken,
        "trial_states":
        dict(Counter([trial.status for trial in analysis.trials]))
示例#5
0
# Build a binary classification target: quality >= 6 is labelled 1, else 0.
df['goodquality'] = [1 if x >= 6 else 0 for x in df['quality']]
X = df.drop(['quality', 'goodquality'], axis=1)
y = df['goodquality']
print(df['goodquality'].value_counts())

# Keep a handle on the unscaled features, then standardize them.
X_features = X
X = StandardScaler().fit_transform(X)
# Hold out 25% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=0)

# Parallelism is configured via RayParams(num_actors=...) in fit() below.
# In LightGBM-Ray, passing n_jobs here would also set the number of actors.
model = RayLGBMClassifier(random_state=42)

start = time.time()
model.fit(X=X_train, y=y_train, ray_params=RayParams(num_actors=3))
print(f"executed LightGBM in {time.time() - start}")
y_pred = model.predict(X_test)
# Binarize the predictions in place using a 0.5 threshold.
# NOTE(review): predict() on a classifier presumably already returns class
# labels, which would make this loop a no-op -- confirm before removing.
for i, score in enumerate(y_pred):
    y_pred[i] = 1 if score >= .5 else 0
print(classification_report(y_test, y_pred))
示例#6
0
def train_ray(path,
              num_workers,
              num_boost_rounds,
              num_files=0,
              regression=False,
              use_gpu=False,
              ray_params=None,
              lightgbm_params=None,
              **kwargs):
    """Train a distributed LightGBM model on parquet data and time it.

    Args:
        path: File or directory holding the training data; ``~`` is
            expanded.
        num_workers: Number of actors used to shard the data matrix and,
            when ``ray_params`` is not given, to train.
        num_boost_rounds: Number of boosting rounds passed to ``train``.
        num_files: When non-zero, train on this many parquet files found
            below ``path``. The sorted file list is repeated if fewer
            files exist than requested.
        regression: Use the regression objective/metrics instead of
            binary classification.
        use_gpu: Set the LightGBM device to ``"gpu"``, use the device
            quantile matrix when ``cupy`` is importable, and request one
            GPU per actor in the default ``RayParams``.
        ray_params: Optional ``RayParams``; a default with two CPUs per
            actor and two actor restarts is built when omitted.
        lightgbm_params: Optional dict merged over the default LightGBM
            config.
        **kwargs: Forwarded unchanged to ``train``.

    Returns:
        Tuple ``(bst, additional_results, taken)``: the trained model, the
        dict filled in by ``train`` and the wall-clock training time in
        seconds.

    Raises:
        ValueError: If ``path`` does not exist, or if ``num_files`` is set
            and no parquet files are found below ``path``.
    """
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        raise ValueError(f"Path does not exist: {path}")

    if num_files:
        # NOTE: without ``recursive=True`` the ``**`` segment behaves like
        # ``*``, so only files exactly one directory below ``path`` match.
        files = sorted(glob.glob(f"{path}/**/*.parquet"))
        if not files:
            # Repeating an empty list can never reach ``num_files``; fail
            # loudly instead of looping forever.
            raise ValueError(f"No parquet files found under: {path}")
        # Repeat the file list often enough to cover ``num_files`` (ceiling
        # division), then cut it to size.
        repeats = max(1, -(-num_files // len(files)))
        path = (files * repeats)[:num_files]

    # The device matrix is only used when GPU training was requested and
    # cupy is actually installed.
    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401
            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    matrix_cls = RayDeviceQuantileDMatrix if use_device_matrix else RayDMatrix
    dtrain = matrix_cls(path,
                        num_actors=num_workers,
                        label="labels",
                        ignore=["partition"],
                        filetype=RayFileType.PARQUET)

    device = "gpu" if use_gpu else "cpu"
    config = {"device": device}

    if not regression:
        # Classification
        config.update({
            "objective": "binary",
            "metric": ["binary_logloss", "binary_error"],
        })
    else:
        # Regression
        config.update({
            "objective": "regression",
            "metric": ["l2", "rmse"],
        })

    # Caller-supplied parameters take precedence over the defaults above.
    if lightgbm_params:
        config.update(lightgbm_params)

    if not ray_params:
        ray_params = RayParams(max_actor_restarts=2,
                               num_actors=num_workers,
                               cpus_per_actor=2,
                               gpus_per_actor=1 if use_gpu else 0)

    start = time.time()
    evals_result = {}
    additional_results = {}
    bst = train(config,
                dtrain,
                evals_result=evals_result,
                additional_results=additional_results,
                num_boost_round=num_boost_rounds,
                ray_params=ray_params,
                evals=[(dtrain, "train")],
                **kwargs)
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    out_file = os.path.expanduser("~/benchmark_{}.lgbm".format(device))
    bst.booster_.save_model(out_file)

    # Report the last recorded value of the default error metric.
    final_metric = "rmse" if regression else "binary_error"
    print("Final training error: {:.4f}".format(
        evals_result["train"][final_metric][-1]))
    return bst, additional_results, taken