Example #1
def test_random_split(spark_session):
    spark_df = spark_session.range(0, 10)
    splits = utils.random_split(spark_df, [0.7, 0.3])
    assert len(splits) == 2

    koalas_df = ks.range(0, 10)
    splits = utils.random_split(koalas_df, [0.7, 0.3])
    assert isinstance(splits[0], ks.DataFrame)
    assert isinstance(splits[1], ks.DataFrame)
    assert len(splits) == 2
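
For a reproducible split outside the test harness, a hedged standalone sketch follows; the import path and the third seed argument are assumptions inferred from Example #5 below, which calls random_split(data, [0.9, 0.1], 0).

# Hedged sketch, not part of the original test.
import databricks.koalas as ks
from raydp.utils import random_split  # assumed import path for the helper used above as utils.random_split

df = ks.range(0, 10)
train, test = random_split(df, [0.7, 0.3], 0)  # third argument taken to be a random seed, as in Example #5
assert len(train) + len(test) == 10            # the two splits together cover the original rows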
Example #2
def test_torch_estimator(spark_on_ray_small):
    # ---------------- data processing with Koalas ------------
    spark = spark_on_ray_small

    # calculate z = 3 * x + 4 * y + 5
    df: ks.DataFrame = ks.range(0, 100000)
    df["x"] = df["id"] + 100
    df["y"] = df["id"] + 1000
    df["z"] = df["x"] * 3 + df["y"] * 4 + 5
    df = df.astype("float")

    train_df, test_df = random_split(df, [0.7, 0.3])

    # ---------------- ray sgd -------------------------
    # create the model
    class LinearModel(torch.nn.Module):
        def __init__(self):
            super(LinearModel, self).__init__()
            self.linear = torch.nn.Linear(2, 1)

        def forward(self, x, y):
            x = torch.cat([x, y], dim=1)
            return self.linear(x)

    model = LinearModel()
    # create the optimizer
    optimizer = torch.optim.Adam(model.parameters())
    # create the loss
    loss = torch.nn.MSELoss()

    # create lr_scheduler

    def lr_scheduler_creator(optimizer, config):
        return torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                    milestones=[150, 250, 350],
                                                    gamma=0.1)

    # create the estimator
    estimator = TorchEstimator(num_workers=1,
                               model=model,
                               optimizer=optimizer,
                               loss=loss,
                               lr_scheduler_creator=lr_scheduler_creator,
                               feature_columns=["x", "y"],
                               label_column="z",
                               batch_size=1000,
                               num_epochs=2,
                               use_gpu=False)

    # train the model
    estimator.fit_on_spark(train_df, test_df)

    estimator.shutdown()
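
As a hedged follow-up to the test above (not part of the original), the fitted model can be pulled back to the driver with get_model(), which Example #3 below uses, and sanity-checked against the generating relation z = 3 * x + 4 * y + 5; the two tensors below match the LinearModel.forward(x, y) signature.

# Hedged sketch: retrieve the trained model (call this before estimator.shutdown())
# and compare one prediction with the exact value 3 * 101 + 4 * 1001 + 5 = 4312.
trained = estimator.get_model()
trained.eval()
with torch.no_grad():
    pred = trained(torch.tensor([[101.0]]), torch.tensor([[1001.0]]))
print(float(pred), 4312.0)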
Example #3
def test_torch_estimator(ray_cluster):
    # ---------------- data processing with Koalas ------------
    app_name = "A simple example for spark on ray"
    context.init_spark(app_name, 2, 1, "500MB")

    # calculate z = 3 * x + 4 * y + 5
    df: ks.DataFrame = ks.range(0, 100000)
    df["x"] = df["id"] + 100
    df["y"] = df["id"] + 1000
    df["z"] = df["x"] * 3 + df["y"] * 4 + 5
    df = df.astype("float")

    train_df, test_df = random_split(df, [0.7, 0.3])

    # ---------------- ray sgd -------------------------
    # create the model
    model = torch.nn.Sequential(torch.nn.Linear(2, 1))
    # create the optimizer
    optimizer = torch.optim.Adam(model.parameters())
    # create the loss
    loss = torch.nn.MSELoss()
    # create lr_scheduler

    def lr_scheduler_creator(optimizer, config):
        return torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[150, 250, 350], gamma=0.1)

    # create the estimator
    estimator = TorchEstimator(num_workers=2,
                               model=model,
                               optimizer=optimizer,
                               loss=loss,
                               lr_scheduler_creator=lr_scheduler_creator,
                               feature_columns=["x", "y"],
                               label_column="z",
                               batch_size=1000,
                               num_epochs=2)

    # train the model
    estimator.fit_on_spark(train_df)
    # evaluate the model
    estimator.evaluate_on_spark(test_df)

    # get the model
    model = estimator.get_model()
    print(list(model.parameters()))

    estimator.shutdown()
    context.stop_spark()
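
A hedged extra step, not in the test: the object returned by get_model() is a plain torch.nn.Module, so it can be persisted and restored with standard PyTorch calls; the file name is illustrative.

# Hedged sketch: save and reload the retrieved model with plain PyTorch.
torch.save(model.state_dict(), "linear_model.pt")       # illustrative file name
restored = torch.nn.Sequential(torch.nn.Linear(2, 1))   # same architecture as defined above
restored.load_state_dict(torch.load("linear_model.pt"))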
Example #4
def test_tf_estimator(ray_cluster):
    spark = context.init_spark("test", 2, 1, "500M")

    # ---------------- data processing with Spark ------------
    # calculate z = 3 * x + 4 * y + 5
    df: pyspark.sql.DataFrame = spark.range(0, 100000)
    df = df.withColumn("x", rand() * 100)  # add x column
    df = df.withColumn("y", rand() * 1000)  # ad y column
    df = df.withColumn("z", df.x * 3 + df.y * 4 + rand() + 5)  # ad z column
    df = df.select(df.x, df.y, df.z)

    train_df, test_df = random_split(df, [0.7, 0.3])

    # create model
    input_1 = keras.Input(shape=(1, ))
    input_2 = keras.Input(shape=(1, ))

    concatenated = keras.layers.concatenate([input_1, input_2])
    output = keras.layers.Dense(1, activation='sigmoid')(concatenated)
    model = keras.Model(inputs=[input_1, input_2], outputs=output)

    optimizer = keras.optimizers.Adam(0.01)
    loss = keras.losses.MeanSquaredError()

    estimator = TFEstimator(num_workers=2,
                            model=model,
                            optimizer=optimizer,
                            loss=loss,
                            metrics=["accuracy", "mse"],
                            feature_columns=["x", "y"],
                            label_column="z",
                            batch_size=1000,
                            num_epochs=2,
                            config={"fit_config": {
                                "steps_per_epoch": 2
                            }})

    estimator.fit_on_spark(train_df)
    estimator.evaluate_on_spark(test_df)

    estimator.shutdown()
    context.stop_spark()
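
A hedged local check, not part of the test: the Keras model above takes two separate inputs, so a driver-side predict call passes a list of two arrays. This exercises the locally built model; its weights do not necessarily reflect the distributed training, and the sigmoid output stays in (0, 1).

# Hedged sketch: shape check on the two-input Keras model defined above.
import numpy as np

x = np.array([[50.0]], dtype="float32")    # illustrative x value
y = np.array([[500.0]], dtype="float32")   # illustrative y value
pred = model.predict([x, y])               # two inputs -> one sigmoid output
print(pred.shape, float(pred[0, 0]))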
Example #5
# After ray.init, you can use the RayDP API to get a Spark session
app_name = "NYC Taxi Fare Prediction with RayDP"
num_executors = 1
cores_per_executor = 1
memory_per_executor = "500M"
spark = raydp.init_spark(app_name, num_executors, cores_per_executor,
                         memory_per_executor)
data = spark.read.format("csv").option("header", "true") \
        .option("inferSchema", "true") \
        .load(NYC_TRAIN_CSV)
# Set spark timezone for processing datetime
spark.conf.set("spark.sql.session.timeZone", "UTC")
# Transform the dataset
data = nyc_taxi_preprocess(data)
# Split data into train_dataset and test_dataset
train_df, test_df = random_split(data, [0.9, 0.1], 0)
# Convert the Spark DataFrames into Ray MLDatasets
train_dataset = RayMLDataset.from_spark(train_df, 2, 32)
test_dataset = RayMLDataset.from_spark(test_df, 2, 32)
# Then convert them into RayDMatrix objects for distributed XGBoost
dtrain = RayDMatrix(train_dataset, label='fare_amount')
dtest = RayDMatrix(test_dataset, label='fare_amount')
# Configure the XGBoost model
config = {
    "tree_method": "hist",
    "eval_metric": ["logloss", "error"],
}
evals_result = {}
# Train the model
bst = train(config,
            dtrain,