def test_random_split(spark_session):
    spark_df = spark_session.range(0, 10)
    splits = utils.random_split(spark_df, [0.7, 0.3])
    assert len(splits) == 2

    koalas_df = ks.range(0, 10)
    splits = utils.random_split(koalas_df, [0.7, 0.3])
    assert isinstance(splits[0], ks.DataFrame)
    assert isinstance(splits[1], ks.DataFrame)
    assert len(splits) == 2
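
# For reference, a minimal sketch of what a random_split helper like the one
# exercised above could look like. This is an assumption about the shape of
# the API, not raydp's actual implementation: it delegates to Spark's
# DataFrame.randomSplit and round-trips koalas input through the underlying
# Spark DataFrame, which matches the behavior the test asserts.
def random_split_sketch(df, weights, seed=None):
    is_koalas = isinstance(df, ks.DataFrame)
    spark_df = df.to_spark() if is_koalas else df
    splits = spark_df.randomSplit(weights, seed=seed)
    if is_koalas:
        # convert each split back into a koalas DataFrame
        splits = [s.to_koalas() for s in splits]
    return splits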
def test_torch_estimator(spark_on_ray_small):
    # ---------------- data process with koalas ------------
    spark = spark_on_ray_small

    # calculate z = 3 * x + 4 * y + 5
    df: ks.DataFrame = ks.range(0, 100000)
    df["x"] = df["id"] + 100
    df["y"] = df["id"] + 1000
    df["z"] = df["x"] * 3 + df["y"] * 4 + 5
    df = df.astype("float")
    train_df, test_df = random_split(df, [0.7, 0.3])

    # ---------------- ray sgd -------------------------
    # create the model
    class LinearModel(torch.nn.Module):
        def __init__(self):
            super(LinearModel, self).__init__()
            self.linear = torch.nn.Linear(2, 1)

        def forward(self, x, y):
            x = torch.cat([x, y], dim=1)
            return self.linear(x)

    model = LinearModel()

    # create the optimizer
    optimizer = torch.optim.Adam(model.parameters())

    # create the loss
    loss = torch.nn.MSELoss()

    # create lr_scheduler
    def lr_scheduler_creator(optimizer, config):
        return torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[150, 250, 350], gamma=0.1)

    # create the estimator
    estimator = TorchEstimator(num_workers=1,
                               model=model,
                               optimizer=optimizer,
                               loss=loss,
                               lr_scheduler_creator=lr_scheduler_creator,
                               feature_columns=["x", "y"],
                               label_column="z",
                               batch_size=1000,
                               num_epochs=2,
                               use_gpu=False)

    # train the model
    estimator.fit_on_spark(train_df, test_df)
    estimator.shutdown()
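
# Independent of TorchEstimator, the same two-input linear model can be
# sanity-checked locally with plain PyTorch against the z = 3 * x + 4 * y + 5
# target. A minimal illustrative sketch, not part of the test suite:
def check_two_input_linear_model():
    torch.manual_seed(0)
    x = torch.rand(1024, 1)
    y = torch.rand(1024, 1)
    z = x * 3 + y * 4 + 5

    model = torch.nn.Linear(2, 1)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    loss_fn = torch.nn.MSELoss()
    for _ in range(500):
        optimizer.zero_grad()
        pred = model(torch.cat([x, y], dim=1))
        loss = loss_fn(pred, z)
        loss.backward()
        optimizer.step()
    # after training, the weight should be close to [3, 4] and the bias to 5
    print(model.weight.data, model.bias.data)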
def test_torch_estimator(ray_cluster):
    # ---------------- data process with koalas ------------
    app_name = "A simple example for spark on ray"
    context.init_spark(app_name, 2, 1, "500MB")

    # calculate z = 3 * x + 4 * y + 5
    df: ks.DataFrame = ks.range(0, 100000)
    df["x"] = df["id"] + 100
    df["y"] = df["id"] + 1000
    df["z"] = df["x"] * 3 + df["y"] * 4 + 5
    df = df.astype("float")
    train_df, test_df = random_split(df, [0.7, 0.3])

    # ---------------- ray sgd -------------------------
    # create the model
    model = torch.nn.Sequential(torch.nn.Linear(2, 1))

    # create the optimizer
    optimizer = torch.optim.Adam(model.parameters())

    # create the loss
    loss = torch.nn.MSELoss()

    # create lr_scheduler
    def lr_scheduler_creator(optimizer, config):
        return torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[150, 250, 350], gamma=0.1)

    # create the estimator
    estimator = TorchEstimator(num_workers=2,
                               model=model,
                               optimizer=optimizer,
                               loss=loss,
                               lr_scheduler_creator=lr_scheduler_creator,
                               feature_columns=["x", "y"],
                               label_column="z",
                               batch_size=1000,
                               num_epochs=2)

    # train the model
    estimator.fit_on_spark(train_df)

    # evaluate the model
    estimator.evaluate_on_spark(test_df)

    # get the trained model
    model = estimator.get_model()
    print(list(model.parameters()))

    estimator.shutdown()
    context.stop_spark()
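
# The lr_scheduler_creator above returns a MultiStepLR, which multiplies the
# learning rate by gamma=0.1 each time the scheduler's step counter crosses a
# milestone (here 150, 250 and 350). A small standalone demonstration of that
# decay behavior, independent of the estimator:
def show_multistep_lr_decay():
    opt = torch.optim.Adam([torch.zeros(1, requires_grad=True)], lr=0.001)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        opt, milestones=[150, 250, 350], gamma=0.1)
    for step in range(400):
        opt.step()
        scheduler.step()
        if step + 1 in (150, 250, 350):
            # the lr drops to 1e-4, 1e-5, then 1e-6 at the milestones
            print(step + 1, opt.param_groups[0]["lr"])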
def test_tf_estimator(ray_cluster):
    spark = context.init_spark("test", 2, 1, "500M")

    # ---------------- data process with Spark ------------
    # calculate z = 3 * x + 4 * y + 5
    df: pyspark.sql.DataFrame = spark.range(0, 100000)
    df = df.withColumn("x", rand() * 100)  # add x column
    df = df.withColumn("y", rand() * 1000)  # add y column
    df = df.withColumn("z", df.x * 3 + df.y * 4 + rand() + 5)  # add z column
    df = df.select(df.x, df.y, df.z)
    train_df, test_df = random_split(df, [0.7, 0.3])

    # create model
    input_1 = keras.Input(shape=(1,))
    input_2 = keras.Input(shape=(1,))
    concatenated = keras.layers.concatenate([input_1, input_2])
    output = keras.layers.Dense(1, activation='sigmoid')(concatenated)
    model = keras.Model(inputs=[input_1, input_2], outputs=output)

    optimizer = keras.optimizers.Adam(0.01)
    loss = keras.losses.MeanSquaredError()

    estimator = TFEstimator(num_workers=2,
                            model=model,
                            optimizer=optimizer,
                            loss=loss,
                            metrics=["accuracy", "mse"],
                            feature_columns=["x", "y"],
                            label_column="z",
                            batch_size=1000,
                            num_epochs=2,
                            config={"fit_config": {"steps_per_epoch": 2}})

    estimator.fit_on_spark(train_df)
    estimator.evaluate_on_spark(test_df)
    estimator.shutdown()
    context.stop_spark()
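
# The same two-input Keras model can be exercised locally on synthetic NumPy
# data to see how the feature_columns ["x", "y"] map onto the two model
# inputs. An illustrative sketch, independent of TFEstimator (it uses a linear
# output head instead of the sigmoid in the test above):
def check_two_input_keras_model():
    import numpy as np

    x = np.random.rand(1000, 1).astype("float32")
    y = np.random.rand(1000, 1).astype("float32")
    z = x * 3 + y * 4 + 5

    input_1 = keras.Input(shape=(1,))
    input_2 = keras.Input(shape=(1,))
    concatenated = keras.layers.concatenate([input_1, input_2])
    output = keras.layers.Dense(1)(concatenated)
    model = keras.Model(inputs=[input_1, input_2], outputs=output)
    model.compile(optimizer=keras.optimizers.Adam(0.01),
                  loss=keras.losses.MeanSquaredError())
    # each element of the input list feeds one of the two feature columns
    model.fit([x, y], z, batch_size=100, epochs=5, verbose=0)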
# After ray.init, you can use the raydp API to get a Spark session
app_name = "NYC Taxi Fare Prediction with RayDP"
num_executors = 1
cores_per_executor = 1
memory_per_executor = "500M"
spark = raydp.init_spark(app_name, num_executors, cores_per_executor,
                         memory_per_executor)
data = spark.read.format("csv").option("header", "true") \
    .option("inferSchema", "true") \
    .load(NYC_TRAIN_CSV)

# Set the Spark timezone for processing datetime columns
spark.conf.set("spark.sql.session.timeZone", "UTC")
# Transform the dataset
data = nyc_taxi_preprocess(data)
# Split the data into train_df and test_df
train_df, test_df = random_split(data, [0.9, 0.1], 0)
# Convert the Spark dataframes into ML Datasets
train_dataset = RayMLDataset.from_spark(train_df, 2, 32)
test_dataset = RayMLDataset.from_spark(test_df, 2, 32)
# Then convert them into the DMatrix format used by XGBoost
dtrain = RayDMatrix(train_dataset, label='fare_amount')
dtest = RayDMatrix(test_dataset, label='fare_amount')
# Configure the XGBoost model
config = {
    "tree_method": "hist",
    "eval_metric": ["logloss", "error"],
}
evals_result = {}
# Train the model. The original snippet is truncated after dtrain; the
# evals/evals_result arguments below are assumed, following typical
# xgboost_ray usage.
bst = train(config,
            dtrain,
            evals=[(dtest, "eval")],
            evals_result=evals_result)
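
# After training, the booster can be used for distributed inference. This
# follow-up is an assumption based on typical xgboost_ray usage (predict is
# imported from xgboost_ray alongside train), not part of the original
# example:
preds = predict(bst, dtest)
print(preds[:10])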