def spark_on_ray_small(request):
    """Pytest fixture: start a 2-CPU local Ray cluster plus a small RayDP
    Spark session, and register teardown of both on the requesting test."""
    ray.init(num_cpus=2, include_dashboard=False)
    spark = raydp.init_spark("test", 1, 1, "500 M")

    def _teardown():
        # Stop Spark first: it runs on top of the Ray cluster.
        raydp.stop_spark()
        ray.shutdown()

    request.addfinalizer(_teardown)
    return spark
def spark_on_ray_small(request):
    """Pytest fixture: start a local Ray cluster (Java workers enabled,
    redis password set) and a small RayDP Spark session; both are torn
    down via a test finalizer."""
    ray.init(
        include_java=True,
        redis_password="******",
        include_dashboard=False,
    )
    spark = raydp.init_spark("test", 1, 1, "500 M")

    def _teardown():
        # Spark must be stopped before the Ray cluster hosting it.
        raydp.stop_spark()
        ray.shutdown()

    request.addfinalizer(_teardown)
    return spark
def process_data():
    """Load and preprocess the NYC taxi CSV through a RayDP Spark session.

    Returns:
        A tuple ``(torch_dataset, num_features)`` where the torch dataset
        uses every column except ``fare_amount`` as features and
        ``fare_amount`` as the label.
    """
    app_name = "NYC Taxi Fare Prediction with RayDP"
    num_executors = 1
    cores_per_executor = 1
    memory_per_executor = "500M"
    # Use RayDP to perform data processing
    spark = raydp.init_spark(
        app_name, num_executors, cores_per_executor, memory_per_executor)
    df = (spark.read.format("csv")
          .option("header", "true")
          .option("inferSchema", "true")
          .load(NYC_TRAIN_CSV))
    # Set spark timezone for processing datetime
    spark.conf.set("spark.sql.session.timeZone", "UTC")
    df = nyc_taxi_preprocess(df)
    ml_dataset = RayMLDataset.from_spark(df, 1, args.batch_size)
    feature_cols = [
        f.name for f in list(df.schema) if f.name != "fare_amount"
    ]
    torch_ds = ml_dataset.to_torch(
        feature_columns=feature_cols, label_column="fare_amount")
    return torch_ds, len(feature_cols)
# Please refer to https://docs.ray.io/en/latest/xgboost-ray.html to install it.
from xgboost_ray import RayDMatrix, train, RayParams

# FIX: `ray` was used below (ray.init()) but never imported, which raises
# NameError at runtime; the import is now explicit.
import ray
import raydp
from raydp.utils import random_split
from raydp.spark import RayMLDataset

from data_process import nyc_taxi_preprocess, NYC_TRAIN_CSV

# connect to ray cluster
# ray.init(address='auto')
ray.init()

# After ray.init, you can use the raydp api to get a spark session
app_name = "NYC Taxi Fare Prediction with RayDP"
num_executors = 1
cores_per_executor = 1
memory_per_executor = "500M"
spark = raydp.init_spark(app_name, num_executors, cores_per_executor,
                         memory_per_executor)

data = spark.read.format("csv").option("header", "true") \
    .option("inferSchema", "true") \
    .load(NYC_TRAIN_CSV)

# Set spark timezone for processing datetime
spark.conf.set("spark.sql.session.timeZone", "UTC")

# Transform the dataset
data = nyc_taxi_preprocess(data)

# Split data into train_dataset and test_dataset
train_df, test_df = random_split(data, [0.9, 0.1], 0)

# Convert spark dataframe into ML Dataset (2 shards, batch size 32)
train_dataset = RayMLDataset.from_spark(train_df, 2, 32)
test_dataset = RayMLDataset.from_spark(test_df, 2, 32)

# Then convert them into DMatrix used by xgboost
dtrain = RayDMatrix(train_dataset, label='fare_amount')
dtest = RayDMatrix(test_dataset, label='fare_amount')
def __init__(self):
    """Create a RayDP-backed Spark session for the remote test actor."""
    self.spark = raydp.init_spark(
        app_name="test_spark_remote",
        num_executors=1,
        executor_cores=1,
        executor_memory="500MB",
    )
.drop("dropoff_latitude") \ .drop("passenger_count") \ .drop("key") return data def nyc_taxi_preprocess(data): data = clean_up(data) data = add_time_features(data) data = add_distance_features(data) return drop_col(data) if __name__ == '__main__': import ray import raydp ray.init() spark = raydp.init_spark("NYCTAXI data processing", num_executors=1, executor_cores=1, executor_memory='500M', configs={"spark.shuffle.service.enabled": "true"}) data = spark.read.format("csv") \ .option("header", "true") \ .option("inferSchema", "true") \ .load(NYC_TRAIN_CSV) # Set spark timezone for processing datetime spark.conf.set("spark.sql.session.timeZone", "UTC") # Transform the dataset data = nyc_taxi_preprocess(data)
def test_spark(ray_cluster):
    """Smoke test: a RayDP Spark session can run a trivial count job."""
    session = raydp.init_spark("test", 1, 1, "500 M")
    row_count = session.range(0, 10).count()
    assert row_count == 10
    raydp.stop_spark()