def _convert_by_udf(spark: sql.SparkSession,
                    blocks: List[ObjectRef],
                    locations: List[bytes],
                    schema: StructType) -> DataFrame:
    """Convert Ray object-store blocks into a Spark DataFrame via mapInPandas.

    The blocks are registered with the named object-holder actor under a
    fresh UUID; a JVM-side RDD of object-ref locations is created (so Spark
    can schedule tasks with data locality), and each executor task fetches
    its blocks back from Ray and yields them as pandas DataFrames.

    :param spark: active SparkSession whose JVM hosts the RayDP reader.
    :param blocks: Ray object refs holding the data blocks
        (presumably Arrow tables — each fetched object exposes ``to_pandas()``).
    :param locations: serialized block locations used for locality-aware
        partition placement on the JVM side.
    :param schema: Spark schema of the resulting DataFrame.
    :return: a Spark DataFrame backed by the Ray object-store blocks.
    """
    holder = ray.get_actor(RAYDP_OBJ_HOLDER_NAME)
    df_id = uuid.uuid4()
    # Pin the blocks in the holder actor so executor tasks can retrieve
    # them by (df_id, idx) even after this driver-side list goes away.
    ray.get(holder.add_objects.remote(df_id, blocks))
    jvm = spark.sparkContext._jvm
    object_store_reader = jvm.org.apache.spark.sql.raydp.ObjectStoreReader
    # Create the RDD, then the DataFrame, to utilize locality.
    jdf = object_store_reader.createRayObjectRefDF(spark._jsparkSession, locations)
    # Capture the connection parameters now: the closure below runs on
    # executors, which must join the same Ray cluster and namespace.
    current_namespace = ray.get_runtime_context().namespace
    ray_address = ray.worker.global_worker.node.redis_address
    ray_password = ray.worker.global_worker.node.redis_password
    blocks_df = DataFrame(jdf, spark._wrapped)

    def _convert_blocks_to_dataframe(blocks):
        # Runs on Spark executors: connect to Ray if this worker hasn't yet.
        if not ray.is_initialized():
            ray.init(address=ray_address,
                     _redis_password=ray_password,
                     namespace=current_namespace,
                     logging_level=logging.WARN)
        obj_holder = ray.get_actor(RAYDP_OBJ_HOLDER_NAME)
        for block in blocks:
            # Submit all fetches first and resolve them in two batched
            # ray.get calls, instead of one blocking round-trip per index.
            indirect_refs = [obj_holder.get_object.remote(df_id, idx)
                             for idx in block["idx"]]
            data_refs = ray.get(indirect_refs)
            datas = ray.get(data_refs)
            yield pd.concat([data.to_pandas() for data in datas])

    df = blocks_df.mapInPandas(_convert_blocks_to_dataframe, schema)
    return df