Example #1
import logging
import uuid
from typing import List

import pandas as pd
import ray
from pyspark import sql
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType
from ray import ObjectRef

# Name of RayDP's long-lived object-holder actor. The real constant is
# defined elsewhere in the RayDP package; the value here is an assumption
# made so the example is self-contained.
RAYDP_OBJ_HOLDER_NAME = "raydp_obj_holder"


def _convert_by_udf(spark: sql.SparkSession, blocks: List[ObjectRef],
                    locations: List[bytes], schema: StructType) -> DataFrame:
    # Register the blocks with the named holder actor under a fresh UUID so
    # Spark executors can later fetch each object ref by (df_id, index).
    holder = ray.get_actor(RAYDP_OBJ_HOLDER_NAME)
    df_id = uuid.uuid4()
    ray.get(holder.add_objects.remote(df_id, blocks))
    jvm = spark.sparkContext._jvm
    object_store_reader = jvm.org.apache.spark.sql.raydp.ObjectStoreReader
    # Create the RDD and then the DataFrame of object refs so that Spark can
    # schedule tasks with data locality.
    jdf = object_store_reader.createRayObjectRefDF(spark._jsparkSession,
                                                   locations)
    # Capture the driver's Ray connection info; the executor-side UDF below
    # needs it to join the same Ray cluster and namespace.
    current_namespace = ray.get_runtime_context().namespace
    ray_address = ray.worker.global_worker.node.redis_address
    ray_password = ray.worker.global_worker.node.redis_password
    blocks_df = DataFrame(jdf, spark._wrapped)

    def _convert_blocks_to_dataframe(blocks):
        # Runs on Spark executors: re-connect to the Ray cluster if this
        # worker process has not initialized Ray yet.
        if not ray.is_initialized():
            ray.init(address=ray_address,
                     _redis_password=ray_password,
                     namespace=current_namespace,
                     logging_level=logging.WARN)
        obj_holder = ray.get_actor(RAYDP_OBJ_HOLDER_NAME)
        # Each incoming pandas batch carries an "idx" column pointing at the
        # object refs registered with the holder; fetch and concatenate them.
        for block in blocks:
            dfs = []
            for idx in block["idx"]:
                ref = ray.get(obj_holder.get_object.remote(df_id, idx))
                data = ray.get(ref)
                dfs.append(data.to_pandas())
            yield pd.concat(dfs)

    # Apply the UDF over the ref-carrying DataFrame to materialize the
    # actual rows with the requested schema.
    df = blocks_df.mapInPandas(_convert_blocks_to_dataframe, schema)
    return df
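
For context, a minimal sketch of how this helper might be driven end to end. It assumes a local Ray cluster on an older Ray release where the redis_address internals used above still exist, a SparkSession obtained through raydp.init_spark (RayDP's documented entry point), and that the object-holder actor named RAYDP_OBJ_HOLDER_NAME has already been created by RayDP. The Arrow block, empty location bytes, and schema below are placeholder inputs for illustration, not values RayDP itself produces:

import ray
import raydp
import pyarrow as pa
from pyspark.sql.types import LongType, StructField, StructType

ray.init()
# Start a Spark session on the Ray cluster (2 executors, 1 core each).
spark = raydp.init_spark(app_name="refs_to_df", num_executors=2,
                         executor_cores=1, executor_memory="1G")

# Placeholder inputs: one Arrow table block and an empty location entry.
blocks = [ray.put(pa.table({"id": [1, 2, 3]}))]
locations = [b""]
schema = StructType([StructField("id", LongType())])

df = _convert_by_udf(spark, blocks, locations, schema)
df.show()

raydp.stop_spark()
ray.shutdown()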