def test_to_spatial_rdd_df_and_geom_field_name(self):
    spatial_df = self._create_spatial_point_table()

    spatial_rdd = Adapter.toSpatialRdd(spatial_df, "geom")
    spatial_rdd = Adapter.toSpatialRdd(spatial_df, "s")
    spatial_rdd.analyze()

    assert spatial_rdd.approximateTotalCount == 121960
    assert spatial_rdd.boundaryEnvelope == Envelope(
        -179.147236, 179.475569, -14.548699, 71.35513400000001)
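# A hedged sketch of the `_create_spatial_point_table` helper the test above
# relies on: it presumably loads a lon/lat CSV and builds ST_Point geometries.
# `csv_point_input_location` and the method body are assumptions, and the real
# helper must also expose the "s" column the test converts by.
def _create_spatial_point_table_sketch(self):
    self.spark.read.\
        format("csv").\
        option("delimiter", ",").\
        option("header", "false").\
        load(csv_point_input_location).\
        createOrReplaceTempView("pointtable")
    return self.spark.sql(
        "select ST_Point(cast(pointtable._c0 as Decimal(24,20)), "
        "cast(pointtable._c1 as Decimal(24,20))) as geom from pointtable")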
def test_distance_join_result_to_dataframe(self):
    point_csv_df = self.spark.\
        read.\
        format("csv").\
        option("delimiter", ",").\
        option("header", "false").\
        load(area_lm_point_input_location)
    point_csv_df.createOrReplaceTempView("pointtable")

    point_df = self.spark.sql(
        "select ST_Point(cast(pointtable._c0 as Decimal(24,20)), "
        "cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable")
    point_rdd = Adapter.toSpatialRdd(point_df, "arealandmark")
    point_rdd.analyze()

    polygon_wkt_df = self.spark.read.\
        format("csv").\
        option("delimiter", "\t").\
        option("header", "false").\
        load(mixed_wkt_geometry_input_location)
    polygon_wkt_df.createOrReplaceTempView("polygontable")

    polygon_df = self.spark.sql(
        "select ST_GeomFromWKT(polygontable._c0) as usacounty from polygontable")
    polygon_rdd = Adapter.toSpatialRdd(polygon_df, "usacounty")
    polygon_rdd.analyze()

    # buffer each polygon by 0.2 degrees, partition both RDDs the same way,
    # and index the point side before running the distance join
    circle_rdd = CircleRDD(polygon_rdd, 0.2)

    point_rdd.spatialPartitioning(GridType.QUADTREE)
    circle_rdd.spatialPartitioning(point_rdd.getPartitioner())

    point_rdd.buildIndex(IndexType.QUADTREE, True)

    join_result_pair_rdd = JoinQuery.\
        DistanceJoinQueryFlat(point_rdd, circle_rdd, True, True)

    join_result_df = Adapter.toDf(join_result_pair_rdd, self.spark)
    join_result_df.printSchema()
    join_result_df.show()
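# A hedged sketch, not part of the original suite: roughly the same pairing can
# be written in Sedona SQL with ST_Distance, reusing `point_df` and `polygon_df`
# from the test above. View names here are illustrative.
def _distance_join_in_sql_sketch(self, point_df, polygon_df):
    point_df.createOrReplaceTempView("points")
    polygon_df.createOrReplaceTempView("polygons")
    # matches points within 0.2 degrees of each county polygon
    return self.spark.sql(
        "select p.usacounty, q.arealandmark from polygons p, points q "
        "where ST_Distance(p.usacounty, q.arealandmark) <= 0.2")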
def test_read_mixed_wkt_geometries_into_spatial_rdd(self):
    df = self.spark.read.format("csv").\
        option("delimiter", "\t").\
        option("header", "false").\
        load(mixed_wkt_geometry_input_location)
    df.show()
    df.createOrReplaceTempView("inputtable")

    spatial_df = self.spark.sql(
        "select ST_GeomFromWKT(inputtable._c0) as usacounty from inputtable")
    spatial_df.show()
    spatial_df.printSchema()

    spatial_rdd = Adapter.toSpatialRdd(spatial_df)
    spatial_rdd.analyze()

    # round-trip back to a DataFrame; without field names only the
    # geometry column survives the conversion
    converted_df = Adapter.toDf(spatial_rdd, self.spark)
    converted_df.show()
    assert len(converted_df.columns) == 1
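# A hedged sketch: once analyze() has run, the RDD's boundaryEnvelope can seed
# an RDD-level filter via RangeQuery (assumes
# `from sedona.core.spatialOperator import RangeQuery`).
def _range_query_sketch(self, spatial_rdd):
    window = spatial_rdd.boundaryEnvelope  # query the full extent
    result_rdd = RangeQuery.SpatialRangeQuery(spatial_rdd, window, True, False)
    return result_rdd.count()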
def test_read_csv_point_into_spatial_rdd(self):
    df = self.spark.read.\
        format("csv").\
        option("delimiter", "\t").\
        option("header", "false").\
        load(area_lm_point_input_location)
    df.show()
    df.createOrReplaceTempView("inputtable")

    # ST_PointFromText parses "x,y" strings with the given delimiter
    spatial_df = self.spark.sql(
        "select ST_PointFromText(inputtable._c0, ',') as arealandmark from inputtable")
    spatial_df.show()
    spatial_df.printSchema()

    spatial_rdd = Adapter.toSpatialRdd(spatial_df, "arealandmark")
    spatial_rdd.analyze()

    Adapter.toDf(spatial_rdd, self.spark).show()
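# A hedged sketch: the converted point RDD also supports RDD-level kNN
# (assumes `from sedona.core.spatialOperator import KNNQuery` and
# `from shapely.geometry import Point`; the query point is made up).
def _knn_query_sketch(self, spatial_rdd):
    # five nearest landmarks to the (hypothetical) query point, no index
    return KNNQuery.SpatialKnnQuery(spatial_rdd, Point(-84.01, 34.01), 5, False)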
def test_to_spatial_rdd_df_geom_column_id(self):
    df = self.spark.read.\
        format("csv").\
        option("delimiter", "\t").\
        option("header", "false").\
        load(mixed_wkt_geometry_input_location)

    # keep only the WKT string and the county name
    # (uses `from pyspark.sql.functions import col`)
    df_shorter = df.select(
        col("_c0").alias("geom"), col("_c6").alias("county_name"))
    df_shorter.createOrReplaceTempView("county_data")

    spatial_df = self.spark.sql(
        "SELECT ST_GeomFromWKT(geom) as geom, county_name FROM county_data")
    spatial_df.show()

    # list form: the geometry column plus attribute columns to carry along
    spatial_rdd = Adapter.toSpatialRdd(spatial_df, ["geom", "county_name"])
    spatial_rdd.analyze()

    assert spatial_rdd.approximateTotalCount == 100
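# A hedged sketch: converting back with an explicit field list should keep the
# carried attribute, assuming this Sedona version exposes the
# Adapter.toDf(spatialRDD, fieldNames, sparkSession) overload.
def _round_trip_with_fields_sketch(self, spatial_rdd):
    df = Adapter.toDf(spatial_rdd, ["county_name"], self.spark)
    df.printSchema()  # expected: geometry plus county_name
    return df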
def get_bdy_rdd(spark, bdy):
    # load boundaries from Postgres
    # (assumes `import pyspark.sql.functions as f` at module level)
    sql = """SELECT {}, name as {}, st_astext(geom) as wkt_geom
             FROM admin_bdys_202008.{}_analysis""".format(
        bdy["id_field"], bdy["name_field"], bdy["name"])
    bdy_df = get_dataframe_from_postgres(spark, sql)

    # create geometries from WKT strings in a new DataFrame
    bdy_df2 = bdy_df \
        .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
        .drop("wkt_geom")

    # create the spatial RDD and compute its statistics
    output_rdd = Adapter.toSpatialRdd(bdy_df2, "geom")
    output_rdd.analyze()

    bdy_df2.unpersist()
    bdy_df.unpersist()

    return output_rdd
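# Hedged usage sketch for get_bdy_rdd: it expects a dict naming one boundary
# table and its id/name columns. The values below are illustrative only, and
# `spark` is assumed to be an existing Sedona-enabled SparkSession.
bdy = {"name": "locality_bdys",
       "id_field": "locality_pid",
       "name_field": "locality_name"}
bdy_rdd = get_bdy_rdd(spark, bdy)
print(bdy_rdd.approximateTotalCount)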