def test_point_serializer(self):
    """Round-trip two Point columns through a temp view and check st_distance."""
    rows = [[1, Point(21.0, 56.0), Point(21.0, 59.0)]]
    point_schema = t.StructType([
        t.StructField("id", IntegerType(), True),
        t.StructField("geom_from", GeometryType(), True),
        t.StructField("geom_to", GeometryType(), True),
    ])
    self.spark.createDataFrame(rows, point_schema).createOrReplaceTempView("points")
    # Points differ only in latitude: 59.0 - 56.0 = 3.0.
    distance = self.spark.sql(
        "select st_distance(geom_from, geom_to) from points"
    ).collect()[0][0]
    assert distance == 3.0
def test_multipolygon_serialization(self):
    """Serialize a MultiPolygon (one polygon with a hole plus a unit square)
    and verify st_area over a temp view.

    Fixes: the original referenced a bare ``spark`` instead of
    ``self.spark`` (every sibling test uses the session on ``self``),
    and shadowed the builtin ``int`` with the interior-ring variable.
    """
    exterior = [(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]
    interior = [(1, 1), (1, 1.5), (1.5, 1.5), (1.5, 1), (1, 1)]
    polygons = [
        Polygon(exterior, [interior]),
        Polygon([[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]])
    ]
    multipolygon = MultiPolygon(polygons)
    data = [[1, multipolygon]]
    schema = t.StructType([
        t.StructField("id", IntegerType(), True),
        t.StructField("geom", GeometryType(), True),
    ])
    self.spark.createDataFrame(data, schema).createOrReplaceTempView("polygon")
    # 2x2 square (4.0) minus 0.5x0.5 hole (0.25) plus unit square (1.0) = 4.75
    area = self.spark.sql("select st_area(geom) from polygon").collect()[0][0]
    assert area == 4.75
def test_point_rdd(self):
    """Build DataFrames from a PointRDD, with and without an explicit schema."""
    spatial_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=crs_test_point,
        Offset=0,
        splitter=splitter,
        carryInputData=True,
        partitions=numPartitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    # Flatten each element into [geometry, *tab-separated user-data fields].
    records = spatial_rdd.rawSpatialRDD.map(
        lambda x: [x.geom, *x.getUserData().split("\t")]
    )
    self.spark.createDataFrame(records).show()
    schema = StructType([
        StructField("geom", GeometryType()),
        StructField("name", StringType())
    ])
    df_with_schema = self.spark.createDataFrame(records, schema)
    df_with_schema.show()
    first_geom = df_with_schema.take(1)[0][0]
    assert first_geom.wkt == "POINT (32.324142 -88.331492)"
def to_bytes(cls, geom: BaseGeometry) -> List[int]:
    """Serialize a shapely geometry using the parser registered for its type.

    Also attaches ``GeometryType`` as the geometry's Spark UDT before
    serializing.

    :param geom: shapely geometry instance to serialize.
    :return: serialized bytes produced by the matching parser.
    :raises KeyError: when no parser is registered for the geometry type.
    """
    from geo_pyspark.sql.types import GeometryType

    # Parser registry is keyed by the lowercase shapely class name.
    geom_name = type(geom).__name__.lower()
    try:
        parser = PARSERS[geom_name]
    except KeyError:
        raise KeyError(f"Parser for geometry {geom_name} is not available")
    geom.__UDT__ = GeometryType()
    return parser.serialize(geom, BinaryBuffer())
def test_multipoint_serializer(self):
    """A MultiPoint must survive a DataFrame round trip unchanged."""
    multipoint = MultiPoint([[21.0, 56.0], [21.0, 57.0]])
    schema = t.StructType([
        t.StructField("id", IntegerType(), True),
        t.StructField("geom", GeometryType(), True)
    ])
    df = self.spark.createDataFrame([[1, multipoint]], schema)
    m_point_out = df.collect()[0][1]
    assert m_point_out == multipoint
def test_list_to_rdd_and_df(self):
    """Parallelize point rows into an RDD and build a DataFrame with an
    explicit schema.

    Fixes: the original constructed ``schema`` but never used it —
    ``createDataFrame`` fell back to inference and the schema was dead
    code. The schema is now applied, which is what the test clearly
    intended to exercise.
    """
    point_data = [
        [Point(21, 52.0), "1", 1],
        [Point(22, 52.0), "2", 2],
        [Point(23.0, 52), "3", 3],
        [Point(23, 54), "4", 4],
        [Point(24.0, 56.0), "5", 5]
    ]
    schema = StructType([
        StructField("geom", GeometryType(), False),
        StructField("id_1", StringType(), False),
        StructField("id_2", IntegerType(), False),
    ])
    rdd_data = self.spark.sparkContext.parallelize(point_data)
    df = self.spark.createDataFrame(rdd_data, schema)
    df.show()
    df.printSchema()
def test_multilinestring_serialization(self):
    """st_length over a serialized MultiLineString of two unit segments."""
    multilinestring = MultiLineString([[[0, 1], [1, 1]], [[2, 2], [3, 2]]])
    schema = t.StructType([
        t.StructField("id", IntegerType(), True),
        t.StructField("geom", GeometryType(), True)
    ])
    df = self.spark.createDataFrame([[1, multilinestring]], schema)
    df.createOrReplaceTempView("multilinestring")
    # Two horizontal segments of length 1 each.
    length = self.spark.sql(
        "select st_length(geom) from multilinestring"
    ).collect()[0][0]
    assert length == 2.0
def test_linestring_serialization(self):
    """st_length over a serialized three-vertex LineString."""
    linestring = LineString([(0.0, 1.0), (1, 1), (12.0, 1.0)])
    schema = t.StructType([
        t.StructField("id", IntegerType(), True),
        t.StructField("geom", GeometryType(), True)
    ])
    df = self.spark.createDataFrame([[1, linestring]], schema)
    df.createOrReplaceTempView("line")
    # Horizontal line from x=0 to x=12 at y=1.
    length = self.spark.sql(
        "select st_length(geom) from line"
    ).collect()[0][0]
    assert length == 12.0
def test_polygon_serialization(self):
    """st_area over a serialized polygon with one interior hole.

    Fixes: the original shadowed the builtin ``int`` with the
    interior-ring variable.
    """
    exterior = [(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]
    interior = [(1, 1), (1, 1.5), (1.5, 1.5), (1.5, 1), (1, 1)]
    polygon = Polygon(exterior, [interior])
    data = [[1, polygon]]
    schema = t.StructType([
        t.StructField("id", IntegerType(), True),
        t.StructField("geom", GeometryType(), True)
    ])
    self.spark.createDataFrame(data, schema).createOrReplaceTempView("polygon")
    # 2x2 square (4.0) minus 0.5x0.5 hole (0.25) = 3.75
    area = self.spark.sql(
        "select st_area(geom) from polygon"
    ).collect()[0][0]
    assert area == 3.75
def assign_udt_shapely_objects(geoms: List[type(BaseGeometry)]) -> bool:
    """Attach the Spark ``GeometryType`` UDT to each shapely geometry class.

    :param geoms: shapely geometry classes to register the UDT on.
    :return: always ``True``.
    """
    from geo_pyspark.sql.types import GeometryType

    for geometry_class in geoms:
        geometry_class.__UDT__ = GeometryType()
    return True