def test_saving_to_disc_index_linestring(self, remove_spatial_rdd_disc_dir):
    from tests.properties.point_properties import input_location, offset, splitter, num_partitions

    point_rdd = PointRDD(
        self.sc,
        input_location,
        offset,
        splitter,
        True,
        num_partitions,
        StorageLevel.MEMORY_ONLY
    )
    point_rdd.buildIndex(IndexType.RTREE, False)
    point_rdd.indexedRawRDD.saveAsObjectFile(os.path.join(disc_object_location, "point_index"))
def test_spatial_knn_correctness(self):
    point_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                         StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    result_no_index = KNNQuery.SpatialKnnQuery(point_rdd, query_point, top_k, False)
    point_rdd.buildIndex(IndexType.RTREE, False)
    result_with_index = KNNQuery.SpatialKnnQuery(point_rdd, query_point, top_k, True)

    sorted_result_no_index = sorted(
        result_no_index,
        key=lambda geo_data: distance_sorting_functions(geo_data, query_point))

    sorted_result_with_index = sorted(
        result_with_index,
        key=lambda geo_data: distance_sorting_functions(geo_data, query_point))

    difference = 0
    for x in range(top_k):
        difference += sorted_result_no_index[x].geom.distance(sorted_result_with_index[x].geom)

    assert difference == 0
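# distance_sorting_functions() is used as a sort key above but is not defined in
# this snippet. A minimal sketch of what such a helper might look like, assuming
# each KNN result exposes a Shapely geometry via .geom (as the distance check in
# the test above does) and that planar distance is sufficient; this is an
# illustrative assumption, not necessarily the project's actual helper:
def distance_sorting_functions(geo_data, query_point):
    # distance from the record's geometry to the KNN query point
    return geo_data.geom.distance(query_point)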
def test_spatial_knn_query_using_index(self):
    point_rdd = PointRDD(self.sc, input_location, offset, splitter, False)
    point_rdd.buildIndex(IndexType.RTREE, False)

    for i in range(self.loop_times):
        result = KNNQuery.SpatialKnnQuery(point_rdd, self.query_point, self.top_k, False)
        # the query must return at least one neighbour carrying its user data
        assert len(result) > 0
        assert result[0].getUserData() is not None
def test_build_index_without_set_grid(self):
    spatial_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=input_location,
                           Offset=offset,
                           splitter=splitter,
                           carryInputData=True,
                           partitions=num_partitions,
                           newLevel=StorageLevel.MEMORY_ONLY)
    spatial_rdd.buildIndex(IndexType.RTREE, False)
def test_knn_query_with_index(self):
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
    object_rdd.buildIndex(point_rdd_index_type, False)

    for i in range(each_query_loop_times):
        result = KNNQuery.SpatialKnnQuery(object_rdd, knn_query_point, 1000, True)
def test_spatial_knn_query_using_index(self):
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)
    object_rdd.buildIndex(point_rdd_index_type, False)
    object_rdd.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        result = KNNQuery.SpatialKnnQuery(object_rdd, knn_query_point, 1000, True)
def test_spatial_knn_query_using_index(self):
    point_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                         StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    point_rdd.buildIndex(IndexType.RTREE, False)

    for i in range(loop_times):
        result = KNNQuery.SpatialKnnQuery(point_rdd, query_point, top_k, False)
        assert len(result) > 0
        assert result[0].getUserData() is not None
def test_range_query_using_index(self):
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
    object_rdd.buildIndex(point_rdd_index_type, False)

    for i in range(each_query_loop_times):
        result_size = RangeQuery.SpatialRangeQuery(object_rdd, range_query_window, False, True).count()
def test_empty_constructor(self):
    spatial_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=input_location,
                           Offset=offset,
                           splitter=splitter,
                           carryInputData=True,
                           partitions=num_partitions,
                           newLevel=StorageLevel.MEMORY_ONLY)
    spatial_rdd.buildIndex(IndexType.RTREE, False)

    spatial_rdd_copy = PointRDD()
    spatial_rdd_copy.rawJvmSpatialRDD = spatial_rdd.rawJvmSpatialRDD
    spatial_rdd_copy.analyze()
def test_spatial_range_query_using_index(self):
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)
    object_rdd.buildIndex(point_rdd_index_type, False)
    object_rdd.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY)
    assert object_rdd.indexedRawRDD.is_cached

    for _ in range(each_query_loop_times):
        result_size = RangeQuery.SpatialRangeQuery(object_rdd, range_query_window, False, True).count()
def test_crs_transformed_spatial_range_query_using_index(self):
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False,
                          newLevel=StorageLevel.DISK_ONLY,
                          sourceEpsgCRSCode="epsg:4326",
                          targetEpsgCode="epsg:3005")
    object_rdd.buildIndex(point_rdd_index_type, False)

    for i in range(each_query_loop_times):
        result_size = RangeQuery.SpatialRangeQuery(object_rdd, range_query_window, False, True).count()
def test_spatial_range_query_using_index(self):
    spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, False)
    spatial_rdd.buildIndex(IndexType.RTREE, False)

    for i in range(self.loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            spatial_rdd, self.query_envelope, False, False).count()
        assert result_size == 2830

    assert RangeQuery.SpatialRangeQuery(
        spatial_rdd, self.query_envelope, False, False
    ).take(10)[1].getUserData() is not None
def test_spatial_range_query_using_index(self):
    spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                           StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    spatial_rdd.buildIndex(IndexType.RTREE, False)

    for i in range(loop_times):
        result_size = RangeQuery.SpatialRangeQuery(spatial_rdd, query_envelope, False, False).count()
        assert result_size == 3127

    assert RangeQuery.SpatialRangeQuery(
        spatial_rdd, query_envelope, False, False
    ).take(10)[1].getUserData() is not None
def test_distance_join_query_using_index(self):
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
    query_window_rdd = CircleRDD(object_rdd, 0.1)

    object_rdd.analyze()
    object_rdd.spatialPartitioning(GridType.QUADTREE)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())
    object_rdd.buildIndex(IndexType.RTREE, True)

    for i in range(each_query_loop_times):
        result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, True, True).count()
def test_spatial_join_using_index(self):
    query_window = PolygonRDD(self.sc, polygon_rdd_input_location, polygon_rdd_start_offset,
                              polygon_rdd_end_offset, polygon_rdd_splitter, True)
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)

    object_rdd.analyze()
    object_rdd.spatialPartitioning(join_query_partitioning_type)
    query_window.spatialPartitioning(object_rdd.getPartitioner())
    object_rdd.buildIndex(point_rdd_index_type, True)

    for i in range(each_query_loop_times):
        result_size = JoinQuery.SpatialJoinQuery(object_rdd, query_window, True, False).count()
def test_spatial_join_using_index(self):
    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location, polygon_rdd_start_offset,
                                  polygon_rdd_end_offset, polygon_rdd_splitter, True)
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)

    object_rdd.spatialPartitioning(join_query_partitioning_type)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())
    object_rdd.buildIndex(point_rdd_index_type, True)

    object_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
    query_window_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        result_size = JoinQuery.SpatialJoinQuery(object_rdd, query_window_rdd, True, False).count()
def test_distance_join_using_index(self):
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)
    query_window_rdd = CircleRDD(object_rdd, 0.1)

    object_rdd.spatialPartitioning(GridType.QUADTREE)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())
    object_rdd.buildIndex(IndexType.RTREE, True)

    object_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
    query_window_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    assert object_rdd.indexedRDD.is_cached
    assert query_window_rdd.spatialPartitionedRDD.is_cached

    for _ in range(each_query_loop_times):
        result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, True, True).count()
def test_spatial_join_query_with_polygon_rdd_using_index(self):
    query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter, True,
                           num_partitions, StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                           num_partitions, StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")

    spatial_rdd.spatialPartitioning(grid_type)
    spatial_rdd.buildIndex(IndexType.RTREE, True)
    query_rdd.spatialPartitioning(spatial_rdd.grids)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    assert result[1][0].getUserData() is not None
    for data in result:
        if len(data[1]) != 0:
            for right_data in data[1]:
                assert right_data.getUserData() is not None
def test_indexed_rdd_assignment(self):
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True)
    query_window_rdd = CircleRDD(object_rdd, 0.1)
    object_rdd.analyze()
    object_rdd.spatialPartitioning(GridType.QUADTREE)
    object_rdd.buildIndex(IndexType.QUADTREE, True)

    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())
    object_rdd.buildIndex(IndexType.RTREE, False)

    object_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
    query_window_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    query_window_rdd.jvmSpatialPartitionedRDD.count()
    object_rdd.indexedRDD.count()

    import time
    start = time.time()
    for _ in range(each_query_loop_times):
        result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, True, True).count()
    diff = time.time() - start

    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True)
    query_window_rdd = CircleRDD(object_rdd, 0.1)
    object_rdd.analyze()
    object_rdd.spatialPartitioning(GridType.QUADTREE)
    object_rdd.buildIndex(IndexType.QUADTREE, True)

    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())
    object_rdd.buildIndex(IndexType.RTREE, False)

    start1 = time.time()
    for _ in range(each_query_loop_times):
        result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, True, True).count()
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # copy gnaf tables from Postgres to a CSV file - a one off
    #   - export required fields only and no header
    # ----------------------------------------------------------

    pg_conn = pg_pool.getconn()
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, locality_pid, locality_name, postcode, state
                 FROM gnaf_202008.{}
             ) TO STDOUT WITH CSV"""

    # address principals
    with open(gnaf_csv_file_path, 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)

    # append address aliases
    with open(gnaf_csv_file_path, 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_pool.putconn(pg_conn)

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() - start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    # upload Apache Sedona JARs
    upload_jars()

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Register Apache Sedona UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (no effect on the "small" spatial join query in this script. Will improve bigger queries)
    # spark.conf.set("geospark.global.index", "true")
    # spark.conf.set("geospark.global.indextype", "rtree")
    # spark.conf.set("geospark.join.gridtype", "kdbtree")

    sc = spark.sparkContext

    logger.info("\t - PySpark {} session initiated: {}".format(sc.version, datetime.now() - start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------

    offset = 0  # The point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path), offset,
                         FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t - GNAF RDD created: {}".format(datetime.now() - start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------

    for bdy in bdy_list:
        bdy_tag(spark, point_rdd, bdy)

    # point_rdd.unpersist()  # no such method on a SpatialRDD

    # ----------------------------------------------------------
    # merge boundary tag dataframes with GNAF records
    #   - required because spatial joins are INNER JOIN only,
    #     need to add untagged GNAF points
    # ----------------------------------------------------------

    start_time = datetime.now()

    # create gnaf dataframe and SQL view
    gnaf_df = spark.read \
        .option("header", False) \
        .option("inferSchema", True) \
        .csv(gnaf_csv_file_path) \
        .drop("_C0") \
        .drop("_C1") \
        .withColumnRenamed("_C2", "gnaf_pid") \
        .withColumnRenamed("_C3", "locality_pid") \
        .withColumnRenamed("_C4", "locality_name") \
        .withColumnRenamed("_C5", "postcode") \
        .withColumnRenamed("_C6", "state")
    # gnaf_df.printSchema()
    # gnaf_df.show(10, False)

    gnaf_df.createOrReplaceTempView("pnt")

    # add bdy tags, one bdy type at a time
    for bdy in bdy_list:
        gnaf_df = join_bdy_tags(spark, bdy)
        gnaf_df.createOrReplaceTempView("pnt")

    # # add point geoms for output to Postgres - in the PostGIS specific EWKT format
    # final_df = gnaf_df.withColumn("geom", f.expr("concat('SRID=4326;POINT (', longitude, ' ', latitude, ')')")) \
    #     .drop("longitude") \
    #     .drop("latitude")
    #
    # # final_df.printSchema()
    # # final_df.show(10, False)

    logger.info("\t - Boundary tags merged: {}".format(datetime.now() - start_time))

    # output result to Postgres
    export_to_postgres(gnaf_df, "testing2.gnaf_with_bdy_tags",
                       os.path.join(output_path, "temp_gnaf_with_bdy_tags"), True)

    # cleanup
    spark.stop()

    # delete intermediate bdy tag files and GNAF csv file
    for bdy in bdy_list:
        shutil.rmtree(os.path.join(output_path, "gnaf_with_{}".format(bdy["name"])))
    os.remove(gnaf_csv_file_path)
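# export_to_postgres() and the bdy_tag()/join_bdy_tags() helpers are defined
# elsewhere in this project and are not shown here. A minimal sketch of one way
# export_to_postgres() could work, given the arguments used above (dataframe,
# target table, temporary export path, overwrite flag) and reusing this script's
# pg_pool connection pool; it is an illustrative assumption, not the project's
# actual implementation, and it assumes the target Postgres table already exists:
import glob

def export_to_postgres(df, target_table, temp_export_path, overwrite):
    # dump the dataframe to CSV files, then bulk-load them into Postgres with COPY
    df.write.csv(temp_export_path, mode="overwrite", header=False)

    pg_conn = pg_pool.getconn()
    pg_cur = pg_conn.cursor()

    if overwrite:
        pg_cur.execute("TRUNCATE TABLE {}".format(target_table))

    # load each Spark output part file
    for file_name in glob.glob(os.path.join(temp_export_path, "part-*.csv")):
        with open(file_name) as csv_file:
            pg_cur.copy_expert("COPY {} FROM STDIN WITH CSV".format(target_table), csv_file)

    pg_conn.commit()
    pg_cur.close()
    pg_pool.putconn(pg_conn)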