Пример #1
0
 def test_saving_to_disc_index_linestring(self, remove_spatial_rdd_disc_dir):
     from tests.properties.point_properties import input_location, offset, splitter, num_partitions
     point_rdd = PointRDD(
         self.sc, input_location, offset, splitter, True, num_partitions, StorageLevel.MEMORY_ONLY
     )
     point_rdd.buildIndex(IndexType.RTREE, False)
     point_rdd.indexedRawRDD.saveAsObjectFile(os.path.join(disc_object_location, "point_index"))
    def test_spatial_knn_correctness(self):
        point_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                             StorageLevel.MEMORY_ONLY, "epsg:4326",
                             "epsg:3005")
        result_no_index = KNNQuery.SpatialKnnQuery(point_rdd, query_point,
                                                   top_k, False)
        point_rdd.buildIndex(IndexType.RTREE, False)
        result_with_index = KNNQuery.SpatialKnnQuery(point_rdd, query_point,
                                                     top_k, True)

        sorted_result_no_index = sorted(
            result_no_index,
            key=lambda geo_data: distance_sorting_functions(
                geo_data, query_point))

        sorted_result_with_index = sorted(
            result_with_index,
            key=lambda geo_data: distance_sorting_functions(
                geo_data, query_point))

        difference = 0
        for x in range(top_k):
            difference += sorted_result_no_index[x].geom.distance(
                sorted_result_with_index[x].geom)

        assert difference == 0
Пример #3
0
    def test_spatial_knn_query_using_index(self):
        point_rdd = PointRDD(self.sc, input_location, offset, splitter, False)
        point_rdd.buildIndex(IndexType.RTREE, False)

        for i in range(self.loop_times):
            result = KNNQuery.SpatialKnnQuery(point_rdd, self.query_point, self.top_k, False)
            assert result.__len__() > -1
            assert result[0].getUserData() is not None
 def test_build_index_without_set_grid(self):
     spatial_rdd = PointRDD(sparkContext=self.sc,
                            InputLocation=input_location,
                            Offset=offset,
                            splitter=splitter,
                            carryInputData=True,
                            partitions=num_partitions,
                            newLevel=StorageLevel.MEMORY_ONLY)
     spatial_rdd.buildIndex(IndexType.RTREE, False)
Пример #5
0
 def test_knn_query_with_index(self):
     object_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=point_rdd_input_location,
                           Offset=point_rdd_offset,
                           splitter=point_rdd_splitter,
                           carryInputData=False)
     object_rdd.buildIndex(point_rdd_index_type, False)
     for i in range(each_query_loop_times):
         result = KNNQuery.SpatialKnnQuery(object_rdd, knn_query_point,
                                           1000, True)
Пример #6
0
    def test_spatial_knn_query_using_index(self):
        object_rdd = PointRDD(self.sc, point_rdd_input_location,
                              point_rdd_offset, point_rdd_splitter, True,
                              StorageLevel.MEMORY_ONLY)
        object_rdd.buildIndex(point_rdd_index_type, False)
        object_rdd.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY)

        for _ in range(each_query_loop_times):
            result = KNNQuery.SpatialKnnQuery(object_rdd, knn_query_point,
                                              1000, True)
    def test_spatial_knn_query_using_index(self):
        point_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                             StorageLevel.MEMORY_ONLY, "epsg:4326",
                             "epsg:3005")
        point_rdd.buildIndex(IndexType.RTREE, False)

        for i in range(loop_times):
            result = KNNQuery.SpatialKnnQuery(point_rdd, query_point, top_k,
                                              False)
            assert result.__len__() > 0
            assert result[0].getUserData() is not None
Пример #8
0
 def test_range_query_using_index(self):
     object_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=point_rdd_input_location,
                           Offset=point_rdd_offset,
                           splitter=point_rdd_splitter,
                           carryInputData=False)
     object_rdd.buildIndex(point_rdd_index_type, False)
     for i in range(each_query_loop_times):
         result_size = RangeQuery.SpatialRangeQuery(object_rdd,
                                                    range_query_window,
                                                    False, True).count
 def test_empty_constructor(self):
     spatial_rdd = PointRDD(sparkContext=self.sc,
                            InputLocation=input_location,
                            Offset=offset,
                            splitter=splitter,
                            carryInputData=True,
                            partitions=num_partitions,
                            newLevel=StorageLevel.MEMORY_ONLY)
     spatial_rdd.buildIndex(IndexType.RTREE, False)
     spatial_rdd_copy = PointRDD()
     spatial_rdd_copy.rawJvmSpatialRDD = spatial_rdd.rawJvmSpatialRDD
     spatial_rdd_copy.analyze()
Пример #10
0
    def test_spatial_range_query_using_index(self):
        object_rdd = PointRDD(self.sc, point_rdd_input_location,
                              point_rdd_offset, point_rdd_splitter, True,
                              StorageLevel.MEMORY_ONLY)
        object_rdd.buildIndex(point_rdd_index_type, False)
        object_rdd.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY)
        assert object_rdd.indexedRawRDD.is_cached

        for _ in range(each_query_loop_times):
            result_size = RangeQuery.SpatialRangeQuery(object_rdd,
                                                       range_query_window,
                                                       False, True).count
Пример #11
0
 def test_crs_tranformed_spatial_range_query_using_index(self):
     object_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=point_rdd_input_location,
                           Offset=point_rdd_offset,
                           splitter=point_rdd_splitter,
                           carryInputData=False,
                           newLevel=StorageLevel.DISK_ONLY,
                           sourceEpsgCRSCode="epsg:4326",
                           targetEpsgCode="epsg:3005")
     object_rdd.buildIndex(point_rdd_index_type, False)
     for i in range(each_query_loop_times):
         result_size = RangeQuery.SpatialRangeQuery(object_rdd,
                                                    range_query_window,
                                                    False, True).count
Пример #12
0
    def test_spatial_range_query_using_index(self):
        spatial_rdd = PointRDD(self.sc, input_location, offset, splitter,
                               False)

        spatial_rdd.buildIndex(IndexType.RTREE, False)

        for i in range(self.loop_times):
            result_size = RangeQuery.\
                SpatialRangeQuery(spatial_rdd, self.query_envelope, False, False)\
                .count()
            assert result_size == 2830
        assert RangeQuery.SpatialRangeQuery(
            spatial_rdd, self.query_envelope, False, False).take(10)[1].\
                   getUserData() is not None
    def test_spatial_range_query_using_index(self):
        spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                               StorageLevel.MEMORY_ONLY, "epsg:4326",
                               "epsg:3005")
        spatial_rdd.buildIndex(IndexType.RTREE, False)

        for i in range(loop_times):
            result_size = RangeQuery.SpatialRangeQuery(spatial_rdd,
                                                       query_envelope, False,
                                                       False).count()
            assert result_size == 3127

        assert RangeQuery.SpatialRangeQuery(
            spatial_rdd, query_envelope, False,
            False).take(10)[1].getUserData() is not None
Пример #14
0
    def test_distance_join_query_using_index(self):
        object_rdd = PointRDD(sparkContext=self.sc,
                              InputLocation=point_rdd_input_location,
                              Offset=point_rdd_offset,
                              splitter=point_rdd_splitter,
                              carryInputData=False)
        query_window_rdd = CircleRDD(object_rdd, 0.1)
        object_rdd.analyze()
        object_rdd.spatialPartitioning(GridType.QUADTREE)
        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.buildIndex(IndexType.RTREE, True)

        for i in range(each_query_loop_times):
            result_size = JoinQuery.DistanceJoinQuery(object_rdd,
                                                      query_window_rdd, True,
                                                      True).count
Пример #15
0
    def test_spatial_join_using_index(self):
        query_window = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset,
                                  polygon_rdd_end_offset, polygon_rdd_splitter,
                                  True)
        object_rdd = PointRDD(sparkContext=self.sc,
                              InputLocation=point_rdd_input_location,
                              Offset=point_rdd_offset,
                              splitter=point_rdd_splitter,
                              carryInputData=False)
        object_rdd.analyze()
        object_rdd.spatialPartitioning(join_query_partitionin_type)
        query_window.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.buildIndex(point_rdd_index_type, True)

        for i in range(each_query_loop_times):
            result_size = JoinQuery.SpatialJoinQuery(object_rdd, query_window,
                                                     True, False).count()
Пример #16
0
    def test_spatial_join_using_index(self):
        query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                      polygon_rdd_start_offset,
                                      polygon_rdd_end_offset,
                                      polygon_rdd_splitter, True)
        object_rdd = PointRDD(self.sc, point_rdd_input_location,
                              point_rdd_offset, point_rdd_splitter, True,
                              StorageLevel.MEMORY_ONLY)

        object_rdd.spatialPartitioning(join_query_partitioning_type)
        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.buildIndex(point_rdd_index_type, True)

        object_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
        query_window_rdd.jvmSpatialPartitionedRDD.persist(
            StorageLevel.MEMORY_ONLY)

        for _ in range(each_query_loop_times):
            result_size = JoinQuery.SpatialJoinQuery(object_rdd,
                                                     query_window_rdd, True,
                                                     False).count()
Пример #17
0
    def test_distance_join_using_index(self):
        object_rdd = PointRDD(self.sc, point_rdd_input_location,
                              point_rdd_offset, point_rdd_splitter, True,
                              StorageLevel.MEMORY_ONLY)

        query_window_rdd = CircleRDD(object_rdd, 0.1)

        object_rdd.spatialPartitioning(GridType.QUADTREE)
        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.buildIndex(IndexType.RTREE, True)

        object_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
        query_window_rdd.spatialPartitionedRDD.persist(
            StorageLevel.MEMORY_ONLY)
        assert object_rdd.indexedRDD.is_cached
        assert query_window_rdd.spatialPartitionedRDD.is_cached

        for _ in range(each_query_loop_times):
            result_size = JoinQuery.DistanceJoinQuery(object_rdd,
                                                      query_window_rdd, True,
                                                      True).count()
    def test_spatial_join_query_with_polygon_rdd_using_index(self):
        query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter,
                               True, num_partitions, StorageLevel.MEMORY_ONLY,
                               "epsg:4326", "epsg:3005")

        spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                               num_partitions, StorageLevel.MEMORY_ONLY,
                               "epsg:4326", "epsg:3005")

        spatial_rdd.spatialPartitioning(grid_type)

        spatial_rdd.buildIndex(IndexType.RTREE, True)

        query_rdd.spatialPartitioning(spatial_rdd.grids)

        result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False,
                                            True).collect()

        assert result[1][0].getUserData() is not None

        for data in result:
            if data[1].__len__() != 0:
                for right_data in data[1]:
                    assert right_data.getUserData() is not None
Пример #19
0
    def test_indexed_rdd_assignment(self):
        object_rdd = PointRDD(self.sc, point_rdd_input_location,
                              point_rdd_offset, point_rdd_splitter, True)
        query_window_rdd = CircleRDD(object_rdd, 0.1)
        object_rdd.analyze()
        object_rdd.spatialPartitioning(GridType.QUADTREE)
        object_rdd.buildIndex(IndexType.QUADTREE, True)

        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.buildIndex(IndexType.RTREE, False)

        object_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
        query_window_rdd.jvmSpatialPartitionedRDD.persist(
            StorageLevel.MEMORY_ONLY)
        query_window_rdd.jvmSpatialPartitionedRDD.count()
        object_rdd.indexedRDD.count()

        import time

        start = time.time()
        for _ in range(each_query_loop_times):
            result_size = JoinQuery.DistanceJoinQuery(object_rdd,
                                                      query_window_rdd, True,
                                                      True).count()
        diff = time.time() - start

        object_rdd = PointRDD(self.sc, point_rdd_input_location,
                              point_rdd_offset, point_rdd_splitter, True)
        query_window_rdd = CircleRDD(object_rdd, 0.1)

        object_rdd.analyze()
        object_rdd.spatialPartitioning(GridType.QUADTREE)
        object_rdd.buildIndex(IndexType.QUADTREE, True)

        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.buildIndex(IndexType.RTREE, False)

        start1 = time.time()
        for _ in range(each_query_loop_times):
            result_size = JoinQuery.DistanceJoinQuery(object_rdd,
                                                      query_window_rdd, True,
                                                      True).count()
Пример #20
0
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # copy gnaf tables from Postgres to a CSV file - a one off
    #   - export required fields only and no header
    # ----------------------------------------------------------

    pg_conn = pg_pool.getconn()
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, locality_pid, locality_name, postcode, state
                 FROM gnaf_202008.{}
             ) TO STDOUT WITH CSV"""

    # address principals
    with open(gnaf_csv_file_path, 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)

    # append address aliases
    with open(gnaf_csv_file_path, 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_pool.putconn(pg_conn)

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() -
                                                              start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    # upload Apache Sedona JARs
    upload_jars()

    spark = (SparkSession.builder.master("local[*]").appName("query").config(
        "spark.sql.session.timeZone",
        "UTC").config("spark.sql.debug.maxToStringFields", 100).config(
            "spark.serializer", KryoSerializer.getName).config(
                "spark.kryo.registrator",
                GeoSparkKryoRegistrator.getName).config(
                    "spark.cores.max", num_processors).config(
                        "spark.sql.adaptive.enabled",
                        "true").config("spark.driver.memory",
                                       "8g").getOrCreate())

    # Register Apache Sedona UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (no effect on the "small" spatial join query in this script. Will improve bigger queries)
    # spark.conf.set("geospark.global.index", "true")
    # spark.conf.set("geospark.global.indextype", "rtree")
    # spark.conf.set("geospark.join.gridtype", "kdbtree")

    sc = spark.sparkContext

    logger.info("\t - PySpark {} session initiated: {}".format(
        sc.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------

    offset = 0  # The point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path),
                         offset, FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t - GNAF RDD created: {}".format(datetime.now() -
                                                   start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------

    for bdy in bdy_list:
        bdy_tag(spark, point_rdd, bdy)

    # point_rdd.unpersist()  # no such method on a SpatialRDD

    # ----------------------------------------------------------
    # merge boundary tag dataframes with GNAF records
    #   - required because spatial joins are INNER JOIN only,
    #     need to add untagged GNAF points
    # ----------------------------------------------------------

    start_time = datetime.now()

    # create gnaf dataframe and SQL view
    gnaf_df = spark.read \
        .option("header", False) \
        .option("inferSchema", True) \
        .csv(gnaf_csv_file_path) \
        .drop("_C0") \
        .drop("_C1") \
        .withColumnRenamed("_C2", "gnaf_pid") \
        .withColumnRenamed("_C3", "locality_pid") \
        .withColumnRenamed("_C4", "locality_name") \
        .withColumnRenamed("_C5", "postcode") \
        .withColumnRenamed("_C6", "state")
    # gnaf_df.printSchema()
    # gnaf_df.show(10, False)

    gnaf_df.createOrReplaceTempView("pnt")

    # add bdy tags, one bdy type at a time
    for bdy in bdy_list:
        gnaf_df = join_bdy_tags(spark, bdy)
        gnaf_df.createOrReplaceTempView("pnt")

    # # add point geoms for output to Postgres - in the PostGIS specific EWKT format
    # final_df = gnaf_df.withColumn("geom", f.expr("concat('SRID=4326;POINT (', longitude, ' ', latitude, ')')")) \
    #     .drop("longitude") \
    #     .drop("latitude")
    # # final_df.printSchema()
    # # final_df.show(10, False)

    logger.info("\t - Boundary tags merged: {}".format(datetime.now() -
                                                       start_time))

    # output result to Postgres
    export_to_postgres(gnaf_df, "testing2.gnaf_with_bdy_tags",
                       os.path.join(output_path, "temp_gnaf_with_bdy_tags"),
                       True)

    # cleanup
    spark.stop()

    # delete intermediate bdy tag files and GNAF csv file
    for bdy in bdy_list:
        shutil.rmtree(
            os.path.join(output_path, "gnaf_with_{}".format(bdy["name"])))

    os.remove(gnaf_csv_file_path)