Пример #1
0
    def test_distance_join_query(self):
        """Run the distance join repeatedly over a quadtree-partitioned PointRDD.

        Circles of radius 0.1 are derived from the points and partitioned with
        the point RDD's partitioner. Each iteration only counts the join
        output; no expectation on the count is asserted here.
        """
        points = PointRDD(
            sparkContext=self.sc,
            InputLocation=point_rdd_input_location,
            Offset=point_rdd_offset,
            splitter=point_rdd_splitter,
            carryInputData=False,
        )
        circles = CircleRDD(points, 0.1)

        points.analyze()
        points.spatialPartitioning(GridType.QUADTREE)
        # Reuse the partitioner of the point RDD for the query windows.
        circles.spatialPartitioning(points.getPartitioner())

        for _ in range(each_query_loop_times):
            # NOTE(review): the trailing flags are presumably
            # (use_index, consider_boundary_intersection) - confirm against
            # the JoinQuery API.
            joined = JoinQuery.DistanceJoinQuery(points, circles, False, True)
            result_size = joined.count()
Пример #2
0
    def test_indexed_rdd_assignment(self):
        """Time repeated indexed distance joins with and without persisted RDDs.

        The first round persists ``indexedRDD`` and the query windows'
        ``jvmSpatialPartitionedRDD`` with ``MEMORY_ONLY`` and counts both
        before timing the joins. The second round rebuilds the same inputs
        without persisting them. The elapsed time of the first round and the
        start time of the second are recorded, but the visible code never
        compares them.
        """
        import time

        def prepare_inputs():
            # Same construction sequence for both rounds: partition the
            # points, index them, partition the circles with the points'
            # partitioner, then build the index a second time.
            points = PointRDD(self.sc, point_rdd_input_location,
                              point_rdd_offset, point_rdd_splitter, True)
            circles = CircleRDD(points, 0.1)
            points.analyze()
            points.spatialPartitioning(GridType.QUADTREE)
            points.buildIndex(IndexType.QUADTREE, True)
            circles.spatialPartitioning(points.getPartitioner())
            # NOTE(review): the second buildIndex argument presumably selects
            # whether the index is built on the partitioned RDD - confirm.
            points.buildIndex(IndexType.RTREE, False)
            return points, circles

        def run_joins(points, circles):
            # Issue the indexed join once per configured iteration and return
            # the last observed count.
            last_count = None
            for _ in range(each_query_loop_times):
                joined = JoinQuery.DistanceJoinQuery(points, circles, True, True)
                last_count = joined.count()
            return last_count

        # Round one: persisted inputs, forced to evaluate via count().
        object_rdd, query_window_rdd = prepare_inputs()
        object_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
        query_window_rdd.jvmSpatialPartitionedRDD.persist(
            StorageLevel.MEMORY_ONLY)
        query_window_rdd.jvmSpatialPartitionedRDD.count()
        object_rdd.indexedRDD.count()

        start = time.time()
        result_size = run_joins(object_rdd, query_window_rdd)
        diff = time.time() - start

        # Round two: identical inputs, nothing persisted.
        object_rdd, query_window_rdd = prepare_inputs()

        start1 = time.time()
        result_size = run_joins(object_rdd, query_window_rdd)
Пример #3
0
    def test_distance_join_query(self):
        """Run the non-indexed distance join over persisted partitioned RDDs.

        Both the point RDD and the derived circle RDD (radius 0.1) have their
        ``spatialPartitionedRDD`` persisted with ``MEMORY_ONLY``; the test
        asserts that the point side reports ``is_cached`` and then counts the
        join output once per configured iteration.
        """
        points = PointRDD(
            self.sc,
            point_rdd_input_location,
            point_rdd_offset,
            point_rdd_splitter,
            True,
            StorageLevel.MEMORY_ONLY,
        )
        circles = CircleRDD(points, 0.1)

        points.spatialPartitioning(GridType.QUADTREE)
        # Reuse the partitioner of the point RDD for the query windows.
        circles.spatialPartitioning(points.getPartitioner())

        points.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
        assert points.spatialPartitionedRDD.is_cached

        circles.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

        for _ in range(each_query_loop_times):
            joined = JoinQuery.DistanceJoinQuery(points, circles, False, True)
            result_size = joined.count()
Пример #4
0
    def test_distance_join_result_to_dataframe(self):
        """Convert a flat distance-join result into a DataFrame and display it.

        Points are read from a comma-delimited CSV and geometries from a
        tab-delimited WKT file, both turned into spatial RDDs through SQL
        views. Circles of radius 0.2 around the WKT geometries are joined
        against the quadtree-indexed points; the resulting pair RDD is
        converted with ``Adapter.toDf`` and its schema and rows are printed.
        No assertion is made on the output.
        """
        # --- point side -----------------------------------------------------
        point_csv_df = (
            self.spark.read.format("csv")
            .option("delimiter", ",")
            .option("header", "false")
            .load(area_lm_point_input_location)
        )
        point_csv_df.createOrReplaceTempView("pointtable")
        point_query = "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
        point_df = self.spark.sql(point_query)

        point_rdd = Adapter.toSpatialRdd(point_df, "arealandmark")
        point_rdd.analyze()

        # --- polygon / circle side -----------------------------------------
        polygon_wkt_df = (
            self.spark.read.format("csv")
            .option("delimiter", "\t")
            .option("header", "false")
            .load(mixed_wkt_geometry_input_location)
        )
        polygon_wkt_df.createOrReplaceTempView("polygontable")
        polygon_query = "select ST_GeomFromWKT(polygontable._c0) as usacounty from polygontable"
        polygon_df = self.spark.sql(polygon_query)

        polygon_rdd = Adapter.toSpatialRdd(polygon_df, "usacounty")
        polygon_rdd.analyze()
        circle_rdd = CircleRDD(polygon_rdd, 0.2)

        # --- partition, index, join ----------------------------------------
        point_rdd.spatialPartitioning(GridType.QUADTREE)
        # Reuse the partitioner of the point RDD for the circles.
        circle_rdd.spatialPartitioning(point_rdd.getPartitioner())

        point_rdd.buildIndex(IndexType.QUADTREE, True)

        join_result_pair_rdd = JoinQuery.DistanceJoinQueryFlat(
            point_rdd, circle_rdd, True, True)

        join_result_df = Adapter.toDf(join_result_pair_rdd, self.spark)
        join_result_df.printSchema()
        join_result_df.show()
    def test_polygon_distance_join_with_crs_transformation(self):
        """Distance-join CRS-transformed polygons against circles around them.

        Both polygon RDDs are loaded from ``input_location_query_polygon``
        with the CRS arguments ``"epsg:4326"`` and ``"epsg:3857"``. The test
        expects exactly 5467 join results and checks that a circle of radius
        0.1 built from each result's first element covers every geometry
        paired with it.
        """
        query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter,
                               True, num_partitions, StorageLevel.MEMORY_ONLY,
                               "epsg:4326", "epsg:3857")
        window_rdd = CircleRDD(query_rdd, 0.1)

        object_rdd = PolygonRDD(self.sc, input_location_query_polygon,
                                splitter, True, num_partitions,
                                StorageLevel.MEMORY_ONLY, "epsg:4326",
                                "epsg:3857")

        # NOTE(review): the value returned by repartition() is discarded. If
        # it returns a new RDD rather than mutating in place (as Spark's
        # repartition does), this call has no lasting effect - confirm intent.
        object_rdd.rawJvmSpatialRDD.jsrdd.repartition(4)
        object_rdd.spatialPartitioning(GridType.RTREE)
        object_rdd.buildIndex(IndexType.RTREE, True)
        # The circles are partitioned with the grids computed for the objects.
        window_rdd.spatialPartitioning(object_rdd.grids)

        results = JoinQuery.DistanceJoinQuery(object_rdd, window_rdd, True,
                                              False).collect()

        # Use the len() builtin rather than calling the dunder directly.
        assert len(results) == 5467

        for data in results:
            window_geom = data[0].geom
            for polygon_data in data[1]:
                assert Circle(window_geom, 0.1).covers(polygon_data.geom)