Пример #1
0
    def test_creating_point_rdd(self):
        """Load the WKT point file into a PointRDD and verify its distinct count."""
        points = PointRDD(
            self.spark._sc, point_path, 4, FileDataSplitter.WKT, True
        )
        points.analyze()

        distinct_total = points.countWithoutDuplicates()
        assert distinct_total == 12872, \
            f"Point RDD should have 12872 but found {distinct_total}"
    def test_spatial_knn_correctness(self):
        """Compare KNN results obtained before and after building an RTREE index.

        Both result lists are ordered with ``distance_sorting_functions`` and
        the summed pairwise distance between same-rank geometries must be zero.
        """
        points = PointRDD(
            self.sc, input_location, offset, splitter, True,
            StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005",
        )

        plain_result = KNNQuery.SpatialKnnQuery(
            points, query_point, top_k, False)
        points.buildIndex(IndexType.RTREE, False)
        indexed_result = KNNQuery.SpatialKnnQuery(
            points, query_point, top_k, True)

        def by_distance(geo_data):
            # Shared sort key for both result lists.
            return distance_sorting_functions(geo_data, query_point)

        plain_sorted = sorted(plain_result, key=by_distance)
        indexed_sorted = sorted(indexed_result, key=by_distance)

        total_gap = sum(
            plain_sorted[rank].geom.distance(indexed_sorted[rank].geom)
            for rank in range(top_k)
        )

        assert total_gap == 0
Пример #3
0
 def test_build_index_without_set_grid(self):
     """Build an RTREE index on a PointRDD with no spatialPartitioning call."""
     points = PointRDD(
         sparkContext=self.sc,
         InputLocation=input_location,
         Offset=offset,
         splitter=splitter,
         carryInputData=True,
         partitions=num_partitions,
         newLevel=StorageLevel.MEMORY_ONLY,
     )
     points.buildIndex(IndexType.RTREE, False)
Пример #4
0
    def test_spatial_knn_query_using_index(self):
        """Run repeated KNN queries on a PointRDD that has an RTREE index built.

        Every iteration must return at least one neighbour, and the first
        neighbour's user data must not be ``None``.
        """
        point_rdd = PointRDD(self.sc, input_location, offset, splitter, False)
        point_rdd.buildIndex(IndexType.RTREE, False)

        for _ in range(self.loop_times):
            # NOTE(review): the last argument is False even though an index was
            # just built -- presumably it is the use-index flag; confirm.
            result = KNNQuery.SpatialKnnQuery(point_rdd, self.query_point,
                                              self.top_k, False)
            # The former ``> -1`` bound could never fail; require a real hit so
            # the ``result[0]`` access below is backed by an explicit check.
            assert len(result) > 0
            assert result[0].getUserData() is not None
Пример #5
0
 def test_knn_query_with_index(self):
     """Build an index on a PointRDD and issue repeated 1000-neighbour KNN queries."""
     points = PointRDD(
         sparkContext=self.sc,
         InputLocation=point_rdd_input_location,
         Offset=point_rdd_offset,
         splitter=point_rdd_splitter,
         carryInputData=False,
     )
     points.buildIndex(point_rdd_index_type, False)

     # Smoke test only: the query result is not inspected.
     for _ in range(each_query_loop_times):
         KNNQuery.SpatialKnnQuery(points, knn_query_point, 1000, True)
Пример #6
0
 def test_range_query_using_index(self):
     """Build an index on a PointRDD and run repeated counted range queries."""
     object_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=point_rdd_input_location,
                           Offset=point_rdd_offset,
                           splitter=point_rdd_splitter,
                           carryInputData=False)
     object_rdd.buildIndex(point_rdd_index_type, False)
     for _ in range(each_query_loop_times):
         # ``count`` must be called: the bare attribute only yields a bound
         # method, so no count was ever computed before.
         RangeQuery.SpatialRangeQuery(object_rdd,
                                      range_query_window,
                                      False, True).count()
    def test_spatial_knn_query_using_index(self):
        """Run repeated KNN queries on an indexed PointRDD built with EPSG codes.

        Each query must return a non-empty result whose first element carries
        user data.
        """
        points = PointRDD(
            self.sc, input_location, offset, splitter, True,
            StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005",
        )
        points.buildIndex(IndexType.RTREE, False)

        for _ in range(loop_times):
            neighbours = KNNQuery.SpatialKnnQuery(
                points, query_point, top_k, False)
            assert len(neighbours) > 0
            assert neighbours[0].getUserData() is not None
Пример #8
0
    def test_point_rdd(self):
        """Check the coordinates of the first four points collected from a PointRDD."""
        points = PointRDD(
            sparkContext=self.sc,
            InputLocation=point_rdd_input_location,
            Offset=point_rdd_offset,
            splitter=point_rdd_splitter,
            carryInputData=False,
        )

        collected = points.getRawSpatialRDD().collect()

        expected_xy = [
            [-88.331492, 32.324142],
            [-88.175933, 32.360763],
            [-88.388954, 32.357073],
            [-88.221102, 32.35078],
        ]
        actual_xy = [[item.geom.x, item.geom.y] for item in collected[:4]]

        assert actual_xy == expected_xy
Пример #9
0
 def test_crs_tranformed_spatial_range_query_using_index(self):
     """Run repeated counted range queries on an indexed PointRDD built with EPSG codes."""
     object_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=point_rdd_input_location,
                           Offset=point_rdd_offset,
                           splitter=point_rdd_splitter,
                           carryInputData=False,
                           newLevel=StorageLevel.DISK_ONLY,
                           sourceEpsgCRSCode="epsg:4326",
                           targetEpsgCode="epsg:3005")
     object_rdd.buildIndex(point_rdd_index_type, False)
     for _ in range(each_query_loop_times):
         # ``count`` must be called: the bare attribute only yields a bound
         # method, so no count was ever computed before.
         RangeQuery.SpatialRangeQuery(object_rdd,
                                      range_query_window,
                                      False, True).count()
Пример #10
0
    def readToPointRDD(cls, sc: SparkContext, inputPath: str) -> PointRDD:
        """
        Read a shapefile through the JVM-side ``ShapefileReader`` into a PointRDD.

        :param sc: Spark context whose ``_jvm`` and ``_jsc`` handles are used.
        :param inputPath: path handed unchanged to the JVM reader.
        :return: a new ``PointRDD`` wrapping the RDD returned by the JVM reader.
        """
        ShapefileReader.validate_imports()
        java_rdd = sc._jvm.ShapefileReader.readToPointRDD(sc._jsc, inputPath)

        point_rdd = PointRDD()
        point_rdd.set_srdd(java_rdd)
        return point_rdd
Пример #11
0
    def test_spatial_range_query_using_index(self):
        """Range-query an indexed PointRDD and check the hit count and user data.

        Every loop iteration must count 2830 hits; afterwards the second of
        the first ten hits must carry user data.
        """
        points = PointRDD(self.sc, input_location, offset, splitter, False)
        points.buildIndex(IndexType.RTREE, False)

        for _ in range(self.loop_times):
            hits = RangeQuery.SpatialRangeQuery(
                points, self.query_envelope, False, False)
            assert hits.count() == 2830

        first_ten = RangeQuery.SpatialRangeQuery(
            points, self.query_envelope, False, False).take(10)
        assert first_ten[1].getUserData() is not None
Пример #12
0
    def test_crs_transform(self):
        """Transform a PointRDD from epsg:4326 to epsg:3857 and check the first WKT."""
        points = PointRDD(sparkContext=self.sc,
                          InputLocation=crs_test_point,
                          Offset=0,
                          splitter=splitter,
                          carryInputData=True,
                          partitions=numPartitions,
                          newLevel=StorageLevel.MEMORY_ONLY)

        points.CRSTransform("epsg:4326", "epsg:3857")

        first_geometry = points.rawSpatialRDD.collect()[0].geom
        assert first_geometry.wkt == "POINT (-9833016.710450118 3805934.914254189)"
Пример #13
0
    def test_get_crs_transformation(self):
        """Check getCRStransformation is falsy before CRSTransform and truthy after."""
        points = PointRDD(sparkContext=self.sc,
                          InputLocation=crs_test_point,
                          Offset=0,
                          splitter=splitter,
                          carryInputData=True,
                          partitions=numPartitions,
                          newLevel=StorageLevel.MEMORY_ONLY)

        # Nothing has been transformed yet.
        assert not points.getCRStransformation()

        points.CRSTransform("epsg:4326", "epsg:3857")
        assert points.getCRStransformation()
    def test_spatial_range_query_using_index(self):
        """Range-query an indexed PointRDD built with EPSG codes.

        Every loop iteration must count 3127 hits; afterwards the second of
        the first ten hits must carry user data.
        """
        points = PointRDD(
            self.sc, input_location, offset, splitter, True,
            StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005",
        )
        points.buildIndex(IndexType.RTREE, False)

        for _ in range(loop_times):
            hits = RangeQuery.SpatialRangeQuery(
                points, query_envelope, False, False)
            assert hits.count() == 3127

        first_ten = RangeQuery.SpatialRangeQuery(
            points, query_envelope, False, False).take(10)
        assert first_ten[1].getUserData() is not None
Пример #15
0
 def test_empty_constructor(self):
     """Analyze an argument-less PointRDD that reuses another RDD's rawJvmSpatialRDD."""
     source_rdd = PointRDD(
         sparkContext=self.sc,
         InputLocation=input_location,
         Offset=offset,
         splitter=splitter,
         carryInputData=True,
         partitions=num_partitions,
         newLevel=StorageLevel.MEMORY_ONLY,
     )
     source_rdd.buildIndex(IndexType.RTREE, False)

     # Start from the empty constructor and borrow the populated JVM RDD.
     empty_rdd = PointRDD()
     empty_rdd.rawJvmSpatialRDD = source_rdd.rawJvmSpatialRDD
     empty_rdd.analyze()
Пример #16
0
    def test_get_source_epsg_code(self):
        """Check getSourceEpsgCode is empty before CRSTransform and set afterwards."""
        points = PointRDD(sparkContext=self.sc,
                          InputLocation=crs_test_point,
                          Offset=0,
                          splitter=splitter,
                          carryInputData=True,
                          partitions=numPartitions,
                          newLevel=StorageLevel.MEMORY_ONLY)

        # No transformation has been requested yet.
        assert points.getSourceEpsgCode() == ""

        points.CRSTransform("epsg:4326", "epsg:3857")
        assert points.getSourceEpsgCode() == "epsg:4326"
    def test_point_rdd(self):
        """Turn a PointRDD into DataFrames (inferred and explicit schema).

        The first row's geometry in the explicit-schema DataFrame is checked
        against a fixed WKT string.
        """
        points = PointRDD(
            sparkContext=self.sc,
            InputLocation=crs_test_point,
            Offset=0,
            splitter=splitter,
            carryInputData=True,
            partitions=numPartitions,
            newLevel=StorageLevel.MEMORY_ONLY,
        )

        # One row per geometry: the geometry followed by its tab-split user data.
        rows = points.rawSpatialRDD.map(
            lambda item: [item.geom, *item.getUserData().split("\t")])

        self.spark.createDataFrame(rows).show()

        geom_name_schema = StructType([
            StructField("geom", GeometryType()),
            StructField("name", StringType()),
        ])
        typed_df = self.spark.createDataFrame(rows, geom_name_schema)
        typed_df.show()

        first_row = typed_df.take(1)[0]
        assert first_row[0].wkt == "POINT (32.324142 -88.331492)"
Пример #18
0
 def test_circle_rdd(self):
     """Wrap a PointRDD in a CircleRDD (0.1) and print the collected WKTs."""
     points = PointRDD(
         sparkContext=self.sc,
         InputLocation=point_rdd_input_location,
         Offset=point_rdd_offset,
         splitter=point_rdd_splitter,
         carryInputData=False,
     )
     circles = CircleRDD(points, 0.1)

     wkts = [item.geom.wkt for item in circles.getRawSpatialRDD().collect()]
     print(wkts)
Пример #19
0
 def create_spatial_rdd(self):
     """Return a PointRDD built from ``input_file_location`` with MEMORY_ONLY storage."""
     return PointRDD(sparkContext=self.sc,
                     InputLocation=input_file_location,
                     Offset=offset,
                     splitter=splitter,
                     carryInputData=True,
                     partitions=numPartitions,
                     newLevel=StorageLevel.MEMORY_ONLY)
    def test_spatial_join_query_with_polygon_rdd(self):
        """Join points against polygons and check that joined items carry user data.

        The polygon RDD is partitioned with the point RDD's grids before the
        join is collected.
        """
        polygons = PolygonRDD(
            self.sc, input_location_query_polygon, splitter, True,
            num_partitions, StorageLevel.MEMORY_ONLY,
            "epsg:4326", "epsg:3005",
        )
        points = PointRDD(
            self.sc, input_location, offset, splitter, True,
            num_partitions, StorageLevel.MEMORY_ONLY,
            "epsg:4326", "epsg:3005",
        )

        points.spatialPartitioning(grid_type)
        polygons.spatialPartitioning(points.grids)

        joined = JoinQuery.SpatialJoinQuery(
            points, polygons, False, True).collect()
        assert joined[1][0].getUserData() is not None

        for pair in joined:
            matched = pair[1]
            if len(matched) == 0:
                continue
            for item in matched:
                assert item.getUserData() is not None
Пример #21
0
    def test_distance_join_query_using_index(self):
        """Run repeated counted distance joins between points and their 0.1 circles.

        The points are QUADTREE-partitioned, the circle RDD reuses the point
        RDD's partitioner, and an RTREE index is built before joining.
        """
        object_rdd = PointRDD(sparkContext=self.sc,
                              InputLocation=point_rdd_input_location,
                              Offset=point_rdd_offset,
                              splitter=point_rdd_splitter,
                              carryInputData=False)
        query_window_rdd = CircleRDD(object_rdd, 0.1)
        object_rdd.analyze()
        object_rdd.spatialPartitioning(GridType.QUADTREE)
        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.buildIndex(IndexType.RTREE, True)

        for _ in range(each_query_loop_times):
            # ``count`` must be called: the bare attribute only yields a bound
            # method, so no count was ever computed before.
            JoinQuery.DistanceJoinQuery(object_rdd,
                                        query_window_rdd, True,
                                        True).count()
Пример #22
0
    def test_spatial_range_query(self):
        """Run repeated range queries on a PointRDD and log each hit count."""
        points = PointRDD(
            sparkContext=self.sc,
            InputLocation=point_rdd_input_location,
            Offset=point_rdd_offset,
            splitter=point_rdd_splitter,
            carryInputData=False,
        )

        for _ in range(each_query_loop_times):
            hits = RangeQuery.SpatialRangeQuery(
                points, range_query_window, False, False)
            logging.info(hits.count())
Пример #23
0
    def test_equal_partitioning(self):
        """Partition a PointRDD with EQUALGRID and compare its two distinct counts.

        Each grid envelope is printed; the distinct count must match between
        ``countWithoutDuplicates`` and ``countWithoutDuplicatesSPRDD``.
        """
        points = PointRDD(
            sparkContext=self.sc,
            InputLocation=input_location,
            Offset=offset,
            splitter=splitter,
            carryInputData=False,
            partitions=10,
            newLevel=StorageLevel.MEMORY_ONLY,
        )
        points.analyze()
        points.spatialPartitioning(GridType.EQUALGRID)

        for grid in points.grids:
            print(f"PointRDD spatial partitioning grids: {grid!s}")

        raw_count = points.countWithoutDuplicates()
        partitioned_count = points.countWithoutDuplicatesSPRDD()
        assert raw_count == partitioned_count
Пример #24
0
    def test_spatial_join_using_index(self):
        """Run repeated counted spatial joins between points and polygon windows.

        The polygon RDD reuses the point RDD's partitioner and an index is
        built on the point RDD before joining.
        """
        polygons = PolygonRDD(
            self.sc, polygon_rdd_input_location,
            polygon_rdd_start_offset, polygon_rdd_end_offset,
            polygon_rdd_splitter, True,
        )
        points = PointRDD(
            sparkContext=self.sc,
            InputLocation=point_rdd_input_location,
            Offset=point_rdd_offset,
            splitter=point_rdd_splitter,
            carryInputData=False,
        )

        points.analyze()
        points.spatialPartitioning(join_query_partitionin_type)
        polygons.spatialPartitioning(points.getPartitioner())

        points.buildIndex(point_rdd_index_type, True)

        # Smoke test only: the count is computed but not checked.
        for _ in range(each_query_loop_times):
            JoinQuery.SpatialJoinQuery(points, polygons, True, False).count()
Пример #25
0
    def test_r_tree_spatial_partitioning(self):
        """Partition a PointRDD with GridType.RTREE and compare its two distinct counts.

        Each grid envelope is printed; the distinct count must match between
        ``countWithoutDuplicates`` and ``countWithoutDuplicatesSPRDD``.
        """
        points = PointRDD(
            sparkContext=self.sc,
            InputLocation=input_location,
            Offset=offset,
            splitter=splitter,
            carryInputData=True,
            partitions=10,
            newLevel=StorageLevel.MEMORY_ONLY,
        )
        points.analyze()
        points.spatialPartitioning(GridType.RTREE)

        for grid in points.grids:
            print(grid)

        raw_count = points.countWithoutDuplicates()
        partitioned_count = points.countWithoutDuplicatesSPRDD()
        assert raw_count == partitioned_count
    def test_save_as_geo_json_with_data(self, remove_wkb_directory):
        """Save a PointRDD as GeoJSON, read it back, and compare record counts."""
        original_rdd = PointRDD(sparkContext=self.sc,
                                InputLocation=inputLocation,
                                Offset=offset,
                                splitter=splitter,
                                carryInputData=True,
                                partitions=numPartitions,
                                newLevel=StorageLevel.MEMORY_ONLY)

        original_rdd.saveAsGeoJSON(test_save_as_wkb_with_data)

        # Read the saved output back using the GEOJSON splitter.
        reloaded_rdd = PointRDD(sparkContext=self.sc,
                                InputLocation=test_save_as_wkb_with_data,
                                splitter=FileDataSplitter.GEOJSON,
                                carryInputData=True,
                                partitions=numPartitions,
                                newLevel=StorageLevel.MEMORY_ONLY)

        reloaded_count = reloaded_rdd.rawSpatialRDD.count()
        original_count = original_rdd.rawSpatialRDD.count()
        assert reloaded_count == original_count
Пример #27
0
 def test_empty_constructor_test(self):
     """Analyze an argument-less PointRDD that reuses another RDD's rawJvmSpatialRDD."""
     source_rdd = PointRDD(
         sparkContext=self.sc,
         InputLocation=point_rdd_input_location,
         Offset=point_rdd_offset,
         splitter=point_rdd_splitter,
         carryInputData=False,
     )

     # Start from the empty constructor and borrow the populated JVM RDD.
     empty_rdd = PointRDD()
     empty_rdd.rawJvmSpatialRDD = source_rdd.rawJvmSpatialRDD
     empty_rdd.analyze()
    def test_on_boundary_point_join_correctness(self):
        """Join the on-boundary points with the polygon windows and verify both runs.

        The join is collected with the third flag set to ``True`` and then
        ``False``; each result goes through ``verify_join_result``.
        """
        polygons = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY,
        )
        points = PointRDD(
            self.sc.parallelize(self.test_on_boundary_point_set),
            StorageLevel.MEMORY_ONLY,
        )
        self.prepare_rdd(points, polygons, GridType.QUADTREE)

        first_run = JoinQuery.SpatialJoinQuery(
            points, polygons, True, False).collect()
        self.verify_join_result(first_run)

        second_run = JoinQuery.SpatialJoinQuery(
            points, polygons, False, False).collect()
        self.verify_join_result(second_run)
    def test_outside_point_join_correctness(self):
        """Join the outside points with the polygon windows and expect no matches.

        The join is collected with the third flag set to ``True`` and then
        ``False``; both results must be empty.
        """
        self.once_before_all()

        polygons = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY,
        )
        points = PointRDD(
            self.sc.parallelize(self.test_outside_point_set),
            StorageLevel.MEMORY_ONLY,
        )
        self.prepare_rdd(points, polygons, GridType.QUADTREE)

        first_run = JoinQuery.SpatialJoinQuery(
            points, polygons, True, False).collect()
        assert len(first_run) == 0

        second_run = JoinQuery.SpatialJoinQuery(
            points, polygons, False, False).collect()
        assert len(second_run) == 0
Пример #30
0
    def test_spatial_join_query(self):
        """Join WKT points with WKT county polygons and print the result count.

        The points are KDBTREE-partitioned and the polygons reuse the point
        RDD's partitioner.
        """
        points = PointRDD(
            self.sc, point_path, 4, FileDataSplitter.WKT, True)
        counties = PolygonRDD(
            self.sc, counties_path, 2, 3, FileDataSplitter.WKT, True)

        points.analyze()
        points.spatialPartitioning(GridType.KDBTREE)
        counties.spatialPartitioning(points.getPartitioner())

        joined = JoinQuery.SpatialJoinQuery(points, counties, True, False)
        print(joined.count())