def draw_world_weighted_point_map(spark):
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "continent string, country string, locationId string, longitude double, latitude double,"
        "currentConfirmedCount int, confirmedCount int, suspectedCount int, curedCount int, deadCount int, "
        "updateTime timestamp").load(country_csv).cache()

    df.createOrReplaceTempView("COVID_country")

    register_funcs(spark)

    # 1
    res1 = spark.sql(
        "select ST_Point(longitude, latitude) as point from COVID_country ")
    res1.createOrReplaceTempView("res1")
    res1 = spark.sql("select * from res1 where point != 'POINT (nan nan)' ")
    res1.show(20, False)
    vega1 = vega_weighted_pointmap(
        3000, 2000, [-289.095983, -73.863121, 289.095983, 73.863121],
        "#EEEEEE", [2, 60], [6], 1.0, "EPSG:4326")
    res_png1 = weighted_pointmap(res1, vega1)
    save_png(res_png1, './COVID_country_weighted_point_map1.png')

    spark.catalog.dropTempView("COVID_country")
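A minimal, hypothetical driver for the example above, assuming the function lives in the same file. The import paths follow Arctern's usual layout but are assumptions, as is the country_csv location; the same helpers (save_png, register_funcs, the vega_* builders and the drawing functions) are assumed by the later examples in this listing as well.

# Hypothetical driver sketch; import paths and country_csv are assumptions.
from pyspark.sql import SparkSession
from arctern.util import save_png                               # assumed
from arctern.util.vega import vega_weighted_pointmap            # assumed
from arctern_pyspark import register_funcs, weighted_pointmap   # assumed

country_csv = "/tmp/covid_country.csv"  # hypothetical input path

if __name__ == "__main__":
    spark = SparkSession.builder \
        .appName("COVID_weighted_pointmap") \
        .master("local[*]") \
        .getOrCreate()
    # Arrow-backed conversion is what the drawing APIs rely on elsewhere in this listing.
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    draw_world_weighted_point_map(spark)
    spark.stop()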
Example #2
def draw_heat_map(spark):
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")
    # df.createOrReplaceGlobalTempView("nyc_taxi")

    res = spark.sql(
        "select pickup_latitude as x, pickup_longitude as y, passenger_count as w from nyc_taxi"
    )
    res.printSchema()
    res.createOrReplaceTempView("pickup")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Transform(ST_Point(x, y), 'EPSG:4326','EPSG:3857' ) as pickup_point, w from pickup"
    )
    res.show(20, False)
    res.createOrReplaceTempView("project")

    res = spark.sql(
        "select Projection(pickup_point, 'POINT (4534000 -12510000)', 'POINT (4538000 -12513000)', 1024, 896) as point, w from project"
    )
    res.show(20, False)

    vega_heat_map = VegaHeatMap(300, 200, 10.0)
    vega = vega_heat_map.build()
    res = heatmap(res, vega)
    save_png(res, '/tmp/heatmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
Example #3
def draw_heat_map(spark):
    start_time = time.time()
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, passenger_count as w from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )

    res.show()

    vega = vega_heatmap(1024, 896,
                        [-73.998427, 40.730309, -73.954348, 40.780816], 10.0,
                        'EPSG:4326')
    res = heatmap(vega, res)
    save_png(res, '/tmp/heatmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropGlobalTempView("nyc_taxi")
    print("--- %s seconds ---" % (time.time() - start_time))
Example #4
def draw_point_map(spark):
    # file 0_5M_nyc_build.csv is generated from the New York taxi data and the taxi zone shapefile. The data is available at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string"
    ).load(data_path).cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")
    # df.createOrReplaceGlobalTempView("nyc_taxi")

    res = spark.sql(
        "select pickup_latitude as x, pickup_longitude as y  from nyc_taxi")
    res.printSchema()
    res.createOrReplaceTempView("pickup")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Transform(ST_Point(x, y), 'EPSG:4326','EPSG:3857' ) as pickup_point from pickup"
    )
    res.show(20, False)
    res.createOrReplaceTempView("project")

    res = spark.sql(
        "select Projection(pickup_point, 'POINT (4534000 -12510000)', 'POINT (4538000 -12513000)', 1024, 896) as point from project"
    )
    res.show(20, False)

    vega_point_map = VegaCircle2d(1900, 1410, 3, "#2DEF4A", 0.5)
    vega = vega_point_map.build()
    res = pointmap(res, vega)
    save_png(res, '/tmp/pointmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
Example #5
def draw_point_map(spark):
    # file 0_5M_nyc_taxi_and_building.csv can be obtained from the arctern-tutorial repository under the zilliztech account on GitHub: https://github.com/zilliztech/arctern-tutorial
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")
    # df.createOrReplaceGlobalTempView("nyc_taxi")

    res = spark.sql(
        "select pickup_latitude as x, pickup_longitude as y  from nyc_taxi")
    res.printSchema()
    res.createOrReplaceTempView("pickup")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Transform(ST_Point(x, y), 'EPSG:4326','EPSG:3857' ) as pickup_point from pickup"
    )
    res.show(20, False)
    res.createOrReplaceTempView("project")

    res = spark.sql(
        "select Projection(pickup_point, 'POINT (4534000 -12510000)', 'POINT (4538000 -12513000)', 1024, 896) as point from project"
    )
    res.show(20, False)

    vega_point_map = VegaCircle2d(1900, 1410, 3, "#2DEF4A", 0.5)
    vega = vega_point_map.build()
    res = pointmap(res, vega)
    save_png(res, '/tmp/pointmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
Example #6
def draw_point_map(spark):
    start_time = time.time()
    # file 0_5M_nyc_taxi_and_building.csv can be obtained from the arctern-tutorial repository under the zilliztech account on GitHub: https://github.com/zilliztech/arctern-tutorial
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )

    vega = vega_pointmap(
        1024,
        896,
        bounding_box=[-73.998427, 40.730309, -73.954348, 40.780816],
        point_size=3,
        point_color="#2DEF4A",
        opacity=0.5,
        coordinate_system="EPSG:4326")
    res = pointmap(vega, res)
    save_png(res, '/tmp/pointmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
    print("--- %s seconds ---" % (time.time() - start_time))
Example #7
    def _create_session(self):
        """
        Clone a new Spark session and register the UDFs on it.
        """
        session = self.session.newSession()
        register_funcs(session)
        return session
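SparkSession.newSession() shares the underlying SparkContext and any cached data, but isolates the SQL configuration and temporary views, which is why the clone needs register_funcs called on it again. A hedged usage sketch, with svc standing in for a hypothetical instance of the enclosing class:

# Hedged usage sketch; `svc` is a hypothetical instance of the class above.
session_a = svc._create_session()
session_b = svc._create_session()
session_a.range(3).createOrReplaceTempView("t")
# "t" is visible only in session_a; session_b.sql("select * from t")
# would raise an AnalysisException, since temp views are per-session.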
Example #8
def draw_choropleth_map(spark):
    start_time = time.time()
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)
    res = spark.sql(
        "select ST_GeomFromText(buildingtext_dropoff) as polygon, passenger_count as w from nyc_taxi where (buildingtext_dropoff!='')"
    )

    vega1 = vega_choroplethmap(
        1900,
        1410,
        bounding_box=[-73.994092, 40.753893, -73.977588, 40.759642],
        color_gradient=["#0000FF", "#FF0000"],
        color_bound=[2.5, 5],
        opacity=1.0,
        coordinate_system='EPSG:4326')
    res1 = choroplethmap(vega1, res)
    save_png(res1, '/tmp/choroplethmap1.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
    print("--- %s seconds ---" % (time.time() - start_time))
Example #9
    def __init__(self, db_config):
        envs = db_config['spark'].get('envs', None)
        if envs:  # for spark on yarn
            self._setup_driver_envs(envs)

        import uuid
        self._db_id = str(uuid.uuid1()).replace('-', '')
        self._db_name = db_config['db_name']
        self._db_type = 'spark'
        self._table_list = []

        print("init spark begin")
        import socket
        localhost_ip = socket.gethostbyname(socket.gethostname())
        _t = SparkSession.builder \
            .appName(db_config['spark']['app_name']) \
            .master(db_config['spark']['master-addr']) \
            .config('spark.driver.host', localhost_ip) \
            .config("spark.sql.execution.arrow.pyspark.enabled", "true")

        configs = db_config['spark'].get('configs', None)
        if configs:
            for key in configs:
                _v = configs.get(key)
                if _v:
                    print("spark config: {} = {}".format(key, _v))
                    _t = _t.config(key, _v)

        self.session = _t.getOrCreate()

        print("init spark done")
        register_funcs(self.session)
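From the keys the constructor reads, db_config is expected to look roughly like the following; the concrete values are hypothetical.

# Hypothetical db_config matching the keys the constructor above reads.
db_config = {
    'db_name': 'demo_db',
    'spark': {
        'app_name': 'arctern_demo',
        'master-addr': 'local[*]',
        # 'envs' is only needed for spark on yarn, e.g.:
        # 'envs': {'PYSPARK_PYTHON': '/usr/bin/python3'},
        'envs': None,
        'configs': {
            'spark.executor.memory': '4g',
        },
    },
}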
Example #10
    def run(sql):
        """
        Submit the given SQL to Spark and return the result.
        """
        session = INSTANCE.create_session()
        register_funcs(session)
        return session.sql(sql)
Example #11
def draw_world_include_province_weighted_point_map(spark):
    # 1
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "Province string, Country string, Longitude double, Latitude double, ConfirmedCount int,"
        "DeadCount int, CuredCount int, LastUpdateTime string").load(
            country_with_province_csv).cache()

    df.createOrReplaceTempView("COVID_country_province")

    register_funcs(spark)

    res2 = spark.sql(
        "select ST_Point(Longitude, Latitude) as point, ConfirmedCount as s from COVID_country_province "
        "where LastUpdateTime like '%03-29%'")
    res2.createOrReplaceTempView("res2")
    res2 = spark.sql("select * from res2 where point != 'POINT (nan nan)' ")
    vega2 = vega_weighted_pointmap(
        3000, 2000, [-289.095983, -73.863121, 289.095983, 73.863121],
        "#F0356D", [2, 60], [6, 60], 1.0, "EPSG:4326")
    res_png2 = weighted_pointmap(res2, vega2)
    save_png(res_png2, './COVID_country_weighted_point_map2.png')

    spark.catalog.dropTempView("COVID_country_province")
Example #12
def run_curve_z(spark):
    curve_z_df = spark.read.json("/tmp/z_curve.json").cache()
    curve_z_df.createOrReplaceTempView("curve_z")
    register_funcs(spark)
    hex_data = spark.sql("select my_plot(x, y) from curve_z").collect()[0][0]
    str_hex_data = str(hex_data)
    import binascii
    binary_string = binascii.unhexlify(str_hex_data)
    with open('/tmp/hex_curve_z.png', 'wb') as png:
        png.write(binary_string)
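For reference, the binascii round trip above can also be written with the built-in bytes.fromhex; this sketch is equivalent, not a correction:

# Equivalent decode using the built-in bytes.fromhex instead of binascii.
hex_data = spark.sql("select my_plot(x, y) from curve_z").collect()[0][0]
with open('/tmp/hex_curve_z.png', 'wb') as png:
    png.write(bytes.fromhex(str(hex_data)))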
Example #13
def draw_weighted_point_map(spark):
    start_time = time.time()
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)

    # a single color and a single stroke width
    res1 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )
    vega1 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], ["#87CEEB"],
        [0, 2], [5], 1.0, "EPSG:4326")
    res1 = weighted_pointmap(vega1, res1)
    save_png(res1, '/tmp/weighted_pointmap_0_0.png')

    # multiple colors and a single stroke width
    res2 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, tip_amount as c from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )
    vega2 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        ["#0000FF", "#FF0000"], [0, 2], [5], 1.0, "EPSG:4326")
    res2 = weighted_pointmap(vega2, res2)
    save_png(res2, '/tmp/weighted_pointmap_1_0.png')

    # a single color and multiple stroke widths
    res3 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, fare_amount as s from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )
    vega3 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], ["#87CEEB"],
        [0, 2], [0, 10], 1.0, "EPSG:4326")
    res3 = weighted_pointmap(vega3, res3)
    save_png(res3, '/tmp/weighted_pointmap_0_1.png')

    # multiple colors and multiple stroke widths
    res4 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, tip_amount as c, fare_amount as s from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )
    vega4 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        ["#0000FF", "#FF0000"], [0, 2], [0, 10], 1.0, "EPSG:4326")
    res4 = weighted_pointmap(vega4, res4)
    save_png(res4, '/tmp/weighted_pointmap_1_1.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
    print("--- %s seconds ---" % (time.time() - start_time))
Example #14
    def __init__(self):
        self.session = SparkSession.builder \
            .appName("Arctern") \
            .master(config.INSTANCE.get("spark", "master-addr")) \
            .config("spark.executorEnv.PYSPARK_PYTHON",
                    config.INSTANCE.get("spark", "executor-python")
                    ) \
            .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
            .config("spark.databricks.session.share", "false") \
            .getOrCreate()
        register_funcs(self.session)
Example #15
def draw_weighted_point_map(spark):
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string"
    ).load(data_path).cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)

    # a single color and a single stroke width
    res1 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )
    vega1 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], "#87CEEB",
        [0, 2], [5], 1.0, "EPSG:4326")
    res1 = weighted_pointmap(res1, vega1)
    save_png(res1, '/tmp/weighted_pointmap_0_0.png')

    # multiple colors and a single stroke width
    res2 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, tip_amount as c from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )
    vega2 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        "blue_to_red", [0, 2], [5], 1.0, "EPSG:4326")
    res2 = weighted_pointmap(res2, vega2)
    save_png(res2, '/tmp/weighted_pointmap_1_0.png')

    # a single color and multiple stroke widths
    res3 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, fare_amount as s from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )
    vega3 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], "#87CEEB",
        [0, 2], [0, 10], 1.0, "EPSG:4326")
    res3 = weighted_pointmap(res3, vega3)
    save_png(res3, '/tmp/weighted_pointmap_0_1.png')

    # multiple colors and multiple stroke widths
    res4 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, tip_amount as c, fare_amount as s from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )
    vega4 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        "blue_to_red", [0, 2], [0, 10], 1.0, "EPSG:4326")
    res4 = weighted_pointmap(res4, vega4)
    save_png(res4, '/tmp/weighted_pointmap_1_1.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
Example #16
def run_test_plot(spark):
    register_funcs(spark)

    raw_data = []
    raw_data.extend([(0, 'polygon((0 0,0 1,1 1,1 0,0 0))')])
    raw_data.extend([(1, 'linestring(0 0,0 1,1 1,1 0,0 0)')])
    raw_data.extend([(2, 'point(2 2)')])

    wkt_collect = "GEOMETRYCOLLECTION(" \
                  "MULTIPOLYGON (((0 0,0 1,1 1,1 0,0 0)),((1 1,1 2,2 2,2 1,1 1)))," \
                  "POLYGON((3 3,3 4,4 4,4 3,3 3))," \
                  "LINESTRING(0 8,5 5,8 0)," \
                  "POINT(4 7)," \
                  "MULTILINESTRING ((1 1,1 2),(2 4,1 9,1 8))," \
                  "MULTIPOINT (6 8,5 7)" \
                  ")"
    raw_data.extend([(3, wkt_collect)])

    raw_schema = StructType([
        StructField('idx', LongType(), False),
        StructField('geo', StringType(), False)
    ])

    df = spark.createDataFrame(data=raw_data, schema=raw_schema)
    df.createOrReplaceTempView("geoms")
    df2 = spark.sql("select st_geomfromtext(geo) from geoms")

    # run baseline
    fig1, ax1 = plt.subplots()
    plot(ax1, df2)
    ax1.grid()
    baseline_png1 = png_path + "plot_test_1.png"
    fig1.savefig(baseline_png1)

    # run plot test
    fig2, ax2 = plt.subplots()
    plot(ax2, df2)
    ax2.grid()
    plot_test1 = png_path + "test_plot_test_1.png"
    fig2.savefig(plot_test1)

    spark.catalog.dropTempView("geoms")

    assert run_diff_png(baseline_png1, plot_test1)
Example #17
def draw_heat_map(spark):
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string"
    ).load(data_path).cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, passenger_count as w from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )

    vega = vega_heatmap(1024, 896, 10.0,
                        [-73.998427, 40.730309, -73.954348, 40.780816],
                        'EPSG:4326')
    res = heatmap(res, vega)
    save_png(res, '/tmp/heatmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
Example #18
def draw_point_map(spark):
    # file 0_5M_nyc_build.csv is generated from the New York taxi data and the taxi zone shapefile. The data is available at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string"
    ).load(data_path).cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), 'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )

    vega = vega_pointmap(1024, 896,
                         [-73.998427, 40.730309, -73.954348, 40.780816], 3,
                         "#2DEF4A", 0.5, "EPSG:4326")
    res = pointmap(res, vega)
    save_png(res, '/tmp/pointmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
Example #19
def run_test_heat_map(spark):
    df = spark.read.format("csv").option("header", True).option("delimiter", ",").schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, "
        "trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, "
        "dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, "
        "buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string").load(
        file_path).cache()
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, passenger_count as w from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude),  'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')")

    # 1 size:1024*896, map_scale: 10.0
    vega_1 = vega_heatmap(1024, 896, 10.0, [-73.998427, 40.730309, -73.954348, 40.780816], 'EPSG:4326')
    baseline1 = heatmap(res, vega_1)
    heat_map1_1 = heatmap(res, vega_1)
    heat_map1_2 = heatmap(res, vega_1)

    baseline_png1 = png_path + "heat_map_nyc_1.png"
    save_png(baseline1, baseline_png1)
    save_png(heat_map1_1, png_path + "test_heat_map_nyc_1-1.png")
    save_png(heat_map1_2, png_path + "test_heat_map_nyc_1-2.png")

    # 2 map_scale: 0.0
    vega_2 = vega_heatmap(1024, 896, 0.0, [-73.998427, 40.730309, -73.954348, 40.780816], 'EPSG:4326')
    baseline2 = heatmap(res, vega_2)
    heat_map2_1 = heatmap(res, vega_2)
    heat_map2_2 = heatmap(res, vega_2)

    baseline_png2 = png_path + "heat_map_nyc_2.png"
    save_png(baseline2, baseline_png2)
    save_png(heat_map2_1, png_path + "test_heat_map_nyc_2-1.png")
    save_png(heat_map2_2, png_path + "test_heat_map_nyc_2-2.png")

    # 3 map_scale: 12.0
    vega_3 = vega_heatmap(1024, 896, 12.0, [-73.998427, 40.730309, -73.954348, 40.780816], 'EPSG:4326')
    baseline3 = heatmap(res, vega_3)
    heat_map3_1 = heatmap(res, vega_3)
    heat_map3_2 = heatmap(res, vega_3)

    baseline_png3 = png_path + "heat_map_nyc_3.png"
    save_png(baseline3, baseline_png3)
    save_png(heat_map3_1, png_path + "test_heat_map_nyc_3-1.png")
    save_png(heat_map3_2, png_path + "test_heat_map_nyc_3-2.png")

    # 4 map_scale: 5.5
    vega_4 = vega_heatmap(1024, 896, 5.5, [-73.998427, 40.730309, -73.954348, 40.780816], 'EPSG:4326')
    baseline4 = heatmap(res, vega_4)
    heat_map4_1 = heatmap(res, vega_4)
    heat_map4_2 = heatmap(res, vega_4)

    baseline_png4 = png_path + "heat_map_nyc_4.png"
    save_png(baseline4, baseline_png4)
    save_png(heat_map4_1, png_path + "test_heat_map_nyc_4-1.png")
    save_png(heat_map4_2, png_path + "test_heat_map_nyc_4-2.png")

    # 5 size:200*200
    vega_5 = vega_heatmap(200, 200, 10.0, [-73.998427, 40.730309, -73.954348, 40.780816], 'EPSG:4326')
    baseline5 = heatmap(res, vega_5)
    heat_map5_1 = heatmap(res, vega_5)
    heat_map5_2 = heatmap(res, vega_5)

    baseline_png5 = png_path + "heat_map_nyc_5.png"
    save_png(baseline5, baseline_png5)
    save_png(heat_map5_1, png_path + "test_heat_map_nyc_5-1.png")
    save_png(heat_map5_2, png_path + "test_heat_map_nyc_5-2.png")

    spark.catalog.dropTempView("nyc_taxi")

    assert run_diff_png(baseline_png1, png_path + "test_heat_map_nyc_1-1.png", 0.1)
    assert run_diff_png(baseline_png1, png_path + "test_heat_map_nyc_1-2.png", 0.1)
    assert run_diff_png(baseline_png2, png_path + "test_heat_map_nyc_2-1.png", 0.1)
    assert run_diff_png(baseline_png2, png_path + "test_heat_map_nyc_2-2.png", 0.1)
    assert run_diff_png(baseline_png3, png_path + "test_heat_map_nyc_3-1.png", 0.15)
    assert run_diff_png(baseline_png3, png_path + "test_heat_map_nyc_3-2.png", 0.15)
    assert run_diff_png(baseline_png4, png_path + "test_heat_map_nyc_4-1.png", 0.1)
    assert run_diff_png(baseline_png4, png_path + "test_heat_map_nyc_4-2.png", 0.1)
    assert run_diff_png(baseline_png5, png_path + "test_heat_map_nyc_5-1.png", 0.2)
    assert run_diff_png(baseline_png5, png_path + "test_heat_map_nyc_5-2.png", 0.2)
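run_diff_png itself is not shown in this listing; judging from the asserts, it returns True when two PNGs match within a given tolerance (the fraction of pixels allowed to differ, defaulting to an exact match). A minimal re-implementation sketch under that assumption, using Pillow; the real helper's semantics may differ:

# Hedged re-implementation sketch of a run_diff_png-style comparator.
from PIL import Image

def diff_png(png1, png2, tolerance=0.0):
    # True when at most `tolerance` (a fraction, 0.0 = exact) of pixels differ.
    img1 = Image.open(png1).convert("RGBA")
    img2 = Image.open(png2).convert("RGBA")
    if img1.size != img2.size:
        return False
    total = img1.size[0] * img1.size[1]
    differing = sum(p1 != p2 for p1, p2 in zip(img1.getdata(), img2.getdata()))
    return differing <= tolerance * total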
Example #20
def run_st_intersection(spark):
    test_df = spark.read.json("/tmp/intersection.json").cache()
    test_df.createOrReplaceTempView("intersection")
    register_funcs(spark)
    spark.sql("select ST_Intersection_UDF(left, right) from intersection").show()
Example #21
    #df.show()
    df.createOrReplaceTempView(table_name)

    rs = spark.sql(sql).cache()
    #rs.printSchema()
    #rs.show()
    save_result("results/%s" % table_name, rs)

if __name__ == "__main__":

    url = 'local'
    spark_session = SparkSession.builder.appName("Python zgis sample").master(url).getOrCreate()
    spark_session.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

    clear_result_dir('/tmp/results')
    register_funcs(spark_session)

    run_test_st_geomfromgeojson(spark_session)
    run_test_st_geomfromgeojson2(spark_session)
    run_test_st_curvetoline(spark_session)
    run_test_st_point(spark_session)
    run_test_envelope_aggr_1(spark_session)
    run_test_envelope_aggr_curve(spark_session)
    run_test_envelope_aggr_2(spark_session)
    run_test_union_aggr_2(spark_session)
    run_test_union_aggr_curve(spark_session)
    run_test_st_isvalid_1(spark_session)
    run_test_st_isvalid_curve(spark_session)
    run_test_st_intersection(spark_session)
    run_test_st_intersection_curve(spark_session)
    run_test_st_convexhull(spark_session)
Example #22
def run_st_point(spark):
    points_df = spark.read.json("/tmp/points.json").cache()
    points_df.createOrReplaceTempView("points")
    register_funcs(spark)
    spark.sql("select ST_Point(x, y) from points").show()
Example #23
def run_test_point_map(spark):
    # file 0_5M_nyc_taxi_and_building.csv can be obtained from the arctern-tutorial repository under the zilliztech account on GitHub: https://github.com/zilliztech/arctern-tutorial
    # file 0_10000_nyc_taxi_and_building.csv contains the first 10000 lines of 0_5M_nyc_taxi_and_building.csv
    df = spark.read.format("csv").option("header", True).option("delimiter", ",").schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, "
        "trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, "
        "dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, "
        "buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string").load(
        file_path).cache()
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), 'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')")

    # 1 size:1024*896, point_size: 3, opacity: 0.5, color: #2DEF4A(green)
    vega_1 = vega_pointmap(1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], 3, "#2DEF4A", 0.5, "EPSG:4326")
    baseline1 = pointmap(res, vega_1)
    point_map1_1 = pointmap(res, vega_1)
    point_map1_2 = pointmap(res, vega_1)

    baseline_png1 = png_path + "point_map_nyc_1.png"
    save_png(baseline1, baseline_png1)
    save_png(point_map1_1, png_path + "test_point_map_nyc_1-1.png")
    save_png(point_map1_2, png_path + "test_point_map_nyc_1-2.png")

    # 2 #F50404(red)
    vega_2 = vega_pointmap(1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], 5, "#F50404", 0.5, "EPSG:4326")
    baseline2 = pointmap(res, vega_2)
    point_map2_1 = pointmap(res, vega_2)
    point_map2_2 = pointmap(res, vega_2)

    baseline_png2 = png_path + "point_map_nyc_2.png"
    save_png(baseline2, baseline_png2)
    save_png(point_map2_1, png_path + "test_point_map_nyc_2-1.png")
    save_png(point_map2_2, png_path + "test_point_map_nyc_2-2.png")

    # 3 color: #1455EE(blue)
    vega_3 = vega_pointmap(1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], 5, "#1455EE", 0.5, "EPSG:4326")
    baseline3 = pointmap(res, vega_3)
    point_map3_1 = pointmap(res, vega_3)
    point_map3_2 = pointmap(res, vega_3)

    baseline_png3 = png_path + "point_map_nyc_3.png"
    save_png(baseline3, baseline_png3)
    save_png(point_map3_1, png_path + "test_point_map_nyc_3-1.png")
    save_png(point_map3_2, png_path + "test_point_map_nyc_3-2.png")

    # 4 size:1024*896, point_size: 3, opacity: 1, color: #2DEF4A
    vega_4 = vega_pointmap(1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], 3, "#2DEF4A", 1.0, "EPSG:4326")
    baseline4 = pointmap(res, vega_4)
    point_map4_1 = pointmap(res, vega_4)
    point_map4_2 = pointmap(res, vega_4)

    baseline_png4 = png_path + "point_map_nyc_4.png"
    save_png(baseline4, baseline_png4)
    save_png(point_map4_1, png_path + "test_point_map_nyc_4-1.png")
    save_png(point_map4_2, png_path + "test_point_map_nyc_4-2.png")

    # 5 size:1024*896, point_size: 3, opacity: 0, color: #2DEF4A
    vega_5 = vega_pointmap(1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], 3, "#2DEF4A", 0.0, "EPSG:4326")
    baseline5 = pointmap(res, vega_5)
    point_map5_1 = pointmap(res, vega_5)
    point_map5_2 = pointmap(res, vega_5)

    baseline_png5 = png_path + "point_map_nyc_5.png"
    save_png(baseline5, baseline_png5)
    save_png(point_map5_1, png_path + "test_point_map_nyc_5-1.png")
    save_png(point_map5_2, png_path + "test_point_map_nyc_5-2.png")

    # 6 size:200*200, point_size: 3, opacity: 0.5, color: #2DEF4A
    vega_6 = vega_pointmap(200, 200, [-73.998427, 40.730309, -73.954348, 40.780816], 3, "#2DEF4A", 0.5, "EPSG:4326")
    baseline6 = pointmap(res, vega_6)
    point_map6_1 = pointmap(res, vega_6)
    point_map6_2 = pointmap(res, vega_6)

    baseline_png6 = png_path + "point_map_nyc_6.png"
    save_png(baseline6, baseline_png6)
    save_png(point_map6_1, png_path + "test_point_map_nyc_6-1.png")
    save_png(point_map6_2, png_path + "test_point_map_nyc_6-2.png")

    spark.catalog.dropTempView("nyc_taxi")

    assert run_diff_png(baseline_png1, png_path + "test_point_map_nyc_1-1.png")
    assert run_diff_png(baseline_png1, png_path + "test_point_map_nyc_1-2.png")
    assert run_diff_png(baseline_png2, png_path + "test_point_map_nyc_2-1.png")
    assert run_diff_png(baseline_png2, png_path + "test_point_map_nyc_2-2.png")
    assert run_diff_png(baseline_png3, png_path + "test_point_map_nyc_3-1.png")
    assert run_diff_png(baseline_png3, png_path + "test_point_map_nyc_3-2.png")
    assert run_diff_png(baseline_png4, png_path + "test_point_map_nyc_4-1.png")
    assert run_diff_png(baseline_png4, png_path + "test_point_map_nyc_4-2.png")
    assert run_diff_png(baseline_png5, png_path + "test_point_map_nyc_5-1.png")
    assert run_diff_png(baseline_png5, png_path + "test_point_map_nyc_5-2.png")
    assert run_diff_png(baseline_png6, png_path + "test_point_map_nyc_6-1.png")
    assert run_diff_png(baseline_png6, png_path + "test_point_map_nyc_6-2.png")
Example #24
    args = parse.parse_args()
    source_file = args.source_file[0]
    output_file = args.output_file[0]
    run_times = int(args.run_times[0])
    version_commit = args.version[0]

    user_module = importlib.import_module(
        "test_case." + (source_file.split(".")[0]).replace("/", "."),
        "test_case/" + source_file)
    spark = SparkSession \
        .builder \
        .appName("Python Arrow-in-Spark example") \
        .getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

    register_funcs(spark)
    all_time_info = {
        "version": version_commit.split("-")[0],
        "commit_id": version_commit.split("-")[-1],
        "func_name": user_module.func_name
    }

    data_df = spark.read.format("csv").option("header", False).option(
        "delimiter",
        "|").schema(user_module.schema).load(user_module.csv_path).cache()
    data_df.createOrReplaceTempView(user_module.func_name)

    if hasattr(user_module, "spark_test"):
        for times in range(run_times):
            time_info = {}
            begin_time = time.time()
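The harness above imports a test-case module by its file path and reads func_name (also used as the temp-view name), schema, and csv_path from it, optionally timing a spark_test callable. A hypothetical module satisfying that contract; the spark_test signature is an assumption, since the fragment ends before the call:

# Hypothetical test_case module exposing the attributes the harness reads.
# The CSV uses '|' as the delimiter with no header, matching the reader
# options above.
func_name = "st_point"
schema = "x double, y double"
csv_path = "/tmp/st_point_input.csv"

def spark_test(spark):
    # Assumed signature: the harness fragment ends before spark_test is invoked.
    spark.sql("select ST_Point(x, y) from st_point").collect()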
Example #25
def draw_china_weighted_point_map(spark):
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "continent string, country string, province string, provinceLocationId string, "
        "provinceCurrentConfirmedCount int , provinceConfirmedCount int, provinceSuspectedCount int,"
        "provinceCuredCount int, provinceDeadCount int, cityName string, longitude double, latitude double,"
        "cityLocationId string, cityCurrentConfirmedCount int, cityConfirmedCount int, citySuspectedCount int,"
        "cityCuredCount int, cityDeadCount int, updateTime timestamp").load(
            china_csv).cache()

    spark.catalog.dropTempView("COVID_china")

    df.createOrReplaceTempView("COVID_china")

    register_funcs(spark)

    # 1
    res1 = spark.sql(
        "select ST_Point(longitude, latitude) as point from COVID_china where ST_Within(ST_Point(longitude, latitude), 'POLYGON ((71.604264 17.258977, 137.319408 17.258977, 137.319408 53.808533, 71.604264 53.808533, 71.604264 17.258977))')"
    )
    res1.createOrReplaceTempView("res1")
    res1 = spark.sql("select * from res1 where point != 'POINT (nan nan)' ")

    vega1 = vega_weighted_pointmap(
        1024, 896, [71.604264, 17.258977, 137.319408, 53.808533], "#EEEEEE",
        [2, 60], [6], 1.0, "EPSG:4326")
    res_png1 = weighted_pointmap(res1, vega1)
    save_png(res_png1, './COVID_china_weighted_point_map1.png')

    # 2
    res2 = spark.sql(
        "select ST_Point(longitude, latitude) as point, provinceConfirmedCount as c from COVID_china "
        "where ST_Within(ST_Point(longitude, latitude), "
        "'POLYGON ((71.604264 17.258977, 137.319408 17.258977, 137.319408 53.808533,"
        " 71.604264 53.808533, 71.604264 17.258977))')")

    res2.createOrReplaceTempView("res2")
    res2 = spark.sql("select * from res2 where point != 'POINT (nan nan)' ")

    vega2 = vega_weighted_pointmap(
        1024, 896, [71.604264, 17.258977, 137.319408, 53.808533],
        "blue_to_red", [2, 1000], [6], 1.0, "EPSG:4326")

    res_png2 = weighted_pointmap(res2, vega2)
    save_png(res_png2, './COVID_china_weighted_point_map2.png')

    # 3
    res3 = spark.sql(
        "select ST_Point(longitude, latitude) as point, provinceConfirmedCount as c, "
        "provinceConfirmedCount as s from COVID_china "
        "where ST_Within(ST_Point(longitude, latitude), "
        "'POLYGON ((71.604264 17.258977, 137.319408 17.258977, 137.319408 53.808533,"
        " 71.604264 53.808533, 71.604264 17.258977))')")
    res3.createOrReplaceTempView("res3")
    res3 = spark.sql("select * from res3 where point != 'POINT (nan nan)' ")

    vega3 = vega_weighted_pointmap(
        3000, 2000, [71.604264, 17.258977, 137.319408, 53.808533],
        "blue_to_red", [2, 1000], [5, 1000], 1.0, "EPSG:4326")

    res_png3 = weighted_pointmap(res3, vega3)
    save_png(res_png3, './COVID_china_weighted_point_map3.png')
    spark.catalog.dropTempView("COVID_china")