def spark(self):
    if not hasattr(self, "__spark"):
        upload_jars()
        spark = SparkSession. \
            builder. \
            config("spark.serializer", KryoSerializer.getName). \
            config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName). \
            master("local[*]"). \
            getOrCreate()

        GeoSparkRegistrator.registerAll(spark)
        setattr(self, "__spark", spark)

    return getattr(self, "__spark")
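# Hedged alternative sketch (not from the original): the accessor above lazily builds one Spark session and
# re-uses it on later calls. The same "create once, reuse" behaviour can be had at module level with
# functools.lru_cache, assuming the same geospark imports as the listing above are in scope.
from functools import lru_cache

@lru_cache(maxsize=1)
def get_spark():
    upload_jars()
    spark = SparkSession. \
        builder. \
        config("spark.serializer", KryoSerializer.getName). \
        config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName). \
        master("local[*]"). \
        getOrCreate()
    GeoSparkRegistrator.registerAll(spark)
    return spark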
def main():
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    # theoretically only need to do this once
    upload_jars()

    spark = (SparkSession
             .builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.executor.cores", 1)
             .config("spark.cores.max", num_processors)
             .config("spark.driver.memory", "8g")
             .config("spark.driver.maxResultSize", "1g")
             .getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    # set Sedona spatial indexing and partitioning config in Spark session
    # (no effect on the "small" spatial join query in this script. Will improve bigger queries)
    spark.conf.set("geospark.global.index", "true")
    spark.conf.set("geospark.global.indextype", "rtree")
    spark.conf.set("geospark.join.gridtype", "kdbtree")

    logger.info("\t - PySpark {} session initiated: {}"
                .format(spark.sparkContext.version, datetime.now() - start_time))
    start_time = datetime.now()

    # load boundaries (geometries are Well Known Text strings)
    bdy_wkt_df = spark.read.parquet(os.path.join(input_path, "boundaries"))
    # bdy_wkt_df.printSchema()
    # bdy_wkt_df.show(5)

    # create view to enable SQL queries
    bdy_wkt_df.createOrReplaceTempView("bdy_wkt")

    # create geometries from WKT strings into new DataFrame
    # new DF will be spatially indexed automatically
    bdy_df = spark.sql("select bdy_id, st_geomFromWKT(wkt_geom) as geometry from bdy_wkt")

    # repartition and cache for performance (no effect on the "small" spatial join query here)
    # bdy_df.repartition(spark.sparkContext.defaultParallelism).cache().count()
    # bdy_df.printSchema()
    # bdy_df.show(5)

    # create view to enable SQL queries
    bdy_df.createOrReplaceTempView("bdy")

    logger.info("\t - Loaded and spatially enabled {:,} boundaries: {}"
                .format(bdy_df.count(), datetime.now() - start_time))
    start_time = datetime.now()

    # load points (spatial data is lat/long fields)
    point_wkt_df = spark.read.parquet(os.path.join(input_path, "points"))
    # point_wkt_df.printSchema()
    # point_wkt_df.show(5)

    # create view to enable SQL queries
    point_wkt_df.createOrReplaceTempView("point_wkt")

    # create geometries from lat/long fields into new DataFrame
    # new DF will be spatially indexed automatically
    sql = """select point_id,
                    st_point(cast(longitude as decimal(9, 6)),
                             cast(latitude as decimal(8, 6))) as geometry
             from point_wkt"""
    point_df = spark.sql(sql)

    # repartition and cache for performance (no effect on the "small" spatial join query here)
    # point_df.repartition(spark.sparkContext.defaultParallelism).cache().count()
    # point_df.printSchema()
    # point_df.show(5)

    # create view to enable SQL queries
    point_df.createOrReplaceTempView("pnt")

    logger.info("\t - Loaded and spatially enabled {:,} points: {}"
                .format(point_df.count(), datetime.now() - start_time))
    start_time = datetime.now()

    # run spatial join to boundary tag the points
    # notes:
    #   - spatial partitions and indexes for join will be created automatically
    #   - it's an inner join so point records could be lost
    sql = """SELECT pnt.point_id,
                    bdy.bdy_id,
                    pnt.geometry
             FROM pnt
             INNER JOIN bdy ON ST_Intersects(pnt.geometry, bdy.geometry)"""
    join_df = spark.sql(sql)
    # join_df.explain()

    # # output join DataFrame
    # join_df.write.option("compression", "gzip") \
    #     .mode("overwrite") \
    #     .parquet(os.path.join(input_path, "output"))

    num_joined_points = join_df.count()

    join_df.printSchema()
    join_df.show(5)

    logger.info("\t - {:,} points were boundary tagged: {}"
                .format(num_joined_points, datetime.now() - start_time))

    # cleanup
    spark.stop()
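# Hedged sketch (not in the original script): because the spatial join above is an INNER JOIN, points that
# fall outside every boundary are dropped from join_df. One way to keep them, before the cleanup step, is
# to left-join the tags back onto the full point set on point_id:
join_df.createOrReplaceTempView("tags")

all_points_df = spark.sql("""SELECT pnt.point_id,
                                    tags.bdy_id,
                                    pnt.geometry
                             FROM pnt
                             LEFT OUTER JOIN tags ON pnt.point_id = tags.point_id""")
all_points_df.show(5)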
def main():
    start_time = datetime.now()

    # copy gnaf tables to CSV
    pg_conn = psycopg2.connect(local_pg_connect_string)
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, state FROM gnaf_202008.{}
             ) TO STDOUT WITH CSV"""

    # sql = """COPY (
    #              SELECT gnaf_pid, street_locality_pid, locality_pid, alias_principal, primary_secondary, building_name,
    #                     lot_number, flat_number, level_number, number_first, number_last, street_name, street_type,
    #                     street_suffix, address, locality_name, postcode, state, locality_postcode, confidence,
    #                     legal_parcel_id, mb_2011_code, mb_2016_code, latitude, longitude, geocode_type, reliability
    #              FROM gnaf_202008.{}
    #          ) TO STDOUT WITH CSV"""

    # address principals
    with open(os.path.join(output_path, "gnaf_light.csv"), 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)
        # pg_cur.copy_expert(sql.format("address_principals") + " HEADER", csv_file)

    # address aliases
    with open(os.path.join(output_path, "gnaf_light.csv"), 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_conn.close()

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() - start_time))
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (SparkSession
             .builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.cores.max", cpu_count())
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}"
                .format(spark.sparkContext.version, datetime.now() - start_time))
    start_time = datetime.now()

    # load gnaf points
    df = spark.read \
        .option("header", True) \
        .option("inferSchema", True) \
        .csv(input_file_name)
    # df.printSchema()
    # df.show()

    # # manually assign field types (not needed here as inferSchema works)
    # df2 = (df
    #        .withColumn("confidence", df.confidence.cast(t.ShortType()))
    #        .withColumn("mb_2011_code", df.mb_2011_code.cast(t.LongType()))
    #        .withColumn("mb_2016_code", df.mb_2016_code.cast(t.LongType()))
    #        .withColumn("reliability", df.reliability.cast(t.ShortType()))
    #        .withColumn("longitude", df.longitude.cast(t.DoubleType()))
    #        .withColumn("latitude", df.latitude.cast(t.DoubleType()))
    #        )
    # # df2.printSchema()
    # # df2.show()

    # add point geometries and partition by longitude into 400-500k row partitions
    gnaf_df = df.withColumn("geom", f.expr("ST_Point(longitude, latitude)"))
    # .withColumnRenamed("gnaf_pid", "id")
    # .withColumn("partition_id",
    #             (f.percent_rank().over(Window.partitionBy().orderBy("longitude")) * f.lit(100.0))
    #             .cast(t.ShortType())) \
    # .repartitionByRange(100, "partition_id") \

    # gnaf_df.printSchema()

    # check partition counts
    gnaf_df.groupBy(f.spark_partition_id()).count().show()

    # write gnaf to gzipped parquet
    export_to_parquet(gnaf_df, "gnaf")

    # export PG boundary tables to parquet
    export_bdys(spark, "commonwealth_electorates", "ce_pid")
    export_bdys(spark, "local_government_areas", "lga_pid")
    export_bdys(spark, "local_government_wards", "ward_pid")
    export_bdys(spark, "state_lower_house_electorates", "se_lower_pid")
    export_bdys(spark, "state_upper_house_electorates", "se_upper_pid")

    # cleanup
    spark.stop()

    logger.info("\t - GNAF and boundaries exported to gzipped parquet files: {}"
                .format(datetime.now() - start_time))
import string, sys, re
import pandas as pd
import geopandas as gpd

from pyspark.sql.types import *
from pyspark.sql import SparkSession
from geospark.register import upload_jars
from geospark.register import GeoSparkRegistrator

# Create Spark Session
spark = SparkSession.builder.\
    appName("SparkSessionExample").\
    getOrCreate()

# Uses the findspark Python package to upload jar files to the executors and nodes.
upload_jars()

# Registers all GeoSparkSQL functions
GeoSparkRegistrator.registerAll(spark)

# Load matrix of coordinates and US county data into Spark and GeoPandas
original_matrix_df = spark.read.format("csv").option("header", "true").load("geospark_matrix.csv")
original_geo_df = gpd.read_file("cb_2018_us_county_500k/cb_2018_us_county_500k.shp")

# Map each Polygon in the geometry field of original_geo_df to WKT (well-known text) and load it into Spark as counties_df
wkts = map(lambda g: str(g.to_wkt()), original_geo_df.geometry)
original_geo_df['wkt'] = pd.Series(wkts)
original_geo_df = original_geo_df.drop("geometry", axis=1)
counties_df = spark.createDataFrame(original_geo_df)

# Use Spark SQL to create new column location with each location as ST_POINT
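# A possible next step (not part of the original listing): register the matrix DataFrame as a SQL view and
# build the location column with ST_Point, mirroring the decimal-cast pattern used elsewhere in these
# scripts. The "longitude"/"latitude" column names are assumptions - adjust them to the geospark_matrix.csv schema.
original_matrix_df.createOrReplaceTempView("matrix")

matrix_df = spark.sql("""select *,
                                st_point(cast(longitude as decimal(24, 20)),
                                         cast(latitude as decimal(24, 20))) as location
                         from matrix""")
matrix_df.createOrReplaceTempView("points")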
def main():
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (SparkSession
             .builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "12g")
             .getOrCreate())
    # .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    # .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    # .config("spark.sql.autoBroadcastJoinThreshold", -1)
    # .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # .config("spark.driver.maxResultSize", "1g")
    # .config("spark.executor.cores", 1)
    # .config("spark.executor.memory", "2g")

    # Register Apache Sedona (geospark) UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    logger.info("PySpark {} session initiated: {}"
                .format(spark.sparkContext.version, datetime.now() - start_time))
    logger.info("\t - Running on Python {}".format(sys.version.replace("\n", " ")))
    start_time = datetime.now()

    # # load gzip csv files
    # df = spark.read.csv(input_file_name)
    # # df = spark.read.csv(os.path.join(output_path, "testing"))
    # # df = spark.read.csv(os.path.join(output_path, "sydney"))
    # # df.printSchema()
    # # df.show()
    #
    # # # create small dataset to speed testing up
    # # testing_df = df.filter(f.col("_c0").isin(vehicle_id_list)).cache()
    # # print(testing_df.count())
    # # testing_df.repartition(1).write.option("compression", "gzip") \
    # #     .mode("overwrite") \
    # #     .csv(os.path.join(output_path, "testing"))
    #
    # # fix column types and names - for some unknown reason it's 3-4x faster than enforcing schema on load
    # df2 = (df.withColumnRenamed("_c0", "vehicle_id")
    #        .withColumn("longitude", df["_c1"].cast(t.DoubleType()))
    #        .withColumn("latitude", df["_c2"].cast(t.DoubleType()))
    #        .withColumn("speed", df["_c3"].cast(t.DoubleType()))
    #        .withColumn("bearing", df["_c4"].cast(t.DoubleType()))
    #        .withColumn("time_utc", df["_c5"].cast(t.TimestampType()))
    #        .withColumn("unix_time", df["_c6"].cast(t.IntegerType()))
    #        .withColumn("geom", f.expr("st_point(longitude, latitude)"))
    #        .drop("_c1")
    #        .drop("_c2")
    #        .drop("_c3")
    #        .drop("_c4")
    #        .drop("_c5")
    #        .drop("_c6")
    #        .repartition(f.to_date(f.col("time_utc")))
    #        )
    # # df2.printSchema()
    # # df2.show(10, False)
    #
    # df2.write.option("compression", "gzip") \
    #     .mode("overwrite") \
    #     .parquet(os.path.join(output_path, "step_1_schema_applied"))
    #
    # df.unpersist()
    # df2.unpersist()

    schema_df = spark.read.parquet(os.path.join(output_path, "step_1_schema_applied"))
    schema_df.createOrReplaceTempView("point")

    # # get counts
    # sql = """SELECT count(distinct vehicle_id) as unique_id_count,
    #                 count(*) as point_count
    #          FROM point"""
    # area_df = spark.sql(sql)
    # area_df.show()

    # logger.info("Step 1 : {} points loaded : {}".format(schema_df.count(), datetime.now() - start_time))
    # start_time = datetime.now()

    # --------------------------
    # output stuff
    # --------------------------

    # get_time_gap_stats(spark)
    export_trip_segments(spark)
    # export_small_area_data(spark)
    # export_single_id_data(spark)
    # export_trip_and_stop_data(spark)

    # --------------------------

    # cleanup
    spark.stop()
    pg_pool.closeall()
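# Hedged sketch only: export_trip_segments() isn't shown in this listing, so this is an illustrative guess
# at the general technique - split each vehicle's points into trips wherever the gap to the previous point
# exceeds a threshold. The 300 second cut-off and the function body are assumptions, not the original code.
from pyspark.sql import Window
import pyspark.sql.functions as f

def sketch_trip_segments(spark, max_gap_seconds=300):
    w = Window.partitionBy("vehicle_id").orderBy("unix_time")

    return (spark.table("point")
            .withColumn("prev_unix_time", f.lag("unix_time").over(w))
            .withColumn("new_trip",
                        (f.col("unix_time") - f.col("prev_unix_time") > max_gap_seconds).cast("int"))
            .fillna({"new_trip": 1})
            .withColumn("trip_num", f.sum("new_trip").over(w)))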
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # copy gnaf tables from Postgres to a CSV file - a one off
    #   - export required fields only and no header
    # ----------------------------------------------------------

    pg_conn = pg_pool.getconn()
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, locality_pid, locality_name, postcode, state
                 FROM gnaf_202008.{}
             ) TO STDOUT WITH CSV"""

    # address principals
    with open(gnaf_csv_file_path, 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)

    # append address aliases
    with open(gnaf_csv_file_path, 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_pool.putconn(pg_conn)

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() - start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    # upload Apache Sedona JARs
    upload_jars()

    spark = (SparkSession
             .builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Register Apache Sedona UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (no effect on the "small" spatial join query in this script. Will improve bigger queries)
    # spark.conf.set("geospark.global.index", "true")
    # spark.conf.set("geospark.global.indextype", "rtree")
    # spark.conf.set("geospark.join.gridtype", "kdbtree")

    sc = spark.sparkContext

    logger.info("\t - PySpark {} session initiated: {}".format(sc.version, datetime.now() - start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------

    offset = 0  # The point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path), offset,
                         FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t - GNAF RDD created: {}".format(datetime.now() - start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------

    for bdy in bdy_list:
        bdy_tag(spark, point_rdd, bdy)

    # point_rdd.unpersist()  # no such method on a SpatialRDD

    # ----------------------------------------------------------
    # merge boundary tag dataframes with GNAF records
    #   - required because spatial joins are INNER JOIN only,
    #     need to add untagged GNAF points
    # ----------------------------------------------------------

    start_time = datetime.now()

    # create gnaf dataframe and SQL view
    gnaf_df = spark.read \
        .option("header", False) \
        .option("inferSchema", True) \
        .csv(gnaf_csv_file_path) \
        .drop("_C0") \
        .drop("_C1") \
        .withColumnRenamed("_C2", "gnaf_pid") \
        .withColumnRenamed("_C3", "locality_pid") \
        .withColumnRenamed("_C4", "locality_name") \
        .withColumnRenamed("_C5", "postcode") \
        .withColumnRenamed("_C6", "state")
    # gnaf_df.printSchema()
    # gnaf_df.show(10, False)

    gnaf_df.createOrReplaceTempView("pnt")

    # add bdy tags, one bdy type at a time
    for bdy in bdy_list:
        gnaf_df = join_bdy_tags(spark, bdy)
        gnaf_df.createOrReplaceTempView("pnt")

    # # add point geoms for output to Postgres - in the PostGIS specific EWKT format
    # final_df = gnaf_df.withColumn("geom", f.expr("concat('SRID=4326;POINT (', longitude, ' ', latitude, ')')")) \
    #     .drop("longitude") \
    #     .drop("latitude")
    # # final_df.printSchema()
    # # final_df.show(10, False)

    logger.info("\t - Boundary tags merged: {}".format(datetime.now() - start_time))

    # output result to Postgres
    export_to_postgres(gnaf_df, "testing2.gnaf_with_bdy_tags",
                       os.path.join(output_path, "temp_gnaf_with_bdy_tags"), True)

    # cleanup
    spark.stop()

    # delete intermediate bdy tag files and GNAF csv file
    for bdy in bdy_list:
        shutil.rmtree(os.path.join(output_path, "gnaf_with_{}".format(bdy["name"])))

    os.remove(gnaf_csv_file_path)
def main():
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (SparkSession
             .builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (slowed down the "small" spatial join query in this script. Might improve bigger queries)
    # spark.conf.set("geospark.global.index", "true")
    # spark.conf.set("geospark.global.indextype", "rtree")
    # spark.conf.set("geospark.join.gridtype", "kdbtree")
    # spark.conf.set("geospark.join.numpartition", "-1")
    # spark.conf.set("geospark.join.indexbuildside", "right")
    # spark.conf.set("geospark.join.spatitionside", "right")

    logger.info("\t - PySpark {} session initiated: {}"
                .format(spark.sparkContext.version, datetime.now() - start_time))
    start_time = datetime.now()

    # # load gnaf points and create geoms
    # df = spark.read \
    #     .option("header", True) \
    #     .option("inferSchema", True) \
    #     .csv(input_file_name)
    #
    # point_df = df \
    #     .withColumn("geom", f.expr("ST_Point(longitude, latitude)")) \
    #     .cache()

    point_df = spark.read.parquet(os.path.join(output_path, "gnaf")) \
        .select("gnaf_pid", "state", "geom")
    # point_df = gnaf_df.select("gnaf_pid", "state", "geom")
    # point_df = gnaf_df.select("gnaf_pid", "state", "longitude", "latitude", "geom") \
    #     .repartitionByRange(100, "longitude")

    point_df.createOrReplaceTempView("pnt")

    logger.info("\t - Loaded {:,} GNAF points: {}".format(point_df.count(), datetime.now() - start_time))

    # boundary tag gnaf points
    bdy_tag(spark, "commonwealth_electorates", "ce_pid")
    point_df.unpersist()
    # tag_df.printSchema()

    point_df = spark.read.parquet(os.path.join(output_path, "gnaf_with_{}".format("commonwealth_electorates")))
    # point_df.createOrReplaceTempView("pnt")
    # bdy_tag(spark, "local_government_areas", "lga_pid")
    # tag_df2.printSchema()
    # point_df.unpersist()
    #
    # point_df = spark.read.parquet(os.path.join(output_path, "gnaf_with_{}".format("local_government_areas")))
    # point_df.createOrReplaceTempView("pnt")
    #
    # bdy_tag(spark, "local_government_wards", "ward_pid")
    # bdy_tag(spark, "state_lower_house_electorates", "se_lower_pid")
    # bdy_tag(spark, "state_upper_house_electorates", "se_upper_pid")
    #
    # bdy_ids = "ce_pid text, lga_pid text"
    #
    # final_df = point_df.withColumn("wkt_geom", f.expr("concat('SRID=4326;POINT (', st_x(geom), ' ', st_y(geom), ')')")) \
    #     .drop("geom")
    # # final_df.printSchema()
    #
    # # output to postgres, via CSV
    # table_name = "gnaf_with_bdy_tags"
    # export_to_postgres(final_df, "testing2.{}".format(table_name), bdy_ids, os.path.join(output_path, table_name))

    # cleanup
    spark.stop()
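# Hedged sketch only: bdy_tag() isn't defined in this listing. Judging by how its output is read back from
# "gnaf_with_<bdy_name>" parquet, it plausibly runs an ST_Intersects join between the "pnt" view and one
# boundary layer, then writes the tagged points out. The boundary parquet path and its "geometry" column
# name are assumptions.
def bdy_tag_sketch(spark, bdy_name, bdy_id):
    bdy_df = spark.read.parquet(os.path.join(output_path, bdy_name))
    bdy_df.createOrReplaceTempView("bdy")

    sql = """SELECT pnt.gnaf_pid, pnt.state, bdy.{}, pnt.geom
             FROM pnt
             INNER JOIN bdy ON ST_Intersects(bdy.geometry, pnt.geom)""".format(bdy_id)
    join_df = spark.sql(sql)

    join_df.write.option("compression", "gzip") \
        .mode("overwrite") \
        .parquet(os.path.join(output_path, "gnaf_with_{}".format(bdy_name)))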