def main():
    start_time = datetime.now()

    # get AWS creds (for running Spark outside of AWS EMR)
    session = boto3.Session()
    credentials = session.get_credentials()
    aws_access_key = credentials.access_key
    aws_secret_key = credentials.secret_key
    # aws_session_token = credentials.token

    spark = (SparkSession.builder
             .appName("gdelt_testing")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             # .config("spark.hadoop.fs.s3.fast.upload", "true")
             # .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.profile.ProfileCredentialsProvider")
             # .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
             .config("spark.hadoop.fs.s3a.access.key", aws_access_key)
             .config("spark.hadoop.fs.s3a.secret.key", aws_secret_key)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.driver.maxResultSize", "0")
             .getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    sc = spark.sparkContext

    logger.info("{} initiated on PySpark {} : {}".format(
        sc.applicationId, sc.version, datetime.now() - start_time))
    logger.info("\t - Running on Python {}".format(sys.version.replace("\n", " ")))
    start_time = datetime.now()

    # load day dataframe and get stats
    day_df = spark.read.option("inferSchema", "true").csv(input_day_path)
    day_df.printSchema()
    day_df.show(5)

    # release the dataframes' memory
    day_df.unpersist()
    day_df.printSchema()

    # month_df = spark.read.parquet(input_month_path)
    #
    # year_df = spark.read.parquet(input_year_path)

    # cleanup
    spark.stop()
def spark(self):
    if not hasattr(self, "__spark"):
        spark = SparkSession. \
            builder. \
            config("spark.serializer", KryoSerializer.getName). \
            config("spark.kryo.registrator", SedonaKryoRegistrator.getName). \
            master("local[*]"). \
            getOrCreate()
        SedonaRegistrator.registerAll(spark)
        setattr(self, "__spark", spark)

    return getattr(self, "__spark")
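# Hedged usage sketch for the lazy accessor above: it is typically exposed as a property on a
# test-fixture or base class so every test re-uses one Sedona-enabled SparkSession. The class
# name SedonaTestBase and the sample query are assumptions for illustration, and the Sedona
# JARs are assumed to already be on the classpath.
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer


class SedonaTestBase:
    @property
    def spark(self):
        # create the session once and cache it on the instance
        if not hasattr(self, "__spark"):
            session = SparkSession.builder \
                .config("spark.serializer", KryoSerializer.getName) \
                .config("spark.kryo.registrator", SedonaKryoRegistrator.getName) \
                .master("local[*]") \
                .getOrCreate()
            SedonaRegistrator.registerAll(session)
            setattr(self, "__spark", session)
        return getattr(self, "__spark")


if __name__ == "__main__":
    test = SedonaTestBase()
    # both calls return the same cached, Sedona-enabled session
    test.spark.sql("SELECT ST_Point(151.21, -33.87) AS geom").show()
    test.spark.sql("SELECT ST_Point(151.21, -33.87) AS geom").show()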
def register_iff_file_as_sql_table(self, filename, record_types=3, callsigns=None,
                                   chunksize=50000, encoding='latin-1', query_name=None):
    from sedona.register import SedonaRegistrator
    from pyspark.sql.types import IntegerType

    SedonaRegistrator.registerAll(self.sparkSession)

    iff_schema = self.iff_schema()
    df = self.sparkSession.read.csv(filename, header=False, sep=",", schema=iff_schema)

    cols = ['recType', 'recTime', 'callsign', 'latitude', 'longitude', 'altitude', 'heading']
    df = df.select(*cols) \
        .filter(df['recType'] == 3) \
        .withColumn("recTime", df['recTime'].cast(IntegerType()))

    if query_name is not None:
        df.registerTempTable(query_name)

    return df.toPandas()
def __init__(self):
    from pyspark.sql import SparkSession
    from sedona.register import SedonaRegistrator
    from sedona.utils import SedonaKryoRegistrator, KryoSerializer

    sparkSession = SparkSession. \
        builder. \
        master("local[*]"). \
        appName("Sector_IFF_Parser"). \
        config("spark.serializer", KryoSerializer.getName). \
        config("spark.kryo.registrator", SedonaKryoRegistrator.getName). \
        config('spark.jars.packages',
               'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.1.1-incubating,'
               'org.datasyslab:geotools-wrapper:1.1.0-25.2'). \
        getOrCreate()

    SedonaRegistrator.registerAll(sparkSession)
    self.sparkSession = sparkSession
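# Hedged usage sketch for the parser class above (the __init__ and register_iff_file_as_sql_table
# methods appear to belong to the same class). The class name SectorIFFParser, the input file name
# and the view name are assumptions for illustration only.
parser = SectorIFFParser()

# load type-3 track points and expose them as a temp view for Sedona SQL
tracks_pdf = parser.register_iff_file_as_sql_table("IFF_SCT_20200101.csv", query_name="iff_points")

# the temp view can then be queried with Sedona spatial functions
point_df = parser.sparkSession.sql(
    "SELECT callsign, ST_Point(longitude, latitude) AS geom FROM iff_points"
)
point_df.show(5)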
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("gnaf-loader export")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.jars.packages",
                     'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                     'org.datasyslab:geotools-wrapper:geotools-24.1')
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.executor.cores", 1)
             .config("spark.cores.max", num_processors)
             .config("spark.driver.memory", "8g")
             .config("spark.driver.maxResultSize", "1g")
             .getOrCreate())

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    print("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version, datetime.now() - start_time))
    start_time = datetime.now()

    # get row count
    df = spark.read.parquet(input_path)
    print("{} has {} rows : {}".format(input_path, df.count(), datetime.now() - start_time))

    spark.stop()
def create_spark_session():
    spark = (SparkSession.builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Register Apache Sedona UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (no effect on the "small" spatial join query in this script. May improve bigger queries)
    # spark.conf.set("sedona.global.index", "true")
    # spark.conf.set("sedona.global.indextype", "rtree")
    # spark.conf.set("sedona.join.gridtype", "kdbtree")

    return spark
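# Hedged usage sketch for create_spark_session() above: num_processors must be defined before the
# builder runs (deriving it from the local CPU count is an assumption here), and the returned
# session can run Sedona SQL directly. The sample query is illustrative only.
from multiprocessing import cpu_count

num_processors = cpu_count()

spark = create_spark_session()
spark.sql("SELECT ST_Distance(ST_Point(0.0, 0.0), ST_Point(3.0, 4.0)) AS dist").show()
spark.stop()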
def run_test(test_name, num_partitions, max_vertices):
    # create spark session object
    spark = (SparkSession.builder
             .master("local[*]")
             .appName("Spatial Join SQL Benchmark")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.jars.packages",
                     'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                     'org.datasyslab:geotools-wrapper:geotools-24.1')
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.executor.cores", 4)
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    # set Sedona spatial indexing and partitioning config in Spark session
    # (slowed down the "small" spatial join query in this script. Might improve bigger queries)
    spark.conf.set("sedona.global.index", "true")
    spark.conf.set("sedona.global.indextype", "rtree")
    spark.conf.set("sedona.join.gridtype", "kdbtree")
    spark.conf.set("sedona.join.numpartition", num_partitions)
    spark.conf.set("sedona.join.indexbuildside", "right")
    spark.conf.set("sedona.join.spatitionside", "right")

    start_time = datetime.now()

    # load gnaf points and create geoms
    point_df = (spark.read.parquet(os.path.join(input_path, "address_principals"))
                .select("gnaf_pid", "state", "geom")
                .withColumnRenamed("state", "gnaf_state")
                .repartition(num_partitions, "gnaf_state"))
    point_df.createOrReplaceTempView("pnt")

    # load boundaries and create geoms
    if max_vertices is not None:
        bdy_vertex_name = "{}_{}".format(bdy_name, max_vertices)
    else:
        bdy_vertex_name = bdy_name

    bdy_df = (spark.read.parquet(os.path.join(input_path, bdy_vertex_name))
              .select(bdy_id, "state", "geom")
              .repartition(num_partitions, "state")
              .cache())
    bdy_count = bdy_df.count()
    bdy_df.createOrReplaceTempView("bdy")

    # run spatial join to boundary tag the points
    sql = """SELECT pnt.gnaf_pid, bdy.{}, bdy.state
             FROM bdy, pnt
             WHERE ST_Intersects(bdy.geom, pnt.geom)""".format(bdy_id)
    join_df = spark.sql(sql)

    join_df2 = (join_df
                # .filter((join_df["state"] == join_df["gnaf_state"]))
                .dropDuplicates(["gnaf_pid", bdy_id])
                .cache())

    # output to files
    if "warmup" in test_name:
        name = "gnaf_sql_{}_{}_{}".format(bdy_id, max_vertices, num_partitions)
        (join_df2.repartition(50)
         .write.partitionBy("state")
         .option("compression", "gzip")
         .mode("overwrite")
         .parquet(os.path.join(output_path, name)))

    # output vars
    join_count = join_df2.count()
    time_taken = datetime.now() - start_time

    if "warmup" in test_name:
        print("{},{},{},{},{},{}".format(test_name, join_count, bdy_count,
                                         max_vertices, num_partitions, time_taken))
    else:
        log_file.write("{},{},{},{},{},{}\n".format(test_name, join_count, bdy_count,
                                                    max_vertices, num_partitions, time_taken))

    # cleanup
    spark.stop()
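# Hedged driver sketch for run_test() above: a benchmark harness would typically run a warmup pass
# (results printed) and then sweep partition counts and boundary vertex limits (results written to
# the log file used by run_test). The parameter values and the log file path are assumptions, and
# the module-level bdy_name, bdy_id, input_path and output_path globals are assumed to exist.
if __name__ == "__main__":
    log_file = open("benchmark_results.csv", "w")

    # warmup run - output is printed, not logged
    run_test("warmup", 96, 256)

    # benchmark runs - one CSV line appended per combination
    for partitions in (48, 96, 192):
        for vertices in (None, 256, 512):
            run_test("sql_join", partitions, vertices)

    log_file.close()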
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from tabulate import tabulate  # used to print the result table below

spark = (SparkSession.builder
         .master("local[*]")
         .appName("Point Construction, Transform, and Distance Test")
         .config("spark.serializer", KryoSerializer.getName)
         .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
         .config("spark.jars.packages",
                 "org.apache.sedona:sedona-python-adapter-2.4_2.11:1.0.0-incubating,"
                 "org.datasyslab:geotools-wrapper:geotools-24.0",
                 )
         .getOrCreate())

SedonaRegistrator.registerAll(spark)

temp = spark.sql("""
    select ST_Point(-117.105397, 33.17972) as p1,
           ST_Transform(ST_Point(-117.105397, 33.17972), 'epsg:4326', 'epsg:5071', false) as p1t,
           ST_Transform(ST_FlipCoordinates(ST_Point(-117.105397, 33.17972)),
                        'epsg:4326', 'epsg:5071', false) as p1_flip_t
""").toPandas()

print(tabulate(temp, headers=temp.columns, tablefmt="github", showindex=False))

# temp = spark.sql(
#     """
#     select
#         ST_Point(-117.105397, 33.17972) as p1,
#         ST_Point(-117.089177, 33.186309) as p2,
#         ST_Transform(ST_Point(-117.105397, 33.17972), 'epsg:4326', 'epsg:5071', false) as p1t,
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # copy gnaf tables from Postgres to a CSV file - a one off
    #   - export required fields only and no header
    # ----------------------------------------------------------

    pg_conn = pg_pool.getconn()
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, locality_pid, locality_name, postcode, state
                 FROM gnaf_202008.{}
             ) TO STDOUT WITH CSV"""

    # address principals
    with open(gnaf_csv_file_path, 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)

    # append address aliases
    with open(gnaf_csv_file_path, 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_pool.putconn(pg_conn)

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() - start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    # upload Apache Sedona JARs
    upload_jars()

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Register Apache Sedona UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (no effect on the "small" spatial join query in this script. Will improve bigger queries)
    # spark.conf.set("sedona.global.index", "true")
    # spark.conf.set("sedona.global.indextype", "rtree")
    # spark.conf.set("sedona.join.gridtype", "kdbtree")

    sc = spark.sparkContext

    logger.info("\t - PySpark {} session initiated: {}".format(sc.version, datetime.now() - start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------

    offset = 0  # The point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path), offset,
                         FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t - GNAF RDD created: {}".format(datetime.now() - start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------

    for bdy in bdy_list:
        bdy_tag(spark, point_rdd, bdy)

    # point_rdd.unpersist()  # no such method on a SpatialRDD

    # ----------------------------------------------------------
    # merge boundary tag dataframes with GNAF records
    #   - required because spatial joins are INNER JOIN only,
    #     need to add untagged GNAF points
    # ----------------------------------------------------------

    start_time = datetime.now()

    # create gnaf dataframe and SQL view
    gnaf_df = spark.read \
        .option("header", False) \
        .option("inferSchema", True) \
        .csv(gnaf_csv_file_path) \
        .drop("_C0") \
        .drop("_C1") \
        .withColumnRenamed("_C2", "gnaf_pid") \
        .withColumnRenamed("_C3", "locality_pid") \
        .withColumnRenamed("_C4", "locality_name") \
        .withColumnRenamed("_C5", "postcode") \
        .withColumnRenamed("_C6", "state")
    # gnaf_df.printSchema()
    # gnaf_df.show(10, False)

    gnaf_df.createOrReplaceTempView("pnt")

    # add bdy tags, one bdy type at a time
    for bdy in bdy_list:
        gnaf_df = join_bdy_tags(spark, bdy)
        gnaf_df.createOrReplaceTempView("pnt")

    # # add point geoms for output to Postgres - in the PostGIS specific EWKT format
    # final_df = gnaf_df.withColumn("geom", f.expr("concat('SRID=4326;POINT (', longitude, ' ', latitude, ')')")) \
    #     .drop("longitude") \
    #     .drop("latitude")
    # # final_df.printSchema()
    # # final_df.show(10, False)

    logger.info("\t - Boundary tags merged: {}".format(datetime.now() - start_time))

    # output result to Postgres
    export_to_postgres(gnaf_df, "testing2.gnaf_with_bdy_tags",
                       os.path.join(output_path, "temp_gnaf_with_bdy_tags"), True)

    # cleanup
    spark.stop()

    # delete intermediate bdy tag files and GNAF csv file
    for bdy in bdy_list:
        shutil.rmtree(os.path.join(output_path, "gnaf_with_{}".format(bdy["name"])))

    os.remove(gnaf_csv_file_path)
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    # upload Apache Sedona JARs
    upload_jars()

    spark = (SparkSession
             .builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate()
             )
    # .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    # Register Apache Sedona UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version, datetime.now() - start_time))

    # get list of tables to export to S3
    pg_conn = psycopg2.connect(pg_connect_string)
    pg_cur = pg_conn.cursor()

    # --------------------------------------------------------------
    # import each table from each schema in Postgres &
    # export to GZIPped Parquet files in AWS S3
    # --------------------------------------------------------------

    start_time = datetime.now()

    # check what type of geometry field the table has and what its coordinate system is
    sql = """select f_geometry_column, type, srid
             from public.geometry_columns
             where f_table_schema = '{}'
                 and f_table_name = '{}'""".format(schema_name, table_name)
    pg_cur.execute(sql)
    result = pg_cur.fetchone()

    if result is not None:
        geom_field = result[0]
        geom_type = result[1]
        geom_srid = result[2]
    else:
        geom_field = None
        geom_type = None
        geom_srid = None

    # build geom field sql
    # note: exported geom field will be WGS84 (EPSG:4326) Well Known Text (WKT)
    if geom_field is not None:
        if "POLYGON" in geom_type or "LINESTRING" in geom_type:
            geom_sql = ",ST_AsText(ST_Subdivide((ST_Dump(ST_Buffer(geom, 0.0))).geom, 512)) as wkt_geom"
        else:
            geom_sql = ",ST_AsText(geom) as wkt_geom"

        # transform geom to WGS84 if required
        if geom_srid != 4326:
            geom_sql = geom_sql.replace("(geom", "(ST_Transform(geom, 4326)")
    else:
        geom_sql = ""

    # build query to select all columns and the WKT geom if it exists
    sql = """SELECT 'SELECT ' || array_to_string(ARRAY(
                 SELECT column_name
                 FROM information_schema.columns
                 WHERE table_name = '{1}'
                     AND table_schema = '{0}'
                     AND column_name NOT IN('geom')
             ), ',') || '{2} ' || 'FROM {0}.{1}' AS sqlstmt"""\
        .format(schema_name, table_name, geom_sql)
    pg_cur.execute(sql)
    query = str(pg_cur.fetchone()[0])  # str is just there for intellisense in Pycharm

    # add filter to query
    query = query + " WHERE tzid LIKE 'Australia%'"
    # print(query)

    bdy_df = import_bdys(spark, query)

    export_to_parquet(bdy_df, table_name)
    copy_to_s3(schema_name, table_name)

    bdy_df.unpersist()

    logger.info("\t\t exported {} : {}".format(table_name, datetime.now() - start_time))

    # cleanup
    pg_cur.close()
    pg_conn.close()
    spark.stop()
def main():
    start_time = datetime.now()

    # create spark session object
    spark = (SparkSession.builder
             .master("local[*]")
             .appName("Spatial Join Test")
             # .config("spark.sql.session.timeZone", "UTC")
             # .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             # .config("spark.jars.packages",
             #         'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating,'
             #         'org.datasyslab:geotools-wrapper:geotools-24.0')
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.executor.cores", 1)
             .config("spark.cores.max", num_processors)
             .config("spark.driver.memory", "12g")
             # .config("spark.driver.maxResultSize", "2g")
             .getOrCreate())

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (slowed down the "small" spatial join query in this script. Might improve bigger queries)
    # spark.conf.set("sedona.global.index", "true")
    # spark.conf.set("sedona.global.indextype", "rtree")
    # spark.conf.set("sedona.join.gridtype", "kdbtree")
    # spark.conf.set("sedona.join.numpartition", "-1")
    # spark.conf.set("sedona.join.indexbuildside", "right")
    # spark.conf.set("sedona.join.spatitionside", "right")

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version, datetime.now() - start_time))
    start_time = datetime.now()

    # # load gnaf points and create geoms
    # df = spark.read \
    #     .option("header", True) \
    #     .option("inferSchema", True) \
    #     .csv(input_file_name)
    #
    # point_df = df \
    #     .withColumn("geom", f.expr("ST_Point(longitude, latitude)")) \
    #     .cache()

    point_df = spark.read.parquet(os.path.join(input_path, "address_principals")) \
        .select("gnaf_pid", "state", f.expr("ST_GeomFromWKT(wkt_geom)").alias("geom")) \
        .repartition(192, "state")
    # point_df.printSchema()
    # point_df.show()

    point_df.createOrReplaceTempView("pnt")

    logger.info("\t - Loaded {:,} GNAF points: {}".format(
        point_df.count(), datetime.now() - start_time))

    # boundary tag gnaf points
    bdy_tag(spark, "commonwealth_electorates_analysis", "ce_pid", 9)
    # point_df.unpersist()
    # tag_df.printSchema()

    # point_df = spark.read.parquet(os.path.join(input_path, "gnaf_with_{}".format("commonwealth_electorates")))
    # point_df.createOrReplaceTempView("pnt")

    # bdy_tag(spark, "local_government_areas", "lga_pid")
    # tag_df2.printSchema()
    # point_df.unpersist()
    #
    # point_df = spark.read.parquet(os.path.join(input_path, "gnaf_with_{}".format("local_government_areas")))
    # point_df.createOrReplaceTempView("pnt")
    #
    # bdy_tag(spark, "local_government_wards", "ward_pid")
    # bdy_tag(spark, "state_lower_house_electorates", "se_lower_pid")
    # bdy_tag(spark, "state_upper_house_electorates", "se_upper_pid")
    #
    # bdy_ids = "ce_pid text, lga_pid text"
    #
    # final_df = point_df.withColumn("wkt_geom",
    #                                f.expr("concat('SRID=4326;POINT (', st_x(geom), ' ', st_y(geom), ')')")) \
    #     .drop("geom")
    # final_df.printSchema()
    #
    # # output to postgres, via CSV
    # table_name = "gnaf_with_bdy_tags"
    # export_to_postgres(final_df, "testing2.{}".format(table_name), bdy_ids, os.path.join(output_path, table_name))

    # cleanup
    spark.stop()
def run_test(test_name, num_partitions, max_vertices):
    # create spark session object
    spark = (SparkSession.builder
             .master("local[*]")
             .appName("Spatial Join SQL Benchmark")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.jars.packages",
                     'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                     'org.datasyslab:geotools-wrapper:geotools-24.1')
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.executor.cores", 4)
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    start_time = datetime.now()

    # load gnaf points and create geoms
    point_df = (spark.read.parquet(os.path.join(input_path, "address_principals"))
                .select("gnaf_pid", "state", "geom")
                .withColumnRenamed("state", "gnaf_state")
                .repartition(num_partitions, "gnaf_state"))

    # load boundaries and create geoms
    if max_vertices is not None:
        bdy_vertex_name = "{}_{}".format(bdy_name, max_vertices)
    else:
        bdy_vertex_name = bdy_name

    bdy_df = (spark.read.parquet(os.path.join(input_path, bdy_vertex_name))
              .select(bdy_id, "state", "geom")
              .repartition(num_partitions, "state")
              .cache())
    bdy_count = bdy_df.count()

    # create RDDs - analysed, partitioned and indexed
    point_rdd = Adapter.toSpatialRdd(point_df, "geom")
    bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")

    point_df.unpersist()
    bdy_df.unpersist()

    point_rdd.analyze()
    bdy_rdd.analyze()

    point_rdd.spatialPartitioning(GridType.KDBTREE)
    bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())

    point_rdd.buildIndex(IndexType.RTREE, True)
    bdy_rdd.buildIndex(IndexType.RTREE, True)

    # run join query
    join_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(point_rdd, bdy_rdd, True, True)

    # convert SedonaPairRDD to dataframe
    join_df = Adapter.toDf(join_pair_rdd, bdy_rdd.fieldNames, point_rdd.fieldNames, spark)

    # join_df.printSchema()
    # |-- leftgeometry: geometry (nullable = true)
    # |-- <bdy_id>: string (nullable = true)
    # |-- state: string (nullable = true)
    # |-- rightgeometry: geometry (nullable = true)
    # |-- gnaf_pid: string (nullable = true)
    # |-- gnaf_state: string (nullable = true)

    join_df2 = (join_df
                # .filter((join_df["state"] == join_df["gnaf_state"]))
                .select("gnaf_pid", bdy_id, "state")
                .dropDuplicates(["gnaf_pid", bdy_id])
                .cache())

    # output to files
    if "warmup" in test_name:
        name = "gnaf_rdd_{}_{}_{}".format(bdy_id, max_vertices, num_partitions)
        (join_df2.repartition(50)
         .write.partitionBy("state")
         .option("compression", "gzip")
         .mode("overwrite")
         .parquet(os.path.join(output_path, name)))

    # output vars
    join_count = join_df2.count()
    time_taken = datetime.now() - start_time

    if "warmup" in test_name:
        print("{},{},{},{},{},{}".format(test_name, join_count, bdy_count,
                                         max_vertices, num_partitions, time_taken))
    else:
        log_file.write("{},{},{},{},{},{}\n".format(test_name, join_count, bdy_count,
                                                    max_vertices, num_partitions, time_taken))

    # cleanup
    spark.stop()
def main():
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (SparkSession.builder
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.hadoop.fs.s3.fast.upload", "true")
             .config("spark.sql.adaptive.enabled", "true")  # TODO: does this split one ID into 2 or more partitions?
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    sc = spark.sparkContext
    sc.setCheckpointDir("hdfs:///checkpoints")

    logger.info("{} initiated on PySpark {} : {}".format(
        sc.applicationId, sc.version, datetime.now() - start_time))
    logger.info("\t - Running on Python {}".format(sys.version.replace("\n", " ")))
    start_time = datetime.now()

    # -----------------------------------------------------------------------------------------
    # 1. vehicle point counts with min/max times
    # -----------------------------------------------------------------------------------------

    logger.info("1. Counts per ID")

    df = spark.read.parquet(point_source_s3_path)
    logger.info("\t - {:,} points".format(df.count()))

    count_df = df.groupBy("uid", "src") \
        .agg(f.count("*").alias("point_count"),
             f.min("time_utc").alias("min_time_utc"),
             f.max("time_utc").alias("max_time_utc")
             ) \
        .withColumn("total_hours",
                    f.round((f.col("max_time_utc").cast(t.LongType())
                             - f.col("min_time_utc").cast(t.LongType())).cast(t.DoubleType()) / 3600.0, 1))

    logger.info("\t - {:,} unique IDs".format(count_df.count()))

    # -----------------------------------------------------------------------------------------
    # add days active
    # -----------------------------------------------------------------------------------------

    days_df = df.groupBy("uid", f.to_date("time_utc").alias("date_utc")) \
        .agg(f.count("*").alias("point_count"))  # only required to convert GroupedData object into a DataFrame

    logger.info("\t - {:,} ID days of data".format(days_df.count()))

    days_df2 = days_df.groupBy("uid") \
        .agg(f.count("*").alias("active_day_count"))
    # days_df2.printSchema()
    # days_df2.show(20, False)

    # add data to counts
    count_df2 = count_df.join(days_df2, "uid", "inner")

    # -----------------------------------------------------------------------------------------
    # add trip counts
    # -----------------------------------------------------------------------------------------

    trip_df = spark.read.parquet(trip_source_s3_path)
    logger.info("\t - {:,} trips".format(trip_df.count()))

    trip_df2 = (trip_df.groupBy("uid")
                .agg(f.count("*").alias("trip_count"),
                     f.sum("point_count").alias("trip_point_count"),
                     f.sum(f.round("distance_km", 3)).alias("trip_distance_km"),
                     f.sum("duration_s").alias("temp_duration_s"),
                     )
                .withColumn("trip_hours",
                            f.round(f.col("temp_duration_s").cast(t.DoubleType()) / 3600.0, 1))
                .withColumn("trip_avg_seconds_per_point",
                            f.round(f.when(f.col("temp_duration_s") > 0,
                                           f.col("temp_duration_s")
                                           / f.col("trip_point_count").cast(t.DoubleType()))
                                    .otherwise(None), 1))
                .withColumn("trip_avg_speed",
                            f.round(f.when(f.col("trip_distance_km") > 0,
                                           f.col("trip_distance_km") / f.col("trip_hours"))
                                    .otherwise(None), 1))
                .drop("temp_duration_s"))
    # trip_df2.printSchema()
    # trip_df2.show(10, False)

    logger.info("\t - {:,} IDs with trips".format(trip_df2.count()))

    # add trip data to counts
    count_df3 = count_df2.join(trip_df2, "uid", "left")

    # -----------------------------------------------------------------------------------------
    # add stop counts
    # -----------------------------------------------------------------------------------------

    stop_df = spark.read.parquet(stop_source_s3_path)
    logger.info("\t - {:,} stops".format(stop_df.count()))

    stop_df2 = (stop_df.groupBy("uid")
                .agg(f.count("*").alias("stop_count"),
                     f.sum("point_count").alias("stop_point_count"),
                     f.sum(f.round("distance_km", 3)).alias("stop_distance_km"),
                     f.sum("duration_s").alias("temp_duration_s"),
                     )
                .withColumn("stop_hours",
                            f.round(f.col("temp_duration_s").cast(t.DoubleType()) / 3600.0, 1))
                .withColumn("stop_avg_seconds_per_point",
                            f.round(f.when(f.col("temp_duration_s") > 0,
                                           f.col("temp_duration_s")
                                           / f.col("stop_point_count").cast(t.DoubleType()))
                                    .otherwise(None), 1))
                .withColumn("stop_avg_speed",
                            f.round(f.when(f.col("stop_distance_km") > 0,
                                           f.col("stop_distance_km") / f.col("stop_hours"))
                                    .otherwise(None), 1))
                .drop("temp_duration_s"))
    # stop_df2.printSchema()
    # stop_df2.show(10, False)

    logger.info("\t - {:,} IDs with stops".format(stop_df2.count()))

    # add stop data to counts
    id_count_df = count_df3.join(stop_df2, "uid", "left") \
        .checkpoint()

    # -----------------------------------------------------------------------------------------
    # output counts
    # -----------------------------------------------------------------------------------------

    (id_count_df.repartition(1)
     .write.option("compression", "gzip")
     .option("header", "true")
     .option("nullValue", None)
     .mode("overwrite")
     .csv(os.path.join(target_s3_path, "counts")))

    # get IDs that have more than a fleeting amount of data
    # useful_id_df = id_count_df.filter()

    # id_count_df.unpersist()
    stop_df2.unpersist()
    count_df3.unpersist()
    trip_df2.unpersist()
    count_df2.unpersist()
    days_df2.unpersist()
    count_df.unpersist()
    df.unpersist()

    # -----------------------------------------------------------------------------------------
    # 2. counts of GPS refresh rates
    # -----------------------------------------------------------------------------------------

    logger.info("2. GPS refresh rates")

    final_point_df = spark.read.parquet(final_point_source_s3_path)
    final_point_df.createOrReplaceTempView("pnt")

    sql = """SELECT src,
                    count(*) as point_count,
                    SUM(CASE WHEN next_interval <= 5 THEN 1 ELSE 0 END) as _05_seconds,
                    SUM(CASE WHEN next_interval <= 10 THEN 1 ELSE 0 END) as _10_seconds,
                    SUM(CASE WHEN next_interval <= 15 THEN 1 ELSE 0 END) as _15_seconds,
                    SUM(CASE WHEN next_interval <= 20 THEN 1 ELSE 0 END) as _20_seconds,
                    SUM(CASE WHEN next_interval <= 30 THEN 1 ELSE 0 END) as _30_seconds,
                    SUM(CASE WHEN next_interval <= 45 THEN 1 ELSE 0 END) as _45_seconds,
                    SUM(CASE WHEN next_interval <= 60 THEN 1 ELSE 0 END) as _60_seconds,
                    SUM(CASE WHEN next_interval <= 120 THEN 1 ELSE 0 END) as _120_seconds,
                    SUM(CASE WHEN next_interval > 120 THEN 1 ELSE 0 END) as _over_120_seconds
             FROM pnt
             GROUP BY src"""
    refresh_rate_df = spark.sql(sql)
    refresh_rate_df.orderBy("src").show(20, False)

    refresh_rate_df.unpersist()
    final_point_df.unpersist()

    # -----------------------------------------------------------------------------------------
    # 3. trips per day
    # -----------------------------------------------------------------------------------------

    logger.info("3. Trips & Stops per day")

    trip_count_df = (trip_df.withColumn("date_local", f.col("start_time_local").cast(t.DateType()))
                     .groupBy("src", f.col("date_local"))
                     .agg(f.count("*").alias("trip_count"),
                          # f.count("distinct uid").alias("trip_id_count_df"),
                          f.sum("point_count").alias("trip_point_count"),
                          f.sum("distance_km").alias("trip_distance_km"),
                          f.sum("duration_s").alias("temp_duration_s"),
                          )
                     .withColumn("trip_hours",
                                 f.round(f.col("temp_duration_s").cast(t.DoubleType()) / 3600.0, 1))
                     .withColumn("trip_avg_seconds_per_point",
                                 f.round(f.when(f.col("temp_duration_s") > 0,
                                                f.col("temp_duration_s")
                                                / f.col("trip_point_count").cast(t.DoubleType()))
                                         .otherwise(None), 1))
                     .withColumn("trip_avg_speed",
                                 f.round(f.when(f.col("trip_distance_km") > 0,
                                                f.col("trip_distance_km") / f.col("trip_hours"))
                                         .otherwise(None), 1))
                     .withColumn("trip_distance_km", f.round(f.col("trip_distance_km"), 3))
                     .drop("temp_duration_s"))

    trip_df.unpersist()

    logger.info("\t - {:,} days of trips".format(trip_count_df.count()))

    stop_count_df = (stop_df.withColumn("date_local", f.col("start_time_local").cast(t.DateType()))
                     .groupBy("src", f.col("date_local"))
                     .agg(f.count("*").alias("stop_count"),
                          # f.count("distinct uid").alias("stop_id_count_df"),
                          f.sum("point_count").alias("stop_point_count"),
                          f.sum("distance_km").alias("stop_distance_km"),
                          f.sum("duration_s").alias("temp_duration_s"),
                          )
                     .withColumn("stop_hours",
                                 f.round(f.col("temp_duration_s").cast(t.DoubleType()) / 3600.0, 1))
                     .withColumn("stop_avg_seconds_per_point",
                                 f.round(f.when(f.col("temp_duration_s") > 0,
                                                f.col("temp_duration_s")
                                                / f.col("stop_point_count").cast(t.DoubleType()))
                                         .otherwise(None), 1))
                     .withColumn("stop_avg_speed",
                                 f.round(f.when(f.col("stop_distance_km") > 0,
                                                f.col("stop_distance_km") / f.col("stop_hours"))
                                         .otherwise(None), 1))
                     .withColumn("stop_distance_km", f.round(f.col("stop_distance_km"), 3))
                     .drop("temp_duration_s"))

    stop_df.unpersist()

    logger.info("\t - {:,} days of stops".format(stop_count_df.count()))

    # combine trip and stop counts
    trip_stop_count = trip_count_df.alias("left") \
        .join(stop_count_df.alias("right"), ["src", "date_local"], "full")
    # trip_stop_count.printSchema()
    # trip_stop_count.show(20, False)

    (trip_stop_count.repartition(1)
     .write.option("compression", "gzip")
     .option("header", "true")
     .option("nullValue", None)
     .mode("overwrite")
     .csv(os.path.join(target_s3_path, "daily_trip_stop_counts")))

    # cleanup
    spark.stop()
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("gnaf-loader export")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.jars.packages",
                     'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                     'org.datasyslab:geotools-wrapper:geotools-24.1')
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.executor.cores", 1)
             .config("spark.cores.max", num_processors)
             .config("spark.driver.memory", "8g")
             .config("spark.driver.maxResultSize", "1g")
             .getOrCreate())

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version, datetime.now() - start_time))

    # get list of tables to export to S3
    pg_conn = psycopg2.connect(pg_connect_string)
    pg_cur = pg_conn.cursor()

    # --------------------------------------------------------------
    # import each table from each schema in Postgres &
    # export to GZIPped Parquet files in AWS S3
    # --------------------------------------------------------------

    for schema_name in schema_names:
        i = 1

        # get table list for schema
        sql = """SELECT table_name
                 FROM information_schema.tables
                 WHERE table_schema='{}'
                     AND table_type='BASE TABLE'
                     AND table_name <> 'qa'
                     AND table_name NOT LIKE '%_2011_%'
                     AND table_name NOT LIKE '%_analysis%'
                     AND table_name NOT LIKE '%_display%'""".format(schema_name)
        pg_cur.execute(sql)
        tables = pg_cur.fetchall()

        logger.info("\t - {} schema : {} tables to export : {}".format(
            schema_name, len(tables), datetime.now() - start_time))

        for table in tables:
            start_time = datetime.now()
            table_name = table[0]

            # check what type of geometry field the table has and what its coordinate system is
            sql = """SELECT f_geometry_column, type, srid
                     FROM public.geometry_columns
                     WHERE f_table_schema = '{}'
                         AND f_table_name = '{}'""".format(schema_name, table_name)
            pg_cur.execute(sql)
            result = pg_cur.fetchone()

            if result is not None:
                geom_field = result[0]
                geom_type = result[1]
                geom_srid = result[2]
            else:
                geom_field = None
                geom_type = None
                geom_srid = None

            # build geom field sql
            # note: exported geom field will be WGS84 (EPSG:4326) Well Known Text (WKT)
            if geom_field is not None:
                if "POLYGON" in geom_type or "LINESTRING" in geom_type:
                    geom_sql = ",ST_AsText(ST_Subdivide((ST_Dump(ST_Buffer(geom, 0.0))).geom, 256)) as wkt_geom"
                else:
                    geom_sql = ",ST_AsText(geom) as wkt_geom"

                # transform geom to WGS84 if required
                if geom_srid != 4326:
                    geom_sql = geom_sql.replace("(geom", "(ST_Transform(geom, 4326)")
            else:
                geom_sql = ""

            # build query to select all columns and the WKT geom if it exists
            sql = """SELECT 'SELECT ' || array_to_string(ARRAY(
                         SELECT column_name
                         FROM information_schema.columns
                         WHERE table_name = '{1}'
                             AND table_schema = '{0}'
                             AND column_name NOT IN('geom')
                     ), ',') || '{2} ' || 'FROM {0}.{1}' AS sqlstmt"""\
                .format(schema_name, table_name, geom_sql)
            pg_cur.execute(sql)
            query = str(pg_cur.fetchone()[0])  # str is just there for intellisense in Pycharm

            # get min and max gid values to enable parallel import from Postgres to Spark
            # add gid field based on row number if missing
            if "gid," in query:
                sql = """SELECT min(gid), max(gid) FROM {}.{}""".format(schema_name, table_name)
                pg_cur.execute(sql)
                gid_range = pg_cur.fetchone()
                min_gid = gid_range[0]
                max_gid = gid_range[1]
            else:
                # get row count as the max gid value
                sql = """SELECT count(*) FROM {}.{}""".format(schema_name, table_name)
                pg_cur.execute(sql)
                min_gid = 1
                max_gid = pg_cur.fetchone()[0]

                # add gid field to query
                query = query.replace("SELECT ", "SELECT row_number() OVER () AS gid,")

            # check table has records
            if max_gid is not None and max_gid > min_gid:
                bdy_df = import_table(spark, query, min_gid, max_gid, 100000)
                # bdy_df.printSchema()

                # # add geometry column if required
                # if geom_sql != "":
                #     export_df = bdy_df.withColumn("geom", f.expr("ST_GeomFromWKT(wkt_geom)")) \
                #         .drop("wkt_geom")
                # else:
                #     export_df = bdy_df
                #
                # export_to_parquet(export_df, table_name)

                export_to_parquet(bdy_df, table_name)
                # copy_to_s3(schema_name, table_name)

                bdy_df.unpersist()

                logger.info("\t\t {}. exported {} : {}".format(
                    i, table_name, datetime.now() - start_time))
            else:
                logger.warning("\t\t {}. {} has no records! : {}".format(
                    i, table_name, datetime.now() - start_time))

            i += 1

    # cleanup
    pg_cur.close()
    pg_conn.close()
    spark.stop()
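# The import_table() helper called above is not shown in this snippet. A plausible sketch, assuming
# it wraps Spark's partitioned JDBC reader over the gid column and that the 100000 argument is the
# target number of rows per partition; jdbc_url, pg_user and pg_password are placeholders.
def import_table(spark, sql, min_gid, max_gid, partition_size):
    # derive the number of JDBC partitions from the gid range
    num_partitions = max(1, int((max_gid - min_gid) / partition_size))

    return (spark.read.format("jdbc")
            .option("url", jdbc_url)  # e.g. "jdbc:postgresql://localhost:5432/geo"
            .option("dbtable", "({}) AS qry".format(sql))
            .option("user", pg_user)
            .option("password", pg_password)
            .option("partitionColumn", "gid")
            .option("lowerBound", min_gid)
            .option("upperBound", max_gid)
            .option("numPartitions", num_partitions)
            .load())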
def main():
    start_time = datetime.now()

    # copy gnaf tables to CSV
    pg_conn = psycopg2.connect(local_pg_connect_string)
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, state
                 FROM gnaf_202011.{}
             ) TO STDOUT WITH CSV"""

    # sql = """COPY (
    #              SELECT gnaf_pid, street_locality_pid, locality_pid, alias_principal, primary_secondary, building_name,
    #                  lot_number, flat_number, level_number, number_first, number_last, street_name, street_type,
    #                  street_suffix, address, locality_name, postcode, state, locality_postcode, confidence,
    #                  legal_parcel_id, mb_2011_code, mb_2016_code, latitude, longitude, geocode_type, reliability
    #              FROM gnaf_202011.{}
    #          ) TO STDOUT WITH CSV"""

    # address principals
    with open(os.path.join(output_path, "gnaf_light.csv"), 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)
        # pg_cur.copy_expert(sql.format("address_principals") + " HEADER", csv_file)

    # address aliases
    with open(os.path.join(output_path, "gnaf_light.csv"), 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_conn.close()

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() - start_time))
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.cores.max", cpu_count())
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version, datetime.now() - start_time))
    start_time = datetime.now()

    # load gnaf points
    df = spark.read \
        .option("header", True) \
        .option("inferSchema", True) \
        .csv(input_file_name)
    # df.printSchema()
    # df.show()

    # # manually assign field types (not needed here as inferSchema works)
    # df2 = (df
    #        .withColumn("confidence", df.confidence.cast(t.ShortType()))
    #        .withColumn("mb_2011_code", df.mb_2011_code.cast(t.LongType()))
    #        .withColumn("mb_2016_code", df.mb_2016_code.cast(t.LongType()))
    #        .withColumn("reliability", df.reliability.cast(t.ShortType()))
    #        .withColumn("longitude", df.longitude.cast(t.DoubleType()))
    #        .withColumn("latitude", df.latitude.cast(t.DoubleType()))
    #        )
    # # df2.printSchema()
    # # df2.show()

    # add point geometries and partition by longitude into 400-500k row partitions
    gnaf_df = df.withColumn("geom", f.expr("ST_Point(longitude, latitude)"))
    # .withColumnRenamed("gnaf_pid", "id")
    # .withColumn("partition_id", (f.percent_rank().over(Window.partitionBy().orderBy("longitude")) * f.lit(100.0))
    #             .cast(t.ShortType())) \
    # .repartitionByRange(100, "partition_id") \
    # gnaf_df.printSchema()

    # check partition counts
    gnaf_df.groupBy(f.spark_partition_id()).count().show()

    # write gnaf to gzipped parquet
    export_to_parquet(gnaf_df, "gnaf")

    # export PG boundary tables to parquet
    export_bdys(spark, "commonwealth_electorates", "ce_pid")
    export_bdys(spark, "local_government_areas", "lga_pid")
    export_bdys(spark, "local_government_wards", "ward_pid")
    export_bdys(spark, "state_lower_house_electorates", "se_lower_pid")
    export_bdys(spark, "state_upper_house_electorates", "se_upper_pid")

    # cleanup
    spark.stop()

    logger.info("\t - GNAF and boundaries exported to gzipped parquet files: {}"
                .format(datetime.now() - start_time))
def main():
    start_time = datetime.now()

    # create spark session object
    spark = (SparkSession.builder
             .master("local[*]")
             .appName("Spatial Join Test")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.jars.packages",
                     'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                     'org.datasyslab:geotools-wrapper:geotools-24.1')
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.executor.cores", 1)
             .config("spark.cores.max", num_processors)
             .config("spark.driver.memory", "8g")
             .config("spark.driver.maxResultSize", "1g")
             .getOrCreate())

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version, datetime.now() - start_time))
    start_time = datetime.now()

    # load boundaries (geometries are Well Known Text strings)
    bdy_wkt_df = spark.read.parquet(os.path.join(input_path, "boundaries"))
    # bdy_wkt_df.printSchema()
    # bdy_wkt_df.show(5)

    # create view to enable SQL queries
    bdy_wkt_df.createOrReplaceTempView("bdy_wkt")

    # create geometries from WKT strings into new DataFrame
    # new DF will be spatially indexed automatically
    sql = "select bdy_id, state, ST_GeomFromWKT(wkt_geom) as geometry from bdy_wkt"
    bdy_df = spark.sql(sql).repartition(96, "state")

    # repartition and cache for performance (no effect on the "small" spatial join query here)
    # bdy_df.repartition(spark.sparkContext.defaultParallelism).cache().count()
    # bdy_df.printSchema()
    # bdy_df.show(5)

    # create view to enable SQL queries
    bdy_df.createOrReplaceTempView("bdy")

    logger.info("\t - Loaded and spatially enabled {:,} boundaries: {}".format(
        bdy_df.count(), datetime.now() - start_time))
    start_time = datetime.now()

    # load points (spatial data is lat/long fields)
    point_wkt_df = spark.read.parquet(os.path.join(input_path, "points"))
    # point_wkt_df.printSchema()
    # point_wkt_df.show(5)

    # create view to enable SQL queries
    point_wkt_df.createOrReplaceTempView("point_wkt")

    # create geometries from lat/long fields into new DataFrame
    # new DF will be spatially indexed automatically
    sql = "select point_id, state, ST_Point(longitude, latitude) as geometry from point_wkt"
    point_df = spark.sql(sql).repartition(96, "state")

    # repartition and cache for performance (no effect on the "small" spatial join query here)
    # point_df.repartition(spark.sparkContext.defaultParallelism).cache().count()
    # point_df.printSchema()
    # point_df.show(5)

    # create view to enable SQL queries
    point_df.createOrReplaceTempView("pnt")

    logger.info("\t - Loaded and spatially enabled {:,} points: {}".format(
        point_df.count(), datetime.now() - start_time))
    start_time = datetime.now()

    # run spatial join to boundary tag the points
    # notes:
    #   - spatial partitions and indexes for join will be created automatically
    #   - it's an inner join so point records could be lost
    sql = """SELECT pnt.point_id,
                    bdy.bdy_id,
                    bdy.state,
                    pnt.geometry
             FROM pnt
             INNER JOIN bdy ON ST_Intersects(pnt.geometry, bdy.geometry)"""
    join_df = spark.sql(sql)
    # join_df.explain()

    # # output join DataFrame
    # join_df.write.option("compression", "gzip") \
    #     .mode("overwrite") \
    #     .parquet(os.path.join(input_path, "output"))

    num_joined_points = join_df.count()

    join_df.printSchema()
    join_df.orderBy(f.rand()).show(5, False)

    logger.info("\t - {:,} points were boundary tagged: {}".format(
        num_joined_points, datetime.now() - start_time))

    # cleanup
    spark.stop()
def main():
    start_time = datetime.now()

    # upload Sedona (sedona) JARs
    upload_jars()

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "12g")
             .getOrCreate())
    # .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    # .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    # .config("spark.sql.autoBroadcastJoinThreshold", -1)
    # .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # .config("spark.driver.maxResultSize", "1g")
    # .config("spark.executor.cores", 1)
    # .config("spark.executor.memory", "2g")

    # Register Apache Sedona (geospark) UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    logger.info("PySpark {} session initiated: {}".format(
        spark.sparkContext.version, datetime.now() - start_time))
    logger.info("\t - Running on Python {}".format(sys.version.replace("\n", " ")))
    start_time = datetime.now()

    # # load gzip csv files
    # df = spark.read.csv(input_file_name)
    # # df = spark.read.csv(os.path.join(output_path, "testing"))
    # # df = spark.read.csv(os.path.join(output_path, "sydney"))
    # # df.printSchema()
    # # df.show()
    #
    # # # create small dataset to speed testing up
    # # testing_df = df.filter(f.col("_c0").isin(vehicle_id_list)).cache()
    # # print(testing_df.count())
    # # testing_df.repartition(1).write.option("compression", "gzip") \
    # #     .mode("overwrite") \
    # #     .csv(os.path.join(output_path, "testing"))
    #
    # # fix column types and names - for some unknown reason it's 3-4x faster than enforcing schema on load
    # df2 = (df.withColumnRenamed("_c0", "vehicle_id")
    #        .withColumn("longitude", df["_c1"].cast(t.DoubleType()))
    #        .withColumn("latitude", df["_c2"].cast(t.DoubleType()))
    #        .withColumn("speed", df["_c3"].cast(t.DoubleType()))
    #        .withColumn("bearing", df["_c4"].cast(t.DoubleType()))
    #        .withColumn("time_utc", df["_c5"].cast(t.TimestampType()))
    #        .withColumn("unix_time", df["_c6"].cast(t.IntegerType()))
    #        .withColumn("geom", f.expr("st_point(longitude, latitude)"))
    #        .drop("_c1")
    #        .drop("_c2")
    #        .drop("_c3")
    #        .drop("_c4")
    #        .drop("_c5")
    #        .drop("_c6")
    #        .repartition(f.to_date(f.col("time_utc")))
    #        )
    # # df2.printSchema()
    # # df2.show(10, False)
    #
    # df2.write.option("compression", "gzip") \
    #     .mode("overwrite") \
    #     .parquet(os.path.join(output_path, "step_1_schema_applied"))
    #
    # df.unpersist()
    # df2.unpersist()

    schema_df = spark.read.parquet(os.path.join(output_path, "step_1_schema_applied"))
    schema_df.createOrReplaceTempView("point")

    # # get counts
    # sql = """SELECT count(distinct vehicle_id) as unique_id_count,
    #                 count(*) as point_count
    #          FROM point"""
    # area_df = spark.sql(sql)
    # area_df.show()

    # logger.info("Step 1 : {} points loaded : {}".format(schema_df.count(), datetime.now() - start_time))
    # start_time = datetime.now()

    # --------------------------
    # output stuff
    # --------------------------

    # get_time_gap_stats(spark)
    export_trip_segments(spark)
    # export_small_area_data(spark)
    # export_single_id_data(spark)
    # export_trip_and_stop_data(spark)

    # --------------------------

    # cleanup
    spark.stop()
    pg_pool.closeall()