Example #1
def main():
    start_time = datetime.now()

    # get AWS creds (for running Spark outside of AWS EMR)
    session = boto3.Session()
    credentials = session.get_credentials()
    aws_access_key = credentials.access_key
    aws_secret_key = credentials.secret_key
    # aws_session_token = credentials.token

    spark = (
        SparkSession.builder
        .appName("gdelt_testing")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        # .config("spark.hadoop.fs.s3.fast.upload", "true")
        # .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.profile.ProfileCredentialsProvider")
        # .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .config("spark.hadoop.fs.s3a.access.key", aws_access_key)
        .config("spark.hadoop.fs.s3a.secret.key", aws_secret_key)
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.driver.maxResultSize", "0")
        .getOrCreate()
    )

    # Register Apache Sedona (geospark) UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    sc = spark.sparkContext

    logger.info("{} initiated on PySpark {} : {}".format(
        sc.applicationId, sc.version,
        datetime.now() - start_time))
    logger.info("\t - Running on Python {}".format(
        sys.version.replace("\n", " ")))
    start_time = datetime.now()

    # load day dataframe and get stats
    day_df = spark.read.option("inferSchema", "true").csv(input_day_path)
    day_df.printSchema()
    day_df.show(5)

    # release the dataframes' memory
    day_df.unpersist()
    day_df.printSchema()

    # month_df = spark.read.parquet(input_month_path)
    #
    # year_df = spark.read.parquet(input_year_path)

    # cleanup
    spark.stop()
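
The snippet above assumes several imports and module-level names that are not shown. A minimal sketch of that setup, inferred from the identifiers used here and from the import style in Examples #3, #4 and #8 (the logger configuration and `input_day_path` value are assumptions, not from the original):

# assumed setup for Example #1 - not part of the original snippet
import sys
import logging
from datetime import datetime

import boto3
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer

logger = logging.getLogger(__name__)
input_day_path = "s3a://my-bucket/gdelt/day/"  # hypothetical input path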
Example #2
    def spark(self):
        if not hasattr(self, "__spark"):
            spark = SparkSession. \
                builder. \
                config("spark.serializer", KryoSerializer.getName).\
                config("spark.kryo.registrator", SedonaKryoRegistrator.getName) .\
                master("local[*]").\
                getOrCreate()

            SedonaRegistrator.registerAll(spark)

            setattr(self, "__spark", spark)
        return getattr(self, "__spark")
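
Presumably this accessor is exposed as a property on a test helper class. A minimal usage sketch under that assumption (the class name `TestBase` and the query are illustrative only):

# hypothetical usage, assuming the method above is a @property on a class named TestBase
t = TestBase()
t.spark.sql("SELECT ST_Point(151.2, -33.9) AS geom").show()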
Example #3
    def register_iff_file_as_sql_table(self,filename, record_types=3, callsigns=None, chunksize=50000, encoding='latin-1',query_name=None):
        from sedona.register import SedonaRegistrator
        from pyspark.sql.types import IntegerType

        SedonaRegistrator.registerAll(self.sparkSession)

        iff_schema = self.iff_schema()
        df = self.sparkSession.read.csv(filename, header=False, sep=",", schema=iff_schema)    
        
        cols = ['recType', 'recTime', 'callsign', 'latitude', 'longitude', 'altitude','heading']
        df = df.select(*cols).filter(df['recType']==3).withColumn("recTime", df['recTime'].cast(IntegerType()))
        
        if query_name is not None:
            df.createOrReplaceTempView(query_name)
        
        return df.toPandas()
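
This method appears to belong to the same parser class whose constructor is shown in Example #4 below. A hedged usage sketch under that assumption (the class name, file name and query are illustrative; it also assumes `latitude`/`longitude` are numeric in `iff_schema()`):

# hypothetical usage combining Examples #3 and #4 (class name IFFParser is assumed)
parser = IFFParser()
points = parser.register_iff_file_as_sql_table("iff_sample.csv", query_name="iff_points")

# Sedona SQL over the registered view - illustrative query only
parser.sparkSession.sql(
    "SELECT callsign, recTime, altitude, ST_Point(longitude, latitude) AS geom "
    "FROM iff_points").show(5)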
Example #4
    def __init__(self):
        from pyspark.sql import SparkSession
        from sedona.register import SedonaRegistrator
        from sedona.utils import SedonaKryoRegistrator, KryoSerializer

        sparkSession = SparkSession.\
            builder.\
            master("local[*]").\
            appName("Sector_IFF_Parser").\
            config("spark.serializer", KryoSerializer.getName).\
            config("spark.kryo.registrator", SedonaKryoRegistrator.getName) .\
            config('spark.jars.packages',
            'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.1.1-incubating,'
            'org.datasyslab:geotools-wrapper:1.1.0-25.2'). \
            getOrCreate()

        SedonaRegistrator.registerAll(sparkSession)
        self.sparkSession = sparkSession
Example #5
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("gnaf-loader export")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.jars.packages",
                'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                'org.datasyslab:geotools-wrapper:geotools-24.1')
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.executor.cores", 1)
        .config("spark.cores.max", num_processors)
        .config("spark.driver.memory", "8g")
        .config("spark.driver.maxResultSize", "1g")
        .getOrCreate()
    )

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    print("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # get row count
    df = spark.read.parquet(input_path)

    print("{} has {} rows : {}".format(input_path, df.count(),
                                       datetime.now() - start_time))

    spark.stop()
Example #6
def create_spark_session():

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("query")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.cores.max", num_processors)
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.driver.memory", "8g")
        .getOrCreate()
    )

    # Register Apache Sedona UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (no effect on the "small" spatial join query in this script. May improve bigger queries)
    # spark.conf.set("sedona.global.index", "true")
    # spark.conf.set("sedona.global.indextype", "rtree")
    # spark.conf.set("sedona.join.gridtype", "kdbtree")

    return spark
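
A minimal sketch of how a helper like `create_spark_session` might be used (the parquet path is hypothetical; the `gnaf_pid`/`wkt_geom` columns are borrowed from the other examples on this page):

# hypothetical usage of create_spark_session()
spark = create_spark_session()

df = spark.read.parquet("data/address_principals")  # hypothetical path
df.createOrReplaceTempView("pnt")
spark.sql("SELECT gnaf_pid, ST_GeomFromWKT(wkt_geom) AS geom FROM pnt").show(5)

spark.stop()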
Example #7
def run_test(test_name, num_partitions, max_vertices):

    # create spark session object
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("Spatial Join SQL Benchmark")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.jars.packages",
                'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                'org.datasyslab:geotools-wrapper:geotools-24.1')
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.executor.cores", 4)
        .config("spark.driver.memory", "8g")
        .getOrCreate()
    )

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    # set Sedona spatial indexing and partitioning config in Spark session
    # (slowed down the "small" spatial join query in this script. Might improve bigger queries)
    spark.conf.set("sedona.global.index", "true")
    spark.conf.set("sedona.global.indextype", "rtree")
    spark.conf.set("sedona.join.gridtype", "kdbtree")
    spark.conf.set("sedona.join.numpartition", num_partitions)
    spark.conf.set("sedona.join.indexbuildside", "right")
    spark.conf.set("sedona.join.spatitionside", "right")

    start_time = datetime.now()

    # load gnaf points and create geoms
    point_df = (spark.read.parquet(
        os.path.join(input_path, "address_principals")).select(
            "gnaf_pid", "state", "geom").withColumnRenamed(
                "state", "gnaf_state").repartition(num_partitions,
                                                   "gnaf_state"))
    point_df.createOrReplaceTempView("pnt")

    # load boundaries and create geoms
    if max_vertices is not None:
        bdy_vertex_name = "{}_{}".format(bdy_name, max_vertices)
    else:
        bdy_vertex_name = bdy_name

    bdy_df = (spark.read.parquet(os.path.join(
        input_path,
        bdy_vertex_name)).select(bdy_id, "state",
                                 "geom").repartition(num_partitions,
                                                     "state").cache())
    bdy_count = bdy_df.count()
    bdy_df.createOrReplaceTempView("bdy")

    # run spatial join to boundary tag the points
    sql = """SELECT pnt.gnaf_pid, bdy.{}, bdy.state FROM bdy, pnt WHERE ST_Intersects(bdy.geom, pnt.geom)""" \
        .format(bdy_id)
    join_df = spark.sql(sql)

    join_df2 = (join_df
                # .filter((join_df["state"] == join_df["gnaf_state"]))
                .dropDuplicates(["gnaf_pid", bdy_id])
                .cache()
                )

    # output to files
    if "warmup" in test_name:
        name = "gnaf_sql_{}_{}_{}".format(bdy_id, max_vertices, num_partitions)

        (join_df2.repartition(50).write.partitionBy("state").option(
            "compression",
            "gzip").mode("overwrite").parquet(os.path.join(output_path, name)))

    # output vars
    join_count = join_df2.count()
    time_taken = datetime.now() - start_time

    if "warmup" in test_name:
        print("{},{},{},{},{},{}".format(test_name, join_count, bdy_count,
                                         max_vertices, num_partitions,
                                         time_taken))
    else:
        log_file.write("{},{},{},{},{},{}\n".format(test_name, join_count,
                                                    bdy_count, max_vertices,
                                                    num_partitions,
                                                    time_taken))

    # cleanup
    spark.stop()
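
`run_test` relies on several module-level names that are not shown (`bdy_name`, `bdy_id`, `input_path`, `output_path`, `log_file`). A hedged sketch of the kind of driver that might surround it, with illustrative values only:

# hypothetical driver for run_test() - all names and values below are assumptions
bdy_name = "commonwealth_electorates"
bdy_id = "ce_pid"
input_path = "data/input"    # hypothetical
output_path = "data/output"  # hypothetical

with open("benchmark_results.csv", "w") as log_file:
    run_test("warmup", 96, 256)                # warmup run prints results and writes parquet
    for partitions in (48, 96, 192):
        run_test("sql_join", partitions, 256)  # benchmark runs append a CSV line each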
Example #8
from pyspark.sql import SparkSession

from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from tabulate import tabulate

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Point Construction, Transform, and Distance Test")
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .config("spark.jars.packages",
            "org.apache.sedona:sedona-python-adapter-2.4_2.11:1.0.0-incubating,"
            "org.datasyslab:geotools-wrapper:geotools-24.0")
    .getOrCreate()
)

SedonaRegistrator.registerAll(spark)

temp = spark.sql("""
select
ST_Point(-117.105397, 33.17972) as p1,
ST_Transform(ST_Point(-117.105397, 33.17972), 'epsg:4326', 'epsg:5071', false) as p1t,
ST_Transform(ST_FlipCoordinates(ST_Point(-117.105397, 33.17972)), 'epsg:4326', 'epsg:5071', false) as p1_flip_t
""").toPandas()
print(tabulate(temp, headers=temp.columns, tablefmt="github", showindex=False))

#  temp = spark.sql(
#  """
#  select
#  ST_Point(-117.105397, 33.17972) as p1,
#  ST_Point(-117.089177, 33.186309) as p2,
#  ST_Transform(ST_Point(-117.105397, 33.17972), 'epsg:4326', 'epsg:5071', false) as p1t,
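
The commented-out query above is truncated and is left as-is. As a separate, hedged illustration (not a reconstruction of it), the distance between the two transformed points could be checked in the same session like this:

# hedged illustration only: metres between the two points after transforming both to
# EPSG:5071 (Conus Albers, metre units), using the session created above
temp2 = spark.sql("""
select ST_Distance(
    ST_Transform(ST_Point(-117.105397, 33.17972), 'epsg:4326', 'epsg:5071', false),
    ST_Transform(ST_Point(-117.089177, 33.186309), 'epsg:4326', 'epsg:5071', false)
) as metres_apart
""").toPandas()
print(tabulate(temp2, headers=temp2.columns, tablefmt="github", showindex=False))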
Example #9
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # copy gnaf tables from Postgres to a CSV file - a one off
    #   - export required fields only and no header
    # ----------------------------------------------------------

    pg_conn = pg_pool.getconn()
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, locality_pid, locality_name, postcode, state
                 FROM gnaf_202008.{}
             ) TO STDOUT WITH CSV"""

    # address principals
    with open(gnaf_csv_file_path, 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)

    # append address aliases
    with open(gnaf_csv_file_path, 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_pool.putconn(pg_conn)

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() -
                                                              start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    # upload Apache Sedona JARs
    upload_jars()

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("query")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.cores.max", num_processors)
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.driver.memory", "8g")
        .getOrCreate()
    )

    # Register Apache Sedona UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (no effect on the "small" spatial join query in this script. Will improve bigger queries)
    # spark.conf.set("sedona.global.index", "true")
    # spark.conf.set("sedona.global.indextype", "rtree")
    # spark.conf.set("sedona.join.gridtype", "kdbtree")

    sc = spark.sparkContext

    logger.info("\t - PySpark {} session initiated: {}".format(
        sc.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------

    offset = 0  # The point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path),
                         offset, FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t - GNAF RDD created: {}".format(datetime.now() -
                                                   start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------

    for bdy in bdy_list:
        bdy_tag(spark, point_rdd, bdy)

    # point_rdd.unpersist()  # no such method on a SpatialRDD

    # ----------------------------------------------------------
    # merge boundary tag dataframes with GNAF records
    #   - required because spatial joins are INNER JOIN only,
    #     need to add untagged GNAF points
    # ----------------------------------------------------------

    start_time = datetime.now()

    # create gnaf dataframe and SQL view
    gnaf_df = spark.read \
        .option("header", False) \
        .option("inferSchema", True) \
        .csv(gnaf_csv_file_path) \
        .drop("_C0") \
        .drop("_C1") \
        .withColumnRenamed("_C2", "gnaf_pid") \
        .withColumnRenamed("_C3", "locality_pid") \
        .withColumnRenamed("_C4", "locality_name") \
        .withColumnRenamed("_C5", "postcode") \
        .withColumnRenamed("_C6", "state")
    # gnaf_df.printSchema()
    # gnaf_df.show(10, False)

    gnaf_df.createOrReplaceTempView("pnt")

    # add bdy tags, one bdy type at a time
    for bdy in bdy_list:
        gnaf_df = join_bdy_tags(spark, bdy)
        gnaf_df.createOrReplaceTempView("pnt")

    # # add point geoms for output to Postgres - in the PostGIS specific EWKT format
    # final_df = gnaf_df.withColumn("geom", f.expr("concat('SRID=4326;POINT (', longitude, ' ', latitude, ')')")) \
    #     .drop("longitude") \
    #     .drop("latitude")
    # # final_df.printSchema()
    # # final_df.show(10, False)

    logger.info("\t - Boundary tags merged: {}".format(datetime.now() -
                                                       start_time))

    # output result to Postgres
    export_to_postgres(gnaf_df, "testing2.gnaf_with_bdy_tags",
                       os.path.join(output_path, "temp_gnaf_with_bdy_tags"),
                       True)

    # cleanup
    spark.stop()

    # delete intermediate bdy tag files and GNAF csv file
    for bdy in bdy_list:
        shutil.rmtree(
            os.path.join(output_path, "gnaf_with_{}".format(bdy["name"])))

    os.remove(gnaf_csv_file_path)
Example #10
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    # upload Apache Sedona JARs
    upload_jars()

    spark = (SparkSession
             .builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate()
             )

    #              .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    # Register Apache Sedona UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(spark.sparkContext.version, datetime.now() - start_time))

    # get list of tables to export to S3
    pg_conn = psycopg2.connect(pg_connect_string)
    pg_cur = pg_conn.cursor()

    # --------------------------------------------------------------
    # import each table from each schema in Postgres &
    # export to GZIPped Parquet files in AWS S3
    # --------------------------------------------------------------

    start_time = datetime.now()

    # check what type of geometry field the table has and what its coordinate system is
    sql = """select f_geometry_column, type, srid from public.geometry_columns
             where f_table_schema = '{}'
                 and f_table_name = '{}'""".format(schema_name, table_name)
    pg_cur.execute(sql)
    result = pg_cur.fetchone()

    if result is not None:
        geom_field = result[0]
        geom_type = result[1]
        geom_srid = result[2]
    else:
        geom_field = None
        geom_type = None
        geom_srid = None

    # build geom field sql
    # note: exported geom field will be WGS84 (EPSG:4326) Well Known Text (WKT)
    if geom_field is not None:
        if "POLYGON" in geom_type or "LINESTRING" in geom_type:
            geom_sql = ",ST_AsText(ST_Subdivide((ST_Dump(ST_Buffer(geom, 0.0))).geom, 512)) as wkt_geom"
        else:
            geom_sql = ",ST_AsText(geom) as wkt_geom"

        # transform geom to WGS84 if required
        if geom_srid != 4326:
            geom_sql = geom_sql.replace("(geom", "(ST_Transform(geom, 4326)")

    else:
        geom_sql = ""

    # build query to select all columns and the WKT geom if it exists
    sql = """SELECT 'SELECT ' || array_to_string(ARRAY(
                 SELECT column_name
                 FROM information_schema.columns
                 WHERE table_name = '{1}'
                     AND table_schema = '{0}'
                     AND column_name NOT IN('geom')
             ), ',') || '{2} ' ||
                    'FROM {0}.{1}' AS sqlstmt"""\
        .format(schema_name, table_name, geom_sql)
    pg_cur.execute(sql)
    query = str(pg_cur.fetchone()[0])  # str is just there for intellisense in Pycharm

    # add filter to query
    query = query + " WHERE tzid LIKE 'Australia%'"
    # print(query)

    bdy_df = import_bdys(spark, query)
    export_to_parquet(bdy_df, table_name)
    copy_to_s3(schema_name, table_name)

    bdy_df.unpersist()

    logger.info("\t\t exported {} : {}".format(table_name, datetime.now() - start_time))

    # cleanup
    pg_cur.close()
    pg_conn.close()
    spark.stop()
Example #11
def main():
    start_time = datetime.now()

    # create spark session object
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("Spatial Join Test")
        # .config("spark.sql.session.timeZone", "UTC")
        # .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        # .config("spark.jars.packages",
        #         'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating,'
        #         'org.datasyslab:geotools-wrapper:geotools-24.0')
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.executor.cores", 1)
        .config("spark.cores.max", num_processors)
        .config("spark.driver.memory", "12g")
        # .config("spark.driver.maxResultSize", "2g")
        .getOrCreate()
    )

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (slowed down the "small" spatial join query in this script. Might improve bigger queries)
    # spark.conf.set("sedona.global.index", "true")
    # spark.conf.set("sedona.global.indextype", "rtree")
    # spark.conf.set("sedona.join.gridtype", "kdbtree")
    # spark.conf.set("sedona.join.numpartition", "-1")
    # spark.conf.set("sedona.join.indexbuildside", "right")
    # spark.conf.set("sedona.join.spatitionside", "right")

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # # load gnaf points and create geoms
    # df = spark.read \
    #     .option("header", True) \
    #     .option("inferSchema", True) \
    #     .csv(input_file_name)
    #
    # point_df = df \
    #     .withColumn("geom", f.expr("ST_Point(longitude, latitude)")) \
    #     .cache()

    point_df = spark.read.parquet(os.path.join(input_path, "address_principals")) \
        .select("gnaf_pid", "state", f.expr("ST_GeomFromWKT(wkt_geom)").alias("geom")) \
        .repartition(192, "state")
    # point_df.printSchema()
    # point_df.show()

    point_df.createOrReplaceTempView("pnt")

    logger.info("\t - Loaded {:,} GNAF points: {}".format(
        point_df.count(),
        datetime.now() - start_time))

    # boundary tag gnaf points
    bdy_tag(spark, "commonwealth_electorates_analysis", "ce_pid", 9)

    # point_df.unpersist()

    # tag_df.printSchema()

    # point_df = spark.read.parquet(os.path.join(input_path, "gnaf_with_{}".format("commonwealth_electorates")))

    # point_df.createOrReplaceTempView("pnt")

    # bdy_tag(spark, "local_government_areas", "lga_pid")
    # tag_df2.printSchema()

    # point_df.unpersist()
    #
    # point_df = spark.read.parquet(os.path.join(input_path, "gnaf_with_{}".format("local_government_areas")))
    # # point_df.createOrReplaceTempView("pnt")
    #
    # # bdy_tag(spark, "local_government_wards", "ward_pid")
    # # bdy_tag(spark, "state_lower_house_electorates", "se_lower_pid")
    # # bdy_tag(spark, "state_upper_house_electorates", "se_upper_pid")
    #
    # bdy_ids = "ce_pid text, lga_pid text"
    #
    # final_df = point_df.withColumn("wkt_geom", f.expr("concat('SRID=4326;POINT (', st_x(geom), ' ', st_y(geom), ')')"))\
    #     .drop("geom")
    # # final_df.printSchema()
    #
    # # output to postgres, via CSV
    # table_name = "gnaf_with_bdy_tags"
    # export_to_postgres(final_df, "testing2.{}".format(table_name), bdy_ids, os.path.join(output_path, table_name))

    # cleanup
    spark.stop()
Example #12
def run_test(test_name, num_partitions, max_vertices):

    # create spark session object
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("Spatial Join SQL Benchmark")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.jars.packages",
                'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                'org.datasyslab:geotools-wrapper:geotools-24.1')
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.executor.cores", 4)
        .config("spark.driver.memory", "8g")
        .getOrCreate()
    )

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    start_time = datetime.now()

    # load gnaf points and create geoms
    point_df = (spark.read.parquet(
        os.path.join(input_path, "address_principals")).select(
            "gnaf_pid", "state", "geom").withColumnRenamed(
                "state", "gnaf_state").repartition(num_partitions,
                                                   "gnaf_state"))

    # load boundaries and create geoms
    if max_vertices is not None:
        bdy_vertex_name = "{}_{}".format(bdy_name, max_vertices)
    else:
        bdy_vertex_name = bdy_name

    bdy_df = (spark.read.parquet(os.path.join(
        input_path,
        bdy_vertex_name)).select(bdy_id, "state",
                                 "geom").repartition(num_partitions,
                                                     "state").cache())
    bdy_count = bdy_df.count()

    # create RDDs - analysed partitioned and indexed
    point_rdd = Adapter.toSpatialRdd(point_df, "geom")
    bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")

    point_df.unpersist()
    bdy_df.unpersist()

    point_rdd.analyze()
    bdy_rdd.analyze()

    point_rdd.spatialPartitioning(GridType.KDBTREE)
    bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())

    point_rdd.buildIndex(IndexType.RTREE, True)
    bdy_rdd.buildIndex(IndexType.RTREE, True)

    # run join query
    join_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(point_rdd, bdy_rdd, True,
                                                      True)

    # convert SedonaPairRDD to dataframe
    join_df = Adapter.toDf(join_pair_rdd, bdy_rdd.fieldNames,
                           point_rdd.fieldNames, spark)
    # join_df.printSchema()

    # | -- leftgeometry: geometry(nullable=true)
    # | -- <bdy_id>: string(nullable=true)
    # | -- state: string(nullable=true)
    # | -- rightgeometry: geometry(nullable=true)
    # | -- gnaf_pid: string(nullable=true)
    # | -- gnaf_state: string(nullable=true)

    join_df2 = (join_df
                # .filter((join_df["state"] == join_df["gnaf_state"]))
                .select("gnaf_pid", bdy_id, "state")
                .dropDuplicates(["gnaf_pid", bdy_id])
                .cache()
                )

    # output to files
    if "warmup" in test_name:
        name = "gnaf_rdd_{}_{}_{}".format(bdy_id, max_vertices, num_partitions)

        (join_df2.repartition(50).write.partitionBy("state").option(
            "compression",
            "gzip").mode("overwrite").parquet(os.path.join(output_path, name)))

    # output vars
    join_count = join_df2.count()
    time_taken = datetime.now() - start_time

    if "warmup" in test_name:
        print("{},{},{},{},{},{}".format(test_name, join_count, bdy_count,
                                         max_vertices, num_partitions,
                                         time_taken))
    else:
        log_file.write("{},{},{},{},{},{}\n".format(test_name, join_count,
                                                    bdy_count, max_vertices,
                                                    num_partitions,
                                                    time_taken))

    # cleanup
    spark.stop()
Example #13
def main():
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (
        SparkSession.builder
        .appName("query")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.hadoop.fs.s3.fast.upload", "true")
        .config("spark.sql.adaptive.enabled", "true")  # TODO: does this split one ID into 2 or more partitions?
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .getOrCreate()
    )

    # Register Apache Sedona (geospark) UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    sc = spark.sparkContext
    sc.setCheckpointDir("hdfs:///checkpoints")

    logger.info("{} initiated on PySpark {} : {}".format(
        sc.applicationId, sc.version,
        datetime.now() - start_time))
    logger.info("\t - Running on Python {}".format(
        sys.version.replace("\n", " ")))
    start_time = datetime.now()

    # -----------------------------------------------------------------------------------------
    # 1. vehicle point counts with min/max times
    # -----------------------------------------------------------------------------------------

    logger.info("1. Counts per ID")

    df = spark.read.parquet(point_source_s3_path)
    logger.info("\t - {:,} points".format(df.count()))

    count_df = df.groupBy("uid", "src") \
        .agg(f.count("*").alias("point_count"),
             f.min("time_utc").alias("min_time_utc"),
             f.max("time_utc").alias("max_time_utc")
             ) \
        .withColumn("total_hours", f.round((f.col("max_time_utc").cast(t.LongType())
                                    - f.col("min_time_utc").cast(t.LongType())).cast(t.DoubleType()) / 3600.0, 1))

    logger.info("\t - {:,} unique IDs".format(count_df.count()))

    # -----------------------------------------------------------------------------------------
    # add days active
    # -----------------------------------------------------------------------------------------

    days_df = df.groupBy("uid", f.to_date("time_utc").alias("date_utc"))\
        .agg(f.count("*").alias("point_count"))  # only required to convert GroupedData object into a DataFrame

    logger.info("\t - {:,} ID days of data".format(days_df.count()))

    days_df2 = days_df.groupBy("uid") \
        .agg(f.count("*").alias("active_day_count"))
    # days_df2.printSchema()
    # days_df2.show(20, False)

    # add data to counts
    count_df2 = count_df.join(days_df2, "uid", "inner")

    # -----------------------------------------------------------------------------------------
    # add trip counts
    # -----------------------------------------------------------------------------------------

    trip_df = spark.read.parquet(trip_source_s3_path)
    logger.info("\t - {:,} trips".format(trip_df.count()))

    trip_df2 = (trip_df.groupBy("uid").agg(
        f.count("*").alias("trip_count"),
        f.sum("point_count").alias("trip_point_count"),
        f.sum(f.round("distance_km", 3)).alias("trip_distance_km"),
        f.sum("duration_s").alias("temp_duration_s"),
    ).withColumn(
        "trip_hours",
        f.round(f.col("temp_duration_s").cast(t.DoubleType()) / 3600.0,
                1)).withColumn(
                    "trip_avg_seconds_per_point",
                    f.round(
                        f.when(
                            f.col("temp_duration_s") > 0,
                            f.col("temp_duration_s") /
                            f.col("trip_point_count").cast(
                                t.DoubleType())).otherwise(None),
                        1)).withColumn(
                            "trip_avg_speed",
                            f.round(
                                f.when(
                                    f.col("trip_distance_km") > 0,
                                    f.col("trip_distance_km") /
                                    f.col("trip_hours")).otherwise(None),
                                1)).drop("temp_duration_s"))
    # trip_df2.printSchema()
    # trip_df2.show(10, False)

    logger.info("\t - {:,} IDs with trips".format(trip_df2.count()))

    # add trip data to counts
    count_df3 = count_df2.join(trip_df2, "uid", "left")

    # -----------------------------------------------------------------------------------------
    # add stop counts
    # -----------------------------------------------------------------------------------------

    stop_df = spark.read.parquet(stop_source_s3_path)
    logger.info("\t - {:,} stops".format(stop_df.count()))

    stop_df2 = (stop_df.groupBy("uid").agg(
        f.count("*").alias("stop_count"),
        f.sum("point_count").alias("stop_point_count"),
        f.sum(f.round("distance_km", 3)).alias("stop_distance_km"),
        f.sum("duration_s").alias("temp_duration_s"),
    ).withColumn(
        "stop_hours",
        f.round(f.col("temp_duration_s").cast(t.DoubleType()) / 3600.0,
                1)).withColumn(
                    "stop_avg_seconds_per_point",
                    f.round(
                        f.when(
                            f.col("temp_duration_s") > 0,
                            f.col("temp_duration_s") /
                            f.col("stop_point_count").cast(
                                t.DoubleType())).otherwise(None),
                        1)).withColumn(
                            "stop_avg_speed",
                            f.round(
                                f.when(
                                    f.col("stop_distance_km") > 0,
                                    f.col("stop_distance_km") /
                                    f.col("stop_hours")).otherwise(None),
                                1)).drop("temp_duration_s"))
    # stop_df2.printSchema()
    # stop_df2.show(10, False)

    logger.info("\t - {:,} IDs with stops".format(stop_df2.count()))

    # add stop data to counts
    id_count_df = count_df3.join(stop_df2, "uid", "left") \
        .checkpoint()

    # -----------------------------------------------------------------------------------------
    # output counts
    # -----------------------------------------------------------------------------------------

    (id_count_df.repartition(1).write.option("compression", "gzip").option(
        "header", "true").option("nullValue", None).mode("overwrite").csv(
            os.path.join(target_s3_path, "counts")))

    # get IDs that have more than a fleeting amount of data
    # useful_id_df = id_count_df.filter()

    # id_count_df.unpersist()
    stop_df2.unpersist()
    count_df3.unpersist()
    trip_df2.unpersist()
    count_df2.unpersist()
    days_df2.unpersist()
    count_df.unpersist()
    df.unpersist()

    # -----------------------------------------------------------------------------------------
    # 2. counts of GPS refresh rates
    # -----------------------------------------------------------------------------------------

    logger.info("2. GPS refresh rates")

    final_point_df = spark.read.parquet(final_point_source_s3_path)
    final_point_df.createOrReplaceTempView("pnt")

    sql = """SELECT src,
                    count(*) as point_count,
                    SUM(CASE WHEN next_interval <= 5 THEN 1 ELSE 0 END) as _05_seconds,
                    SUM(CASE WHEN next_interval <= 10 THEN 1 ELSE 0 END) as _10_seconds,
                    SUM(CASE WHEN next_interval <= 15 THEN 1 ELSE 0 END) as _15_seconds,
                    SUM(CASE WHEN next_interval <= 20 THEN 1 ELSE 0 END) as _20_seconds,
                    SUM(CASE WHEN next_interval <= 30 THEN 1 ELSE 0 END) as _30_seconds,
                    SUM(CASE WHEN next_interval <= 45 THEN 1 ELSE 0 END) as _45_seconds,
                    SUM(CASE WHEN next_interval <= 60 THEN 1 ELSE 0 END) as _60_seconds,
                    SUM(CASE WHEN next_interval <= 120 THEN 1 ELSE 0 END) as _120_seconds,
                    SUM(CASE WHEN next_interval > 120 THEN 1 ELSE 0 END) as _over_120_seconds
             FROM pnt
             GROUP BY src"""
    refresh_rate_df = spark.sql(sql)
    refresh_rate_df.orderBy("src").show(20, False)

    refresh_rate_df.unpersist()
    final_point_df.unpersist()

    # -----------------------------------------------------------------------------------------
    # 3. trips per day
    # -----------------------------------------------------------------------------------------

    logger.info("3. Trips & Stops per day")

    trip_count_df = (
        trip_df.withColumn("date_local",
                           f.col("start_time_local").cast(t.DateType())).
        groupBy("src", f.col("date_local")).agg(
            f.count("*").alias("trip_count"),
            # f.count("distinct uid").alias("trip_id_count_df"),
            f.sum("point_count").alias("trip_point_count"),
            f.sum("distance_km").alias("trip_distance_km"),
            f.sum("duration_s").alias("temp_duration_s"),
        ).withColumn(
            "trip_hours",
            f.round(f.col("temp_duration_s").cast(t.DoubleType()) / 3600.0,
                    1)).withColumn(
                        "trip_avg_seconds_per_point",
                        f.round(
                            f.when(
                                f.col("temp_duration_s") > 0,
                                f.col("temp_duration_s") /
                                f.col("trip_point_count").cast(
                                    t.DoubleType())).otherwise(None),
                            1)).withColumn(
                                "trip_avg_speed",
                                f.round(
                                    f.when(
                                        f.col("trip_distance_km") > 0,
                                        f.col("trip_distance_km") /
                                        f.col("trip_hours")).otherwise(None),
                                    1)).withColumn(
                                        "trip_distance_km",
                                        f.round(f.col("trip_distance_km"),
                                                3)).drop("temp_duration_s"))

    trip_df.unpersist()

    logger.info("\t - {:,} days of trips".format(trip_count_df.count()))

    stop_count_df = (
        stop_df.withColumn("date_local",
                           f.col("start_time_local").cast(t.DateType())).
        groupBy("src", f.col("date_local")).agg(
            f.count("*").alias("stop_count"),
            # f.count("distinct uid").alias("stop_id_count_df"),
            f.sum("point_count").alias("stop_point_count"),
            f.sum("distance_km").alias("stop_distance_km"),
            f.sum("duration_s").alias("temp_duration_s"),
        ).withColumn(
            "stop_hours",
            f.round(f.col("temp_duration_s").cast(t.DoubleType()) / 3600.0,
                    1)).withColumn(
                        "stop_avg_seconds_per_point",
                        f.round(
                            f.when(
                                f.col("temp_duration_s") > 0,
                                f.col("temp_duration_s") /
                                f.col("stop_point_count").cast(
                                    t.DoubleType())).otherwise(None),
                            1)).withColumn(
                                "stop_avg_speed",
                                f.round(
                                    f.when(
                                        f.col("stop_distance_km") > 0,
                                        f.col("stop_distance_km") /
                                        f.col("stop_hours")).otherwise(None),
                                    1)).withColumn(
                                        "stop_distance_km",
                                        f.round(f.col("stop_distance_km"),
                                                3)).drop("temp_duration_s"))

    stop_df.unpersist()

    logger.info("\t - {:,} days of stops".format(stop_count_df.count()))

    # combine trip and stop counts
    trip_stop_count = trip_count_df.alias("left").join(
        stop_count_df.alias("right"), ["src", "date_local"], "full")
    # trip_stop_count.printSchema()
    # trip_stop_count.show(20, False)

    (trip_stop_count.repartition(1).write.option("compression", "gzip").option(
        "header", "true").option("nullValue", None).mode("overwrite").csv(
            os.path.join(target_s3_path, "daily_trip_stop_counts")))

    # cleanup
    spark.stop()
Example #14
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("gnaf-loader export")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.jars.packages",
                'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                'org.datasyslab:geotools-wrapper:geotools-24.1')
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.executor.cores", 1)
        .config("spark.cores.max", num_processors)
        .config("spark.driver.memory", "8g")
        .config("spark.driver.maxResultSize", "1g")
        .getOrCreate()
    )

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))

    # get list of tables to export to S3
    pg_conn = psycopg2.connect(pg_connect_string)
    pg_cur = pg_conn.cursor()

    # --------------------------------------------------------------
    # import each table from each schema in Postgres &
    # export to GZIPped Parquet files in AWS S3
    # --------------------------------------------------------------

    for schema_name in schema_names:
        i = 1

        # get table list for schema
        sql = """SELECT table_name
                 FROM information_schema.tables
                 WHERE table_schema='{}'
                   AND table_type='BASE TABLE'
                   AND table_name <> 'qa'
                   AND table_name NOT LIKE '%_2011_%'
                   AND table_name NOT LIKE '%_analysis%'
                   AND table_name NOT LIKE '%_display%'""".format(schema_name)
        pg_cur.execute(sql)

        tables = pg_cur.fetchall()

        logger.info("\t - {} schema : {} tables to export : {}".format(
            schema_name, len(tables),
            datetime.now() - start_time))

        for table in tables:
            start_time = datetime.now()

            table_name = table[0]

            # check what type of geometry field the table has and what its coordinate system is
            sql = """SELECT f_geometry_column, type, srid FROM public.geometry_columns
                     WHERE f_table_schema = '{}'
                         AND f_table_name = '{}'""".format(
                schema_name, table_name)
            pg_cur.execute(sql)
            result = pg_cur.fetchone()

            if result is not None:
                geom_field = result[0]
                geom_type = result[1]
                geom_srid = result[2]
            else:
                geom_field = None
                geom_type = None
                geom_srid = None

            # build geom field sql
            # note: exported geom field will be WGS84 (EPSG:4326) Well Known Text (WKT)
            if geom_field is not None:
                if "POLYGON" in geom_type or "LINESTRING" in geom_type:
                    geom_sql = ",ST_AsText(ST_Subdivide((ST_Dump(ST_Buffer(geom, 0.0))).geom, 256)) as wkt_geom"
                else:
                    geom_sql = ",ST_AsText(geom) as wkt_geom"

                # transform geom to WGS84 if required
                if geom_srid != 4326:
                    geom_sql = geom_sql.replace("(geom",
                                                "(ST_Transform(geom, 4326)")

            else:
                geom_sql = ""

            # build query to select all columns and the WKT geom if it exists
            sql = """SELECT 'SELECT ' || array_to_string(ARRAY(
                         SELECT column_name
                         FROM information_schema.columns
                         WHERE table_name = '{1}'
                             AND table_schema = '{0}'
                             AND column_name NOT IN('geom')
                     ), ',') || '{2} ' ||
                            'FROM {0}.{1}' AS sqlstmt"""\
                .format(schema_name, table_name, geom_sql)
            pg_cur.execute(sql)
            query = str(pg_cur.fetchone()
                        [0])  # str is just there for intellisense in Pycharm

            # get min and max gid values to enable parallel import from Postgres to Spark
            # add gid field based on row number if missing
            if "gid," in query:
                sql = """SELECT min(gid), max(gid) FROM {}.{}""".format(
                    schema_name, table_name)
                pg_cur.execute(sql)
                gid_range = pg_cur.fetchone()
                min_gid = gid_range[0]
                max_gid = gid_range[1]
            else:
                # get row count as the max gid value
                sql = """SELECT count(*) FROM {}.{}""".format(
                    schema_name, table_name)
                pg_cur.execute(sql)
                min_gid = 1
                max_gid = pg_cur.fetchone()[0]

                # add gid field to query
                query = query.replace("SELECT ",
                                      "SELECT row_number() OVER () AS gid,")

            # check table has records
            if max_gid is not None and max_gid > min_gid:
                bdy_df = import_table(spark, query, min_gid, max_gid, 100000)
                # bdy_df.printSchema()

                # # add geometry column if required
                # if geom_sql != "":
                #     export_df = bdy_df.withColumn("geom", f.expr("ST_GeomFromWKT(wkt_geom)")) \
                #         .drop("wkt_geom")
                # else:
                #     export_df = bdy_df
                #
                # export_to_parquet(export_df, table_name)

                export_to_parquet(bdy_df, table_name)
                # copy_to_s3(schema_name, table_name)

                bdy_df.unpersist()

                logger.info("\t\t {}. exported {} : {}".format(
                    i, table_name,
                    datetime.now() - start_time))
            else:
                logger.warning("\t\t {}. {} has no records! : {}".format(
                    i, table_name,
                    datetime.now() - start_time))

            i += 1

    # cleanup
    pg_cur.close()
    pg_conn.close()
    spark.stop()
Example #15
def main():
    start_time = datetime.now()

    # copy gnaf tables to CSV
    pg_conn = psycopg2.connect(local_pg_connect_string)
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, state
                 FROM gnaf_202011.{}
             ) TO STDOUT WITH CSV"""
    # sql = """COPY (
    #              SELECT gnaf_pid, street_locality_pid, locality_pid, alias_principal, primary_secondary, building_name,
    #                     lot_number, flat_number, level_number, number_first, number_last, street_name, street_type,
    #                     street_suffix, address, locality_name, postcode, state, locality_postcode, confidence,
    #                     legal_parcel_id, mb_2011_code, mb_2016_code, latitude, longitude, geocode_type, reliability
    #              FROM gnaf_202011.{}
    #          ) TO STDOUT WITH CSV"""

    # address principals
    with open(os.path.join(output_path, "gnaf_light.csv"), 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)
        # pg_cur.copy_expert(sql.format("address_principals") + " HEADER", csv_file)

    # address aliases
    with open(os.path.join(output_path, "gnaf_light.csv"), 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_conn.close()

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() -
                                                              start_time))
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("query")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.cores.max", cpu_count())
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.driver.memory", "8g")
        .getOrCreate()
    )

    # Register Apache Sedona (geospark) UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # load gnaf points
    df = spark.read \
        .option("header", True) \
        .option("inferSchema", True) \
        .csv(input_file_name)
    # df.printSchema()
    # df.show()

    # # manually assign field types (not needed here as inferSchema works)
    # df2 = (df
    #        .withColumn("confidence", df.confidence.cast(t.ShortType()))
    #        .withColumn("mb_2011_code", df.mb_2011_code.cast(t.LongType()))
    #        .withColumn("mb_2016_code", df.mb_2016_code.cast(t.LongType()))
    #        .withColumn("reliability", df.reliability.cast(t.ShortType()))
    #        .withColumn("longitude", df.longitude.cast(t.DoubleType()))
    #        .withColumn("latitude", df.latitude.cast(t.DoubleType()))
    #        )
    # # df2.printSchema()
    # # df2.show()

    # add point geometries and partition by longitude into 400-500k row partitions
    gnaf_df = df.withColumn("geom", f.expr("ST_Point(longitude, latitude)"))
    # .withColumnRenamed("gnaf_pid", "id")
    # .withColumn("partition_id", (f.percent_rank().over(Window.partitionBy().orderBy("longitude")) * f.lit(100.0))
    #             .cast(t.ShortType())) \
    # .repartitionByRange(100, "partition_id") \
    # gnaf_df.printSchema()

    # check partition counts
    gnaf_df.groupBy(f.spark_partition_id()).count().show()

    # write gnaf to gzipped parquet
    export_to_parquet(gnaf_df, "gnaf")

    # export PG boundary tables to parquet
    export_bdys(spark, "commonwealth_electorates", "ce_pid")
    export_bdys(spark, "local_government_areas", "lga_pid")
    export_bdys(spark, "local_government_wards", "ward_pid")
    export_bdys(spark, "state_lower_house_electorates", "se_lower_pid")
    export_bdys(spark, "state_upper_house_electorates", "se_upper_pid")

    # cleanup
    spark.stop()

    logger.info(
        "\t - GNAF and boundaries exported to gzipped parquet files: {}".
        format(datetime.now() - start_time))
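
`export_to_parquet` and `export_bdys` are helpers that are not shown here. A hedged sketch of what a helper like `export_to_parquet` might do, based on the gzipped-parquet write pattern used in Examples #7 and #12 (not the author's actual implementation):

# hedged sketch only - a minimal export_to_parquet consistent with the writes above
def export_to_parquet(df, name):
    (df.write
       .option("compression", "gzip")
       .mode("overwrite")
       .parquet(os.path.join(output_path, name)))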
Example #16
def main():
    start_time = datetime.now()

    # create spark session object
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("Spatial Join Test")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.jars.packages",
                'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                'org.datasyslab:geotools-wrapper:geotools-24.1')
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.executor.cores", 1)
        .config("spark.cores.max", num_processors)
        .config("spark.driver.memory", "8g")
        .config("spark.driver.maxResultSize", "1g")
        .getOrCreate()
    )

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # load boundaries (geometries are Well Known Text strings)
    bdy_wkt_df = spark.read.parquet(os.path.join(input_path, "boundaries"))
    # bdy_wkt_df.printSchema()
    # bdy_wkt_df.show(5)

    # create view to enable SQL queries
    bdy_wkt_df.createOrReplaceTempView("bdy_wkt")

    # create geometries from WKT strings into new DataFrame
    # new DF will be spatially indexed automatically
    sql = "select bdy_id, state, ST_GeomFromWKT(wkt_geom) as geometry from bdy_wkt"
    bdy_df = spark.sql(sql).repartition(96, "state")

    # repartition and cache for performance (no effect on the "small" spatial join query here)
    # bdy_df.repartition(spark.sparkContext.defaultParallelism).cache().count()
    # bdy_df.printSchema()
    # bdy_df.show(5)

    # create view to enable SQL queries
    bdy_df.createOrReplaceTempView("bdy")

    logger.info("\t - Loaded and spatially enabled {:,} boundaries: {}".format(
        bdy_df.count(),
        datetime.now() - start_time))
    start_time = datetime.now()

    # load points (spatial data is lat/long fields)
    point_wkt_df = spark.read.parquet(os.path.join(input_path, "points"))
    # point_wkt_df.printSchema()
    # point_wkt_df.show(5)

    # create view to enable SQL queries
    point_wkt_df.createOrReplaceTempView("point_wkt")

    # create geometries from lat/long fields into new DataFrame
    # new DF will be spatially indexed automatically
    sql = "select point_id, state, ST_Point(longitude, latitude) as geometry from point_wkt"
    point_df = spark.sql(sql).repartition(96, "state")

    # repartition and cache for performance (no effect on the "small" spatial join query here)
    # point_df.repartition(spark.sparkContext.defaultParallelism).cache().count()
    # point_df.printSchema()
    # point_df.show(5)

    # create view to enable SQL queries
    point_df.createOrReplaceTempView("pnt")

    logger.info("\t - Loaded and spatially enabled {:,} points: {}".format(
        point_df.count(),
        datetime.now() - start_time))
    start_time = datetime.now()

    # run spatial join to boundary tag the points
    # notes:
    #   - spatial partitions and indexes for join will be created automatically
    #   - it's an inner join so point records could be lost
    sql = """SELECT pnt.point_id,
                    bdy.bdy_id,
                    bdy.state,
                    pnt.geometry
             FROM pnt
             INNER JOIN bdy ON ST_Intersects(pnt.geometry, bdy.geometry)"""
    join_df = spark.sql(sql)
    # join_df.explain()

    # # output join DataFrame
    # join_df.write.option("compression", "gzip") \
    #     .mode("overwrite") \
    #     .parquet(os.path.join(input_path, "output"))

    num_joined_points = join_df.count()

    join_df.printSchema()
    join_df.orderBy(f.rand()).show(5, False)

    logger.info("\t - {:,} points were boundary tagged: {}".format(
        num_joined_points,
        datetime.now() - start_time))

    # cleanup
    spark.stop()
Example #17
def main():
    start_time = datetime.now()

    # upload Apache Sedona JARs
    upload_jars()

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("query")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.cores.max", num_processors)
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.driver.memory", "12g")
        .getOrCreate()
    )
    #              .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    #              .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    #              .config("spark.sql.autoBroadcastJoinThreshold", -1)
    #              .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    #              .config("spark.driver.maxResultSize", "1g")
    #
    #              .config("spark.executor.cores", 1)
    #              .config("spark.executor.memory", "2g")

    # Register Apache Sedona (geospark) UDTs and UDFs
    SedonaRegistrator.registerAll(spark)

    logger.info("PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))
    logger.info("\t - Running on Python {}".format(
        sys.version.replace("\n", " ")))
    start_time = datetime.now()

    # # load gzip csv files
    # df = spark.read.csv(input_file_name)
    # # df = spark.read.csv(os.path.join(output_path, "testing"))
    # # df = spark.read.csv(os.path.join(output_path, "sydney"))
    # # df.printSchema()
    # # df.show()
    #
    # # # create small dataset to speed testing up
    # # testing_df = df.filter(f.col("_c0").isin(vehicle_id_list)).cache()
    # # print(testing_df.count())
    # # testing_df.repartition(1).write.option("compression", "gzip") \
    # #     .mode("overwrite") \
    # #     .csv(os.path.join(output_path, "testing"))
    #
    # # fix column types and names - for some unknown reason it's 3-4x faster than enforcing schema on load
    # df2 = (df.withColumnRenamed("_c0", "vehicle_id")
    #        .withColumn("longitude", df["_c1"].cast(t.DoubleType()))
    #        .withColumn("latitude", df["_c2"].cast(t.DoubleType()))
    #        .withColumn("speed", df["_c3"].cast(t.DoubleType()))
    #        .withColumn("bearing", df["_c4"].cast(t.DoubleType()))
    #        .withColumn("time_utc", df["_c5"].cast(t.TimestampType()))
    #        .withColumn("unix_time", df["_c6"].cast(t.IntegerType()))
    #        .withColumn("geom", f.expr("st_point(longitude, latitude)"))
    #        .drop("_c1")
    #        .drop("_c2")
    #        .drop("_c3")
    #        .drop("_c4")
    #        .drop("_c5")
    #        .drop("_c6")
    #        .repartition(f.to_date(f.col("time_utc")))
    #        )
    # # df2.printSchema()
    # # df2.show(10, False)
    #
    # df2.write.option("compression", "gzip") \
    #     .mode("overwrite") \
    #     .parquet(os.path.join(output_path, "step_1_schema_applied"))
    #
    # df.unpersist()
    # df2.unpersist()

    schema_df = spark.read.parquet(
        os.path.join(output_path, "step_1_schema_applied"))
    schema_df.createOrReplaceTempView("point")

    # # # get counts
    # # sql = """SELECT count(distinct vehicle_id) as unique_id_count,
    # #                 count(*) as point_count
    # #          FROM point"""
    # # area_df = spark.sql(sql)
    # # area_df.show()
    #
    # logger.info("Step 1 : {} points loaded : {}".format(schema_df.count(), datetime.now() - start_time))
    # # start_time = datetime.now()

    # --------------------------
    # output stuff
    # --------------------------

    # get_time_gap_stats(spark)
    export_trip_segments(spark)
    # export_small_area_data(spark)
    # export_single_id_data(spark)
    # export_trip_and_stop_data(spark)

    # --------------------------

    # cleanup
    spark.stop()
    pg_pool.closeall()