def run(spark): """Indexes Protobuf files by updating the S3Prefixes and VehPosPb tables Args: spark: Spark Session object """ log = utils.get_logger() with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.S3Prefixes) dbtables.create_if_not_exists(conn, dbtables.VehPosPb) pfxs = explore_s3prefixes() s3Mgr = s3.S3Mgr() for pfx in pfxs: fullPfx = '/'.join(("pb", "VehiclePos", pfx)) keys = s3Mgr.fetch_keys(fullPfx) if len(keys) > 0: log.info("PROCESSING %d KEYS FOR %s", len(keys), pfx) file_list = spark.sparkContext.parallelize(keys) file_list \ .map(dbtables.VehPosPb.build_tuple_from_protobuf) \ .foreachPartition(push_vehpospb_dbtpls) log.info("PROCESSED %d KEYS FOR %s", len(keys), pfx) tpl = (pfx, len(keys)) with DBConn() as conn: dbtables.S3Prefixes.insert_values(conn, pfx, len(keys)) conn.commit() log.info("PUSHED S3Prefix %s", str(tpl))
def run(spark): """Updates the vehicle positions database table Args: spark: Spark Session object """ log = utils.get_logger() with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.VehPos) with DBConn() as conn: keys = dbtables.VehPosPb.select_protobuf_keys_not_invehpos(conn) log.info("Got %d keys", len(keys)) step = 1000 for i in range(0, len(keys), step): lower = i upper = i + step if i + step < len(keys) else len(keys) keysSubrange = keys[lower:upper] records = spark.sparkContext \ .parallelize(keysSubrange) \ .flatMap(dbtables.VehPos.build_db_tuples_from_pb) \ .map(lambda tpl: ((tpl[1], tpl[3]), tpl)) \ .reduceByKey(lambda x, y: x) records.foreachPartition(push_vehpos_db) log.info("Inserted records for keys %d-%d", lower, upper - 1) spark.sparkContext \ .parallelize(keysSubrange) \ .foreachPartition(set_vehpospb_invehpos) log.info("Updated IsInVehPos for keys %d-%d", lower, upper - 1)
def run(spark): """Updates Parquet files in S3 and the PqDate table Args: spark: Spark Session object """ log = utils.get_logger() with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.PqDates) targetDates = fetch_parquet_dts() for targetDate in targetDates: keys = fetch_keys_for_date(targetDate) log.info("Got %d keys of %s", len(keys), str(targetDate)) if len(keys) > 0: rddKeys = spark.sparkContext \ .parallelize(keys) \ .map(lambda x: (x, x)) \ .partitionBy(Settings.NumPartitions) \ .map(lambda x: x[0]) rddVP = rddKeys \ .flatMap(dbtables.VehPos.build_df_tuples_from_pb) \ .map(lambda tpl: ((tpl[1], tpl[3]), tpl)) \ .reduceByKey(lambda x, y: x).map(lambda x: x[1]) schema = StructType([ StructField("RouteId", StringType(), True), StructField("DT", TimestampType(), False), StructField("VehicleId", StringType(), False), StructField("TripId", StringType(), False), StructField("Lat", DoubleType(), False), StructField("Lon", DoubleType(), False), StructField("Status", IntegerType(), True), StructField("StopSeq", IntegerType(), True), StructField("StopId", StringType(), True), ]) dfVP = spark.createDataFrame(rddVP, schema) log.info("Created dataframe for %d keys of %s", len(keys), str(targetDate)) pqKey = targetDate.strftime("%Y%m%d") pqKey = '/'.join(["parquet", "VP-" + pqKey]) pqKey = "s3a://alxga-insde/%s" % pqKey dfVP.write.format("parquet").mode("overwrite").save(pqKey) log.info("Written to Parquet %d keys of %s", len(keys), str(targetDate)) numRecs = dfVP.count() else: numRecs = 0 with DBConn() as conn: dbtables.PqDates.insert_values(conn, targetDate, len(keys), numRecs) conn.commit()
def run(spark): """Combines GTFS schedule feed with vehicle positions Parquet files and updates the VPDelays and HlyDelays tables Args: spark: Spark Session object """ log = utils.get_logger() with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.VPDelays) dbtables.create_if_not_exists(conn, dbtables.HlyDelays) feedDescs = GTFSFetcher.read_feed_descs() curFeedDesc = None dfStopTimes = None feedRequiredFiles = ["stops.txt", "stop_times.txt", "trips.txt"] gtfsFetcher = GTFSFetcher(spark) # with DBConn() as conn: # entriesToProcess = dbtables.PqDates \ # .select_pqdates_not_in_delays(conn, 'NOT IsInHlyDelays') entriesToProcess = [date(2020, 8, 20)] for targetDate in entriesToProcess: if dfStopTimes is None or not curFeedDesc.includes_date(targetDate): curFeedDesc = None dfStopTimes = None for fd in feedDescs: if fd.includes_date(targetDate) and fd.includes_files(feedRequiredFiles): curFeedDesc = fd dfStopTimes = gtfsFetcher.read_stop_times(curFeedDesc) log.info('USING FEED "%s" for %s', curFeedDesc.version, targetDate.strftime("%Y-%m-%d")) break else: log.info('RE-USING FEED "%s" for %s', curFeedDesc.version, targetDate.strftime("%Y-%m-%d")) if dfStopTimes: dfVehPos = read_vp_parquet(spark, targetDate) calcVPDelays = \ VPDelaysCalculator(spark, targetDate, dfStopTimes, dfVehPos) dfVPDelays = calcVPDelays.create_result_df() with DBConn() as conn: dbtables.VPDelays.delete_for_parquet(conn, targetDate) conn.commit() calcVPDelays.update_db(dfVPDelays) calcHlyDelays = HlyDelaysCalculator(spark, dfVPDelays) dfHlyDelays = calcHlyDelays.create_result_df().persist() dfGrpRoutes = calcHlyDelays.group_routes(dfHlyDelays) dfGrpStops = calcHlyDelays.group_stops(dfHlyDelays) dfGrpAll = calcHlyDelays.group_all(dfHlyDelays) dfHlyDelaysBus = dfHlyDelays.filter(dfHlyDelays.RouteId.rlike("^[0-9]")) dfHlyDelaysTrain = dfHlyDelays.filter(~dfHlyDelays.RouteId.rlike("^[0-9]")) dfGrpStopsBus = calcHlyDelays.group_stops(dfHlyDelaysBus) dfGrpAllBus = calcHlyDelays.group_all(dfHlyDelaysBus) dfGrpStopsTrain = calcHlyDelays.group_stops(dfHlyDelaysTrain) dfGrpAllTrain = calcHlyDelays.group_all(dfHlyDelaysTrain) with DBConn() as conn: dbtables.HlyDelays.delete_for_parquet(conn, targetDate) conn.commit() calcHlyDelays.update_db(dfHlyDelays, targetDate) calcHlyDelays.update_db(dfGrpRoutes, targetDate) calcHlyDelays.update_db(dfGrpStops, targetDate) calcHlyDelays.update_db(dfGrpAll, targetDate) calcHlyDelays.update_db(dfGrpStopsBus, targetDate, "ALLBUSES") calcHlyDelays.update_db(dfGrpAllBus, targetDate, "ALLBUSES") calcHlyDelays.update_db(dfGrpStopsTrain, targetDate, "ALLTRAINS") calcHlyDelays.update_db(dfGrpAllTrain, targetDate, "ALLTRAINS")
def run(spark): """Combines GTFS schedule feed with vehicle positions Parquet files and updates the VPDelays and HlyDelays tables Args: spark: Spark Session object """ log = utils.get_logger() with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.HlyDelays) feedDescs = GTFSFetcher.read_feed_descs() curFeedDesc = None dfStopTimes = None feedRequiredFiles = ["stops.txt", "stop_times.txt", "trips.txt"] gtfsFetcher = GTFSFetcher(spark) with DBConn() as conn: entriesToProcess = dbtables.PqDates \ .select_pqdates_not_in_delays(conn, 'NOT IsInHlyDelaysS3') for targetDate in entriesToProcess: if dfStopTimes is None or not curFeedDesc.includes_date(targetDate): curFeedDesc = None dfStopTimes = None for fd in feedDescs: if fd.includes_date(targetDate) and fd.includes_files( feedRequiredFiles): curFeedDesc = fd dfStopTimes = gtfsFetcher.read_stop_times(curFeedDesc) log.info('USING FEED "%s" for %s', curFeedDesc.version, targetDate.strftime("%Y-%m-%d")) break else: log.info('RE-USING FEED "%s" for %s', curFeedDesc.version, targetDate.strftime("%Y-%m-%d")) if dfStopTimes: dfVehPos = read_vp_parquet(spark, targetDate) calcVPDelays = \ VPDelaysCalculator(spark, targetDate, dfStopTimes, dfVehPos) dfVPDelays = calcVPDelays.create_result_df() cols_order = [ 'RouteId', 'StopName', 'DateEST', 'HourEST', 'AvgDelay', 'AvgDist', 'Cnt' ] calcHlyDelays = HlyDelaysCalculator(spark, dfVPDelays) dfHlyDelays = calcHlyDelays.create_result_df().persist() dfGrpRoutes = calcHlyDelays.group_routes(dfHlyDelays) \ .withColumn('StopName', F.lit('ALLSTOPS')) dfGrpStops = calcHlyDelays.group_stops(dfHlyDelays) \ .withColumn('RouteId', F.lit('ALLROUTES')) dfGrpAll = calcHlyDelays.group_all(dfHlyDelays) \ .withColumn('RouteId', F.lit('ALLROUTES')) \ .withColumn('StopName', F.lit('ALLSTOPS')) dfHlyDelaysBus = dfHlyDelays.filter( dfHlyDelays.RouteId.rlike("^[0-9]")) dfHlyDelaysTrain = dfHlyDelays.filter( ~dfHlyDelays.RouteId.rlike("^[0-9]")) dfGrpStopsBus = calcHlyDelays.group_stops(dfHlyDelaysBus) \ .withColumn('RouteId', F.lit('ALLBUSES')) dfGrpAllBus = calcHlyDelays.group_all(dfHlyDelaysBus) \ .withColumn('RouteId', F.lit('ALLBUSES')) \ .withColumn('StopName', F.lit('ALLSTOPS')) dfGrpStopsTrain = calcHlyDelays.group_stops(dfHlyDelaysTrain) \ .withColumn('RouteId', F.lit('ALLTRAINS')) dfGrpAllTrain = calcHlyDelays.group_all(dfHlyDelaysTrain) \ .withColumn('RouteId', F.lit('ALLTRAINS')) \ .withColumn('StopName', F.lit('ALLSTOPS')) dfAllHly = dfHlyDelays[cols_order] \ .union(dfGrpRoutes[cols_order]) \ .union(dfGrpStops[cols_order]) \ .union(dfGrpAll[cols_order]) \ .union(dfGrpStopsBus[cols_order]) \ .union(dfGrpAllBus[cols_order]) \ .union(dfGrpStopsTrain[cols_order]) \ .union(dfGrpAllTrain[cols_order]) with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.RouteStops) data = dfAllHly[['RouteId', 'StopName']] \ .distinct() \ .collect() dbtables.RouteStops.insert_values(conn, data) conn.commit() calcHlyDelays.update_s3(dfAllHly, targetDate) with DBConn() as conn: dbtables.PqDates.update_in_delays(conn, targetDate, "IsInHlyDelaysS3") conn.commit()