def main(): """Checks for feed updates on the MBTA website and saves any updates to S3 """ with DBConn() as conn: wh_stmt = 'IsInHlyDelaysS3 AND NOT IsInHlyDelaysDyn' entriesToProcess = dbtables.PqDates \ .select_pqdates_not_in_delays(conn, wh_stmt) for targetDate in entriesToProcess: _process_pqdate(targetDate) with DBConn() as conn: dbtables.PqDates.update_in_delays(conn, targetDate, "IsInHlyDelaysDyn") conn.commit()
def _push_hlydelays_dbtpls(rows, pqDate, noRouteVal):
  """Inserts hourly delay rows into the HlyDelays table for a Parquet date,
  committing after every 1000 uncommitted inserts
  """
  with DBConn() as conn:
    for row in rows:
      dbtables.HlyDelays.insert_row(conn, row, pqDate, noRouteVal)
      if conn.uncommited % 1000 == 0:
        conn.commit()
    conn.commit()
def run(spark): """Indexes Protobuf files by updating the S3Prefixes and VehPosPb tables Args: spark: Spark Session object """ log = utils.get_logger() with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.S3Prefixes) dbtables.create_if_not_exists(conn, dbtables.VehPosPb) pfxs = explore_s3prefixes() s3Mgr = s3.S3Mgr() for pfx in pfxs: fullPfx = '/'.join(("pb", "VehiclePos", pfx)) keys = s3Mgr.fetch_keys(fullPfx) if len(keys) > 0: log.info("PROCESSING %d KEYS FOR %s", len(keys), pfx) file_list = spark.sparkContext.parallelize(keys) file_list \ .map(dbtables.VehPosPb.build_tuple_from_protobuf) \ .foreachPartition(push_vehpospb_dbtpls) log.info("PROCESSED %d KEYS FOR %s", len(keys), pfx) tpl = (pfx, len(keys)) with DBConn() as conn: dbtables.S3Prefixes.insert_values(conn, pfx, len(keys)) conn.commit() log.info("PUSHED S3Prefix %s", str(tpl))
def explore_s3prefixes():
  """Computes a list of S3 prefixes under which to check for new Protobuf files

  A prefix is a combination of a date and an hour when the Protobufs were
  downloaded, e.g. "20200101/00". Each hour starting on 2020/01/01 00:00 and
  ending 70 minutes before the current time is returned unless it's marked in
  the S3Prefixes table as processed.
  """
  ret = []
  pfxDT = datetime(2020, 1, 1)
  utcNow = datetime.utcnow()
  pfxs = []
  while pfxDT + timedelta(hours=1, minutes=10) < utcNow:
    pfxs.append(pfxDT.strftime("%Y%m%d/%H"))
    pfxDT += timedelta(hours=1)

  with DBConn() as conn:
    exPrefixes = dbtables.S3Prefixes.select_prefixes_dict(conn)

  for pfx in pfxs:
    if pfx not in exPrefixes:
      ret.append(pfx)
  return ret
def run(spark): """Updates the vehicle positions database table Args: spark: Spark Session object """ log = utils.get_logger() with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.VehPos) with DBConn() as conn: keys = dbtables.VehPosPb.select_protobuf_keys_not_invehpos(conn) log.info("Got %d keys", len(keys)) step = 1000 for i in range(0, len(keys), step): lower = i upper = i + step if i + step < len(keys) else len(keys) keysSubrange = keys[lower:upper] records = spark.sparkContext \ .parallelize(keysSubrange) \ .flatMap(dbtables.VehPos.build_db_tuples_from_pb) \ .map(lambda tpl: ((tpl[1], tpl[3]), tpl)) \ .reduceByKey(lambda x, y: x) records.foreachPartition(push_vehpos_db) log.info("Inserted records for keys %d-%d", lower, upper - 1) spark.sparkContext \ .parallelize(keysSubrange) \ .foreachPartition(set_vehpospb_invehpos) log.info("Updated IsInVehPos for keys %d-%d", lower, upper - 1)
def _push_vpdelays_dbtpls(rows, pqDate):
  """Inserts delay rows into the VPDelays table for a Parquet date, committing
  after every 1000 uncommitted inserts
  """
  with DBConn() as conn:
    for row in rows:
      dbtables.VPDelays.insert_row(conn, row, pqDate)
      if conn.uncommited % 1000 == 0:
        conn.commit()
    conn.commit()
def run(spark): """Updates Parquet files in S3 and the PqDate table Args: spark: Spark Session object """ log = utils.get_logger() with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.PqDates) targetDates = fetch_parquet_dts() for targetDate in targetDates: keys = fetch_keys_for_date(targetDate) log.info("Got %d keys of %s", len(keys), str(targetDate)) if len(keys) > 0: rddKeys = spark.sparkContext \ .parallelize(keys) \ .map(lambda x: (x, x)) \ .partitionBy(Settings.NumPartitions) \ .map(lambda x: x[0]) rddVP = rddKeys \ .flatMap(dbtables.VehPos.build_df_tuples_from_pb) \ .map(lambda tpl: ((tpl[1], tpl[3]), tpl)) \ .reduceByKey(lambda x, y: x).map(lambda x: x[1]) schema = StructType([ StructField("RouteId", StringType(), True), StructField("DT", TimestampType(), False), StructField("VehicleId", StringType(), False), StructField("TripId", StringType(), False), StructField("Lat", DoubleType(), False), StructField("Lon", DoubleType(), False), StructField("Status", IntegerType(), True), StructField("StopSeq", IntegerType(), True), StructField("StopId", StringType(), True), ]) dfVP = spark.createDataFrame(rddVP, schema) log.info("Created dataframe for %d keys of %s", len(keys), str(targetDate)) pqKey = targetDate.strftime("%Y%m%d") pqKey = '/'.join(["parquet", "VP-" + pqKey]) pqKey = "s3a://alxga-insde/%s" % pqKey dfVP.write.format("parquet").mode("overwrite").save(pqKey) log.info("Written to Parquet %d keys of %s", len(keys), str(targetDate)) numRecs = dfVP.count() else: numRecs = 0 with DBConn() as conn: dbtables.PqDates.insert_values(conn, targetDate, len(keys), numRecs) conn.commit()
def fetch_keys_for_date(dt):
  """Retrieves the S3 keys of Protobuf files for a Parquet date

  Args:
    dt: target Parquet file date
  """
  with DBConn() as conn:
    # we define a new day to start at 8:00 UTC (3 or 4 at night Boston time)
    dt1 = datetime(dt.year, dt.month, dt.day, 8)
    dt2 = dt1 + timedelta(days=1)
    return dbtables.VehPosPb.select_protobuf_keys_between_dates(conn, dt1, dt2)
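# Illustration (hypothetical date): for a target Parquet date of 2020-03-15,
# fetch_keys_for_date selects keys of Protobuf files received between
# 2020-03-15 08:00 UTC and 2020-03-16 08:00 UTC, i.e. each "day" of data runs
# from 8:00 UTC to 8:00 UTC of the next calendar day.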
def set_vehpospb_invehpos(objKeys):
  """Marks S3 Protobuf keys as processed into the VehPos table

  Args:
    objKeys: keys for the Protobuf S3 objects
  """
  with DBConn() as conn:
    for objKey in objKeys:
      dbtables.VehPosPb.update_invehpos(conn, objKey)
      if conn.uncommited >= 100:
        conn.commit()
    conn.commit()
def push_vehpospb_dbtpls(tpls):
  """Pushes records to the VehPosPb table

  Args:
    tpls: records to add, each containing metadata for a single Protobuf file
  """
  with DBConn() as conn:
    for tpl in tpls:
      dbtables.VehPosPb.insert_tpl(conn, tpl)
      if conn.uncommited % 1000 == 0:
        conn.commit()
    conn.commit()
def _db_get_routeids():
  """Returns distinct route IDs from RouteStops, excluding the aggregate
  pseudo-routes ALLROUTES, ALLTRAINS, and ALLBUSES
  """
  sqlStmt = """
    SELECT DISTINCT RouteId FROM RouteStops
    WHERE RouteId <> 'ALLROUTES' AND RouteId <> 'ALLTRAINS' AND
          RouteId <> 'ALLBUSES'
    ORDER BY 1;
  """
  ret = []
  with DBConn() as con:
    cur = con.execute(sqlStmt)
    for row in cur:
      if row[0]:
        ret.append(row[0])
  return ret
def _delete_for_pq_dates(pqDates):
  """Deletes hourly delay items for the given Parquet dates from the DynamoDB
  hourly delays table
  """
  with DBConn() as conn:
    route_stops = dbtables.RouteStops.select_all(conn)
    prtn_keys = [f'{x[0]}:::[{x[1]}]' for x in route_stops]

  dynMgr = dyndb.DynDBMgr()
  mxdstr = '0' if Settings.MaxAbsDelay <= 0 else str(Settings.MaxAbsDelay)
  dynTbl = dynMgr.table(f'hlydelays{mxdstr}')

  count = 0
  total = len(prtn_keys) * len(pqDates)
  with dynTbl.batch_writer() as dynWriter:
    for pqDate in pqDates:
      sort_key = pqDate.strftime('%Y%m%d')
      for prtn_key in prtn_keys:
        tblKey = {'route_stop': prtn_key, 'date': sort_key}
        dynWriter.delete_item(tblKey)
        count += 1
        if count % 100 == 0:
          logger.info(f'Deleted {count} of {total} keys')
def push_vehpos_db(keyTpls):
  """Adds multiple records to the VehPos table

  Args:
    keyTpls: an iterable of (key, tpl) tuples where key is unused and each tpl
      is inserted into the table
  """
  with DBConn() as conn:
    tpls = []
    for keyTpl in keyTpls:
      tpls.append(keyTpl[1])
      if len(tpls) >= 100:
        dbtables.VehPos.insert_tpls(conn, tpls)
        conn.commit()
        tpls = []
    if len(tpls) > 0:
      dbtables.VehPos.insert_tpls(conn, tpls)
      conn.commit()
def main(): """Compresses text files from obsolete feeds in S3, uploads archives to S3, and removes the text files Uses tar.bz2 format """ s3Mgr = s3.S3Mgr() objKey = '/'.join(["GTFS", "MBTA_archived_feeds.txt"]) content = s3Mgr.fetch_object_body(objKey) feedDescs = gtfs.read_feed_descs(content) with DBConn() as conn: dtNow = dbtables.PqDates.select_latest_processed(conn) if not dtNow: return logger.info('Latest processed parquet date is %s' % str(dtNow)) for fd in feedDescs: daysDiff = (dtNow - fd.endDate).total_seconds() / (24 * 3600) if daysDiff > Settings.GTFS_ObsoleteAfterDays: archive_gtfs_files(s3Mgr, fd)
def fetch_parquet_dts():
  """Computes dates for which Parquet files need to be created
  """
  ret = []
  pfxDT = datetime(2020, 1, 1)
  utcNow = datetime.utcnow()
  dts = []
  # we define a new day to start at 8:00 UTC (3 or 4 at night Boston time)
  while pfxDT + timedelta(days=1, hours=8, minutes=10) < utcNow:
    dts.append(pfxDT)
    pfxDT += timedelta(days=1)

  with DBConn() as conn:
    exD = dbtables.PqDates.select_existing_pqdates(conn)

  for dt in dts:
    if dt.date() not in exD:
      ret.append(dt)
  return ret
def _get_stopnames_db(routeId, q):
  """Returns stop names (for one route, or all routes if routeId is falsy)
  whose lowercased name starts with q; q is expected to be lowercase
  """
  sqlStmt = """
    SELECT DISTINCT StopName FROM RouteStops
  """
  if routeId:
    sqlStmt += " WHERE RouteId = %s"
    params = (routeId, )
  else:
    params = None
  sqlStmt += " ORDER BY 1;"

  ret = []
  with DBConn() as con:
    cur = con.execute(sqlStmt, params)
    for row in cur:
      if not row[0] or row[0] == 'ALLSTOPS':
        continue
      stopName = row[0].strip()
      stopNameLower = stopName.lower()
      if stopNameLower.startswith(q):
        ret.append(stopName)
  return ret
def run(spark): """Combines GTFS schedule feed with vehicle positions Parquet files and updates the VPDelays and HlyDelays tables Args: spark: Spark Session object """ log = utils.get_logger() with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.HlyDelays) feedDescs = GTFSFetcher.read_feed_descs() curFeedDesc = None dfStopTimes = None feedRequiredFiles = ["stops.txt", "stop_times.txt", "trips.txt"] gtfsFetcher = GTFSFetcher(spark) with DBConn() as conn: entriesToProcess = dbtables.PqDates \ .select_pqdates_not_in_delays(conn, 'NOT IsInHlyDelaysS3') for targetDate in entriesToProcess: if dfStopTimes is None or not curFeedDesc.includes_date(targetDate): curFeedDesc = None dfStopTimes = None for fd in feedDescs: if fd.includes_date(targetDate) and fd.includes_files( feedRequiredFiles): curFeedDesc = fd dfStopTimes = gtfsFetcher.read_stop_times(curFeedDesc) log.info('USING FEED "%s" for %s', curFeedDesc.version, targetDate.strftime("%Y-%m-%d")) break else: log.info('RE-USING FEED "%s" for %s', curFeedDesc.version, targetDate.strftime("%Y-%m-%d")) if dfStopTimes: dfVehPos = read_vp_parquet(spark, targetDate) calcVPDelays = \ VPDelaysCalculator(spark, targetDate, dfStopTimes, dfVehPos) dfVPDelays = calcVPDelays.create_result_df() cols_order = [ 'RouteId', 'StopName', 'DateEST', 'HourEST', 'AvgDelay', 'AvgDist', 'Cnt' ] calcHlyDelays = HlyDelaysCalculator(spark, dfVPDelays) dfHlyDelays = calcHlyDelays.create_result_df().persist() dfGrpRoutes = calcHlyDelays.group_routes(dfHlyDelays) \ .withColumn('StopName', F.lit('ALLSTOPS')) dfGrpStops = calcHlyDelays.group_stops(dfHlyDelays) \ .withColumn('RouteId', F.lit('ALLROUTES')) dfGrpAll = calcHlyDelays.group_all(dfHlyDelays) \ .withColumn('RouteId', F.lit('ALLROUTES')) \ .withColumn('StopName', F.lit('ALLSTOPS')) dfHlyDelaysBus = dfHlyDelays.filter( dfHlyDelays.RouteId.rlike("^[0-9]")) dfHlyDelaysTrain = dfHlyDelays.filter( ~dfHlyDelays.RouteId.rlike("^[0-9]")) dfGrpStopsBus = calcHlyDelays.group_stops(dfHlyDelaysBus) \ .withColumn('RouteId', F.lit('ALLBUSES')) dfGrpAllBus = calcHlyDelays.group_all(dfHlyDelaysBus) \ .withColumn('RouteId', F.lit('ALLBUSES')) \ .withColumn('StopName', F.lit('ALLSTOPS')) dfGrpStopsTrain = calcHlyDelays.group_stops(dfHlyDelaysTrain) \ .withColumn('RouteId', F.lit('ALLTRAINS')) dfGrpAllTrain = calcHlyDelays.group_all(dfHlyDelaysTrain) \ .withColumn('RouteId', F.lit('ALLTRAINS')) \ .withColumn('StopName', F.lit('ALLSTOPS')) dfAllHly = dfHlyDelays[cols_order] \ .union(dfGrpRoutes[cols_order]) \ .union(dfGrpStops[cols_order]) \ .union(dfGrpAll[cols_order]) \ .union(dfGrpStopsBus[cols_order]) \ .union(dfGrpAllBus[cols_order]) \ .union(dfGrpStopsTrain[cols_order]) \ .union(dfGrpAllTrain[cols_order]) with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.RouteStops) data = dfAllHly[['RouteId', 'StopName']] \ .distinct() \ .collect() dbtables.RouteStops.insert_values(conn, data) conn.commit() calcHlyDelays.update_s3(dfAllHly, targetDate) with DBConn() as conn: dbtables.PqDates.update_in_delays(conn, targetDate, "IsInHlyDelaysS3") conn.commit()
def run(spark): """Combines GTFS schedule feed with vehicle positions Parquet files and updates the VPDelays and HlyDelays tables Args: spark: Spark Session object """ log = utils.get_logger() with DBConnCommonQueries() as conn: dbtables.create_if_not_exists(conn, dbtables.VPDelays) dbtables.create_if_not_exists(conn, dbtables.HlyDelays) feedDescs = GTFSFetcher.read_feed_descs() curFeedDesc = None dfStopTimes = None feedRequiredFiles = ["stops.txt", "stop_times.txt", "trips.txt"] gtfsFetcher = GTFSFetcher(spark) # with DBConn() as conn: # entriesToProcess = dbtables.PqDates \ # .select_pqdates_not_in_delays(conn, 'NOT IsInHlyDelays') entriesToProcess = [date(2020, 8, 20)] for targetDate in entriesToProcess: if dfStopTimes is None or not curFeedDesc.includes_date(targetDate): curFeedDesc = None dfStopTimes = None for fd in feedDescs: if fd.includes_date(targetDate) and fd.includes_files(feedRequiredFiles): curFeedDesc = fd dfStopTimes = gtfsFetcher.read_stop_times(curFeedDesc) log.info('USING FEED "%s" for %s', curFeedDesc.version, targetDate.strftime("%Y-%m-%d")) break else: log.info('RE-USING FEED "%s" for %s', curFeedDesc.version, targetDate.strftime("%Y-%m-%d")) if dfStopTimes: dfVehPos = read_vp_parquet(spark, targetDate) calcVPDelays = \ VPDelaysCalculator(spark, targetDate, dfStopTimes, dfVehPos) dfVPDelays = calcVPDelays.create_result_df() with DBConn() as conn: dbtables.VPDelays.delete_for_parquet(conn, targetDate) conn.commit() calcVPDelays.update_db(dfVPDelays) calcHlyDelays = HlyDelaysCalculator(spark, dfVPDelays) dfHlyDelays = calcHlyDelays.create_result_df().persist() dfGrpRoutes = calcHlyDelays.group_routes(dfHlyDelays) dfGrpStops = calcHlyDelays.group_stops(dfHlyDelays) dfGrpAll = calcHlyDelays.group_all(dfHlyDelays) dfHlyDelaysBus = dfHlyDelays.filter(dfHlyDelays.RouteId.rlike("^[0-9]")) dfHlyDelaysTrain = dfHlyDelays.filter(~dfHlyDelays.RouteId.rlike("^[0-9]")) dfGrpStopsBus = calcHlyDelays.group_stops(dfHlyDelaysBus) dfGrpAllBus = calcHlyDelays.group_all(dfHlyDelaysBus) dfGrpStopsTrain = calcHlyDelays.group_stops(dfHlyDelaysTrain) dfGrpAllTrain = calcHlyDelays.group_all(dfHlyDelaysTrain) with DBConn() as conn: dbtables.HlyDelays.delete_for_parquet(conn, targetDate) conn.commit() calcHlyDelays.update_db(dfHlyDelays, targetDate) calcHlyDelays.update_db(dfGrpRoutes, targetDate) calcHlyDelays.update_db(dfGrpStops, targetDate) calcHlyDelays.update_db(dfGrpAll, targetDate) calcHlyDelays.update_db(dfGrpStopsBus, targetDate, "ALLBUSES") calcHlyDelays.update_db(dfGrpAllBus, targetDate, "ALLBUSES") calcHlyDelays.update_db(dfGrpStopsTrain, targetDate, "ALLTRAINS") calcHlyDelays.update_db(dfGrpAllTrain, targetDate, "ALLTRAINS")