Exemplo n.º 1
0
def main():
    """Processes Parquet dates flagged IsInHlyDelaysS3 but not yet
    IsInHlyDelaysDyn

    Each selected date is handed to _process_pqdate and is afterwards flagged
    IsInHlyDelaysDyn in the PqDates table, with one commit per date.
    """

    pending_filter = 'IsInHlyDelaysS3 AND NOT IsInHlyDelaysDyn'
    with DBConn() as conn:
        pending_dates = dbtables.PqDates.select_pqdates_not_in_delays(
            conn, pending_filter)

    for pq_date in pending_dates:
        _process_pqdate(pq_date)
        # A fresh connection per date: the flag is committed as soon as the
        # date has been processed
        with DBConn() as conn:
            dbtables.PqDates.update_in_delays(
                conn, pq_date, "IsInHlyDelaysDyn")
            conn.commit()
Exemplo n.º 2
0
 def _push_hlydelays_dbtpls(rows, pqDate, noRouteVal):
   """Inserts rows into the HlyDelays table over a single connection

   A commit is issued whenever the connection's uncommited counter is a
   multiple of 1000 right after an insert, and once more after the last row.

   Args:
     rows: iterable of rows, each passed to HlyDelays.insert_row
     pqDate: forwarded unchanged to HlyDelays.insert_row
     noRouteVal: forwarded unchanged to HlyDelays.insert_row
   """
   commit_every = 1000
   with DBConn() as conn:
     for record in rows:
       dbtables.HlyDelays.insert_row(conn, record, pqDate, noRouteVal)
       if conn.uncommited % commit_every != 0:
         continue
       conn.commit()
     # Flush whatever is left after the last row
     conn.commit()
Exemplo n.º 3
0
def run(spark):
    """Indexes Protobuf files by updating the S3Prefixes and VehPosPb tables

    Every prefix returned by explore_s3prefixes is listed in S3; the keys
    found are mapped to VehPosPb tuples on the Spark cluster and pushed with
    push_vehpospb_dbtpls. The prefix is then recorded in S3Prefixes together
    with its key count, also when no keys were found.

    Args:
        spark: Spark Session object
    """

    log = utils.get_logger()

    with DBConnCommonQueries() as conn:
        for table in (dbtables.S3Prefixes, dbtables.VehPosPb):
            dbtables.create_if_not_exists(conn, table)

    pending_prefixes = explore_s3prefixes()
    s3Mgr = s3.S3Mgr()
    for pfx in pending_prefixes:
        keys = s3Mgr.fetch_keys('/'.join(("pb", "VehiclePos", pfx)))
        num_keys = len(keys)

        if num_keys > 0:
            log.info("PROCESSING %d KEYS FOR %s", num_keys, pfx)
            spark.sparkContext.parallelize(keys) \
              .map(dbtables.VehPosPb.build_tuple_from_protobuf) \
              .foreachPartition(push_vehpospb_dbtpls)
            log.info("PROCESSED %d KEYS FOR %s", num_keys, pfx)

        # Record the prefix only after its keys went through
        with DBConn() as conn:
            dbtables.S3Prefixes.insert_values(conn, pfx, num_keys)
            conn.commit()
        log.info("PUSHED S3Prefix %s", str((pfx, num_keys)))
Exemplo n.º 4
0
def explore_s3prefixes():
    """Computes a list of S3 prefixes under which to check for new
    Protobuf files

    A prefix has the form YYYYMMDD/HH. Candidates are all hours from
    2020/01/01 00:00 onwards whose start lies more than 70 minutes before the
    current UTC time; those already present in the S3Prefixes table are
    left out.

    Returns:
        list of prefix strings in chronological order
    """

    one_hour = timedelta(hours=1)
    min_age = timedelta(hours=1, minutes=10)

    candidates = []
    hour_start = datetime(2020, 1, 1)
    now_utc = datetime.utcnow()
    while hour_start + min_age < now_utc:
        candidates.append(hour_start.strftime("%Y%m%d/%H"))
        hour_start += one_hour

    with DBConn() as conn:
        known = dbtables.S3Prefixes.select_prefixes_dict(conn)

    return [pfx for pfx in candidates if pfx not in known]
Exemplo n.º 5
0
def run(spark):
    """Updates the vehicle positions database table

    Protobuf keys not yet marked as being in VehPos are handled in chunks of
    1000: each chunk is expanded into VehPos tuples on the Spark cluster,
    reduced to one tuple per (field 1, field 3) pair, inserted with
    push_vehpos_db, and its keys are then marked with set_vehpospb_invehpos.

    Args:
        spark: Spark Session object
    """

    log = utils.get_logger()

    with DBConnCommonQueries() as conn:
        dbtables.create_if_not_exists(conn, dbtables.VehPos)

    with DBConn() as conn:
        keys = dbtables.VehPosPb.select_protobuf_keys_not_invehpos(conn)
    total = len(keys)
    log.info("Got %d keys", total)

    chunk_size = 1000
    for start in range(0, total, chunk_size):
        stop = min(start + chunk_size, total)
        chunk = keys[start:stop]

        deduped = spark.sparkContext \
          .parallelize(chunk) \
          .flatMap(dbtables.VehPos.build_db_tuples_from_pb) \
          .map(lambda rec: ((rec[1], rec[3]), rec)) \
          .reduceByKey(lambda kept, _dup: kept)
        deduped.foreachPartition(push_vehpos_db)
        log.info("Inserted records for keys  %d-%d", start, stop - 1)

        # Flag the chunk only after its records were inserted
        spark.sparkContext \
          .parallelize(chunk) \
          .foreachPartition(set_vehpospb_invehpos)
        log.info("Updated IsInVehPos for keys %d-%d", start, stop - 1)
Exemplo n.º 6
0
 def _push_vpdelays_dbtpls(rows, pqDate):
   """Inserts rows into the VPDelays table over a single connection

   A commit is issued whenever the connection's uncommited counter is a
   multiple of 1000 right after an insert, and once more after the last row.

   Args:
     rows: iterable of rows, each passed to VPDelays.insert_row
     pqDate: forwarded unchanged to VPDelays.insert_row
   """
   commit_every = 1000
   with DBConn() as conn:
     for record in rows:
       dbtables.VPDelays.insert_row(conn, record, pqDate)
       if conn.uncommited % commit_every != 0:
         continue
       conn.commit()
     # Flush whatever is left after the last row
     conn.commit()
Exemplo n.º 7
0
def run(spark):
  """Updates Parquet files in S3 and the PqDate table

  For every date returned by fetch_parquet_dts the Protobuf keys of that day
  are expanded into vehicle position tuples on the Spark cluster, reduced to
  one tuple per (DT, TripId) pair and written as a Parquet dataset. The date
  is then recorded in PqDates with its key count and record count (zero
  records when the date has no keys).

  Args:
    spark: Spark Session object
  """

  log = utils.get_logger()

  with DBConnCommonQueries() as conn:
    dbtables.create_if_not_exists(conn, dbtables.PqDates)

  for targetDate in fetch_parquet_dts():
    keys = fetch_keys_for_date(targetDate)
    log.info("Got %d keys of %s", len(keys), str(targetDate))

    numRecs = 0
    if len(keys) > 0:
      # Route the keys through a pair RDD so that partitionBy can spread
      # them over Settings.NumPartitions partitions, then drop the dummy value
      rddKeys = spark.sparkContext \
        .parallelize(keys) \
        .map(lambda key: (key, key)) \
        .partitionBy(Settings.NumPartitions) \
        .map(lambda pair: pair[0])

      # Keep a single tuple for every (tuple[1], tuple[3]) combination
      rddVP = rddKeys \
        .flatMap(dbtables.VehPos.build_df_tuples_from_pb) \
        .map(lambda tpl: ((tpl[1], tpl[3]), tpl)) \
        .reduceByKey(lambda kept, _dup: kept) \
        .map(lambda pair: pair[1])

      # (column name, Spark type, nullable)
      columns = [
        ("RouteId", StringType(), True),
        ("DT", TimestampType(), False),
        ("VehicleId", StringType(), False),
        ("TripId", StringType(), False),
        ("Lat", DoubleType(), False),
        ("Lon", DoubleType(), False),
        ("Status", IntegerType(), True),
        ("StopSeq", IntegerType(), True),
        ("StopId", StringType(), True),
      ]
      schema = StructType([
        StructField(colName, colType, nullable)
        for colName, colType, nullable in columns
      ])
      dfVP = spark.createDataFrame(rddVP, schema)
      log.info("Created dataframe for %d keys of %s", len(keys), str(targetDate))

      pqName = "VP-" + targetDate.strftime("%Y%m%d")
      pqPath = "s3a://alxga-insde/%s" % '/'.join(["parquet", pqName])
      dfVP.write.format("parquet").mode("overwrite").save(pqPath)
      log.info("Written to Parquet %d keys of %s", len(keys), str(targetDate))
      numRecs = dfVP.count()

    with DBConn() as conn:
      dbtables.PqDates.insert_values(conn, targetDate, len(keys), numRecs)
      conn.commit()
0
def fetch_keys_for_date(dt):
  """Retrieves Protobuf files S3 keys for a Parquet date

  The day is taken to run from 8:00 UTC on the given date to 8:00 UTC on the
  following date; both bounds are passed to
  VehPosPb.select_protobuf_keys_between_dates.

  Args:
    dt: target Parquet file date (its year, month and day are used)

  Returns:
    whatever VehPosPb.select_protobuf_keys_between_dates returns for the
    computed interval
  """

  with DBConn() as conn:
    # a new day starts at 8:00 UTC (3 or 4 at night Boston time)
    dayStart = datetime(dt.year, dt.month, dt.day, 8)
    dayEnd = dayStart + timedelta(days=1)
    keys = dbtables.VehPosPb.select_protobuf_keys_between_dates(
      conn, dayStart, dayEnd)
    return keys
Exemplo n.º 9
0
def set_vehpospb_invehpos(objKeys):
    """Marks S3 Protobuf keys as processed into the VehPos table

    All updates share one connection; a commit is issued once the
    connection's uncommited counter reaches 100, and once more at the end.

    Args:
        objKeys: keys for the Protobuf S3 objects
    """

    flush_threshold = 100
    with DBConn() as conn:
        for s3_key in objKeys:
            dbtables.VehPosPb.update_invehpos(conn, s3_key)
            if conn.uncommited < flush_threshold:
                continue
            conn.commit()
        # Flush whatever is left after the last key
        conn.commit()
Exemplo n.º 10
0
def push_vehpospb_dbtpls(tpls):
    """Pushes records to the VehPosPb table

    All inserts share one connection; a commit is issued whenever the
    connection's uncommited counter is a multiple of 1000 right after an
    insert, and once more at the end.

    Args:
        tpls: records to add, each contains metadata for a single Protobuf
          file
    """

    commit_every = 1000
    with DBConn() as conn:
        for record in tpls:
            dbtables.VehPosPb.insert_tpl(conn, record)
            if conn.uncommited % commit_every != 0:
                continue
            conn.commit()
        # Flush whatever is left after the last record
        conn.commit()
Exemplo n.º 11
0
def _db_get_routeids():
    """Returns the distinct route ids stored in the RouteStops table

    The aggregate pseudo-routes ALLROUTES, ALLTRAINS and ALLBUSES are
    excluded by the query, and empty or NULL ids are dropped from the result.

    Returns:
        list of route ids in the order produced by the query's ORDER BY
    """
    query = """
    SELECT DISTINCT RouteId FROM RouteStops
    WHERE RouteId <> 'ALLROUTES' AND RouteId <> 'ALLTRAINS' AND
      RouteId <> 'ALLBUSES'
    ORDER BY 1;
  """
    route_ids = []
    with DBConn() as con:
        route_ids.extend(row[0] for row in con.execute(query) if row[0])
    return route_ids
Exemplo n.º 12
0
def _delete_for_pq_dates(pqDates):
    """Deletes the items of the given Parquet dates from the hlydelays table
    obtained through dyndb.DynDBMgr

    One delete is issued for every combination of a RouteStops row and a
    date. The item key consists of 'route_stop' (the first two columns of the
    row formatted as '<col0>:::[<col1>]') and 'date' (the date formatted as
    YYYYMMDD). Progress is logged every 100 deletes.

    Args:
        pqDates: sized collection of dates (objects with strftime) to delete
    """
    with DBConn() as conn:
        all_route_stops = dbtables.RouteStops.select_all(conn)
    partition_keys = [f'{rs[0]}:::[{rs[1]}]' for rs in all_route_stops]

    dynMgr = dyndb.DynDBMgr()
    # The table name carries the configured maximum absolute delay;
    # non-positive settings map to the '0' suffix
    if Settings.MaxAbsDelay <= 0:
        table_sfx = '0'
    else:
        table_sfx = str(Settings.MaxAbsDelay)
    table = dynMgr.table(f'hlydelays{table_sfx}')

    count = 0
    total = len(partition_keys) * len(pqDates)
    with table.batch_writer() as writer:
        for pqDate in pqDates:
            date_key = pqDate.strftime('%Y%m%d')
            for partition_key in partition_keys:
                writer.delete_item(
                    {'route_stop': partition_key, 'date': date_key})
                count += 1
                if count % 100 == 0:
                    logger.info(f'Deleted {count} of {total} keys')
Exemplo n.º 13
0
def push_vehpos_db(keyTpls):
    """Adds multiple records to the VehPos table

    Records are inserted and committed in batches of 100; a final, smaller
    batch is flushed at the end when needed.

    Args:
        keyTpls: iterable of (key, tpl) pairs where key is unused and tpl
          is the record inserted into the table
    """

    batch_size = 100
    with DBConn() as conn:
        batch = []
        for pair in keyTpls:
            batch.append(pair[1])
            if len(batch) < batch_size:
                continue
            dbtables.VehPos.insert_tpls(conn, batch)
            conn.commit()
            batch = []
        if batch:
            dbtables.VehPos.insert_tpls(conn, batch)
            conn.commit()
Exemplo n.º 14
0
def main():
    """Archives GTFS feeds that have become obsolete

    The feed descriptions are read from GTFS/MBTA_archived_feeds.txt in S3.
    A feed is passed to archive_gtfs_files when its end date lies more than
    Settings.GTFS_ObsoleteAfterDays days before the latest processed Parquet
    date; nothing is archived when no Parquet date has been processed yet.

    NOTE(review): per the original documentation the archiving compresses
    the feed's text files to tar.bz2, uploads the archive to S3 and removes
    the text files - that is done inside archive_gtfs_files, not shown here.
    """
    s3Mgr = s3.S3Mgr()
    feeds_listing = s3Mgr.fetch_object_body(
        '/'.join(["GTFS", "MBTA_archived_feeds.txt"]))
    feedDescs = gtfs.read_feed_descs(feeds_listing)

    with DBConn() as conn:
        latest_processed = dbtables.PqDates.select_latest_processed(conn)
    if not latest_processed:
        return
    logger.info('Latest processed parquet date is %s' % str(latest_processed))

    for fd in feedDescs:
        age = latest_processed - fd.endDate
        age_days = age.total_seconds() / (24 * 3600)
        if age_days > Settings.GTFS_ObsoleteAfterDays:
            archive_gtfs_files(s3Mgr, fd)
Exemplo n.º 15
0
def fetch_parquet_dts():
  """Computes dates for which Parquet files need to be created

  Candidates are all days from 2020/01/01 onwards for which more than
  1 day, 8 hours and 10 minutes have passed (in UTC) since their midnight;
  days whose date is already among the existing PqDates are left out.

  Returns:
    list of datetime objects (midnight of each day) in chronological order
  """

  one_day = timedelta(days=1)
  # a day runs until 8:00 UTC of the next date (3 or 4 at night Boston time);
  # 10 more minutes are required on top of that
  ready_after = timedelta(days=1, hours=8, minutes=10)

  candidates = []
  day = datetime(2020, 1, 1)
  now_utc = datetime.utcnow()
  while day + ready_after < now_utc:
    candidates.append(day)
    day += one_day

  with DBConn() as conn:
    existing = dbtables.PqDates.select_existing_pqdates(conn)

  return [d for d in candidates if d.date() not in existing]
Exemplo n.º 16
0
def _get_stopnames_db(routeId, q):
    """Returns stop names from RouteStops that start with a given prefix

    Empty names and the ALLSTOPS pseudo-stop are skipped. Names are stripped
    of surrounding whitespace, and the prefix is compared against the
    lower-cased name, so q only matches when it is lower-case itself.

    Args:
        routeId: restricts the search to this route; all routes are searched
          when it is empty or None
        q: prefix the lower-cased stop name has to start with

    Returns:
        list of matching stop names in the order produced by the query
    """
    query = """
    SELECT DISTINCT StopName FROM RouteStops
  """
    params = None
    if routeId:
        query += " WHERE RouteId = %s"
        params = (routeId, )
    query += " ORDER BY 1;"

    matches = []
    with DBConn() as con:
        for row in con.execute(query, params):
            raw_name = row[0]
            if raw_name and raw_name != 'ALLSTOPS':
                name = raw_name.strip()
                if name.lower().startswith(q):
                    matches.append(name)
    return matches
Exemplo n.º 17
0
def run(spark):
    """Combines the GTFS schedule feed with vehicle positions Parquet files
    and publishes the resulting hourly delays

    Every Parquet date selected from PqDates with 'NOT IsInHlyDelaysS3' is
    processed as follows: a GTFS feed that covers the date and contains the
    required files is selected (the previous feed is re-used while it still
    covers the date), hourly delays and their aggregates (ALLSTOPS,
    ALLROUTES, ALLBUSES, ALLTRAINS) are computed, the distinct
    (RouteId, StopName) pairs are inserted into RouteStops, the combined
    result is passed to HlyDelaysCalculator.update_s3 and the date is flagged
    IsInHlyDelaysS3. Dates without a suitable feed are skipped and stay
    unflagged.

    Args:
        spark: Spark Session object
    """

    log = utils.get_logger()

    with DBConnCommonQueries() as conn:
        dbtables.create_if_not_exists(conn, dbtables.HlyDelays)

    feedDescs = GTFSFetcher.read_feed_descs()
    curFeedDesc = None
    dfStopTimes = None
    feedRequiredFiles = ["stops.txt", "stop_times.txt", "trips.txt"]

    gtfsFetcher = GTFSFetcher(spark)
    with DBConn() as conn:
        entriesToProcess = dbtables.PqDates \
          .select_pqdates_not_in_delays(conn, 'NOT IsInHlyDelaysS3')
    for targetDate in entriesToProcess:
        if dfStopTimes is None or not curFeedDesc.includes_date(targetDate):
            # No usable feed is loaded: take the first feed that covers the
            # date and has all required files
            curFeedDesc = None
            dfStopTimes = None
            for fd in feedDescs:
                if fd.includes_date(targetDate) and fd.includes_files(
                        feedRequiredFiles):
                    curFeedDesc = fd
                    dfStopTimes = gtfsFetcher.read_stop_times(curFeedDesc)
                    log.info('USING FEED "%s" for %s', curFeedDesc.version,
                             targetDate.strftime("%Y-%m-%d"))
                    break
        else:
            log.info('RE-USING FEED "%s" for %s', curFeedDesc.version,
                     targetDate.strftime("%Y-%m-%d"))

        if dfStopTimes:
            dfVehPos = read_vp_parquet(spark, targetDate)

            calcVPDelays = \
              VPDelaysCalculator(spark, targetDate, dfStopTimes, dfVehPos)
            dfVPDelays = calcVPDelays.create_result_df()

            cols_order = [
                'RouteId', 'StopName', 'DateEST', 'HourEST', 'AvgDelay',
                'AvgDist', 'Cnt'
            ]
            calcHlyDelays = HlyDelaysCalculator(spark, dfVPDelays)
            # Cached because every aggregate below is derived from it
            dfHlyDelays = calcHlyDelays.create_result_df().persist()
            try:
                dfGrpRoutes = calcHlyDelays.group_routes(dfHlyDelays) \
                  .withColumn('StopName', F.lit('ALLSTOPS'))
                dfGrpStops = calcHlyDelays.group_stops(dfHlyDelays) \
                  .withColumn('RouteId', F.lit('ALLROUTES'))
                dfGrpAll = calcHlyDelays.group_all(dfHlyDelays) \
                  .withColumn('RouteId', F.lit('ALLROUTES')) \
                  .withColumn('StopName', F.lit('ALLSTOPS'))
                # Route ids starting with a digit are treated as buses,
                # all others as trains
                dfHlyDelaysBus = dfHlyDelays.filter(
                    dfHlyDelays.RouteId.rlike("^[0-9]"))
                dfHlyDelaysTrain = dfHlyDelays.filter(
                    ~dfHlyDelays.RouteId.rlike("^[0-9]"))
                dfGrpStopsBus = calcHlyDelays.group_stops(dfHlyDelaysBus) \
                  .withColumn('RouteId', F.lit('ALLBUSES'))
                dfGrpAllBus = calcHlyDelays.group_all(dfHlyDelaysBus) \
                  .withColumn('RouteId', F.lit('ALLBUSES')) \
                  .withColumn('StopName', F.lit('ALLSTOPS'))
                dfGrpStopsTrain = \
                  calcHlyDelays.group_stops(dfHlyDelaysTrain) \
                  .withColumn('RouteId', F.lit('ALLTRAINS'))
                dfGrpAllTrain = calcHlyDelays.group_all(dfHlyDelaysTrain) \
                  .withColumn('RouteId', F.lit('ALLTRAINS')) \
                  .withColumn('StopName', F.lit('ALLSTOPS'))

                # union() matches columns by position, hence the common
                # column order applied to every part
                dfAllHly = dfHlyDelays[cols_order] \
                  .union(dfGrpRoutes[cols_order]) \
                  .union(dfGrpStops[cols_order]) \
                  .union(dfGrpAll[cols_order]) \
                  .union(dfGrpStopsBus[cols_order]) \
                  .union(dfGrpAllBus[cols_order]) \
                  .union(dfGrpStopsTrain[cols_order]) \
                  .union(dfGrpAllTrain[cols_order])

                with DBConnCommonQueries() as conn:
                    dbtables.create_if_not_exists(conn, dbtables.RouteStops)
                    data = dfAllHly[['RouteId', 'StopName']] \
                      .distinct() \
                      .collect()
                    dbtables.RouteStops.insert_values(conn, data)
                    conn.commit()

                calcHlyDelays.update_s3(dfAllHly, targetDate)
            finally:
                # Release the cached data of this date so that it does not
                # pile up over the dates processed in one run
                dfHlyDelays.unpersist()

            # Flag the date only after its data was published
            with DBConn() as conn:
                dbtables.PqDates.update_in_delays(conn, targetDate,
                                                  "IsInHlyDelaysS3")
                conn.commit()
Exemplo n.º 18
0
def run(spark):
  """Combines GTFS schedule feed with vehicle positions Parquet files
  and updates the VPDelays and HlyDelays tables

  For each processed date a GTFS feed that covers it and contains the
  required files is selected (the previous feed is re-used while it still
  covers the date). The VPDelays and HlyDelays rows of the date are deleted
  and replaced by freshly computed ones, including the per-route, per-stop,
  bus, train and overall aggregates. Dates without a suitable feed are
  skipped.

  Args:
    spark: Spark Session object
  """

  log = utils.get_logger()

  with DBConnCommonQueries() as conn:
    dbtables.create_if_not_exists(conn, dbtables.VPDelays)
    dbtables.create_if_not_exists(conn, dbtables.HlyDelays)

  feedDescs = GTFSFetcher.read_feed_descs()
  curFeedDesc = None
  dfStopTimes = None
  feedRequiredFiles = ["stops.txt", "stop_times.txt", "trips.txt"]

  gtfsFetcher = GTFSFetcher(spark)
  # NOTE(review): the database-driven selection below is disabled and a
  # single hard-coded date is processed instead - looks like a debugging
  # leftover, confirm before relying on this job
  # with DBConn() as conn:
  #   entriesToProcess = dbtables.PqDates \
  #     .select_pqdates_not_in_delays(conn, 'NOT IsInHlyDelays')
  entriesToProcess = [date(2020, 8, 20)]
  for targetDate in entriesToProcess:
    if dfStopTimes is None or not curFeedDesc.includes_date(targetDate):
      # No usable feed is loaded: take the first feed that covers the date
      # and has all required files
      curFeedDesc = None
      dfStopTimes = None
      for fd in feedDescs:
        if fd.includes_date(targetDate) and fd.includes_files(feedRequiredFiles):
          curFeedDesc = fd
          dfStopTimes = gtfsFetcher.read_stop_times(curFeedDesc)
          log.info('USING FEED "%s" for %s', curFeedDesc.version,
                   targetDate.strftime("%Y-%m-%d"))
          break
    else:
      log.info('RE-USING FEED "%s" for %s', curFeedDesc.version,
               targetDate.strftime("%Y-%m-%d"))

    if dfStopTimes:
      dfVehPos = read_vp_parquet(spark, targetDate)

      calcVPDelays = \
        VPDelaysCalculator(spark, targetDate, dfStopTimes, dfVehPos)
      dfVPDelays = calcVPDelays.create_result_df()

      # Replace the VPDelays rows of this date
      with DBConn() as conn:
        dbtables.VPDelays.delete_for_parquet(conn, targetDate)
        conn.commit()
      calcVPDelays.update_db(dfVPDelays)

      calcHlyDelays = HlyDelaysCalculator(spark, dfVPDelays)
      # Cached because every aggregate below is derived from it
      dfHlyDelays = calcHlyDelays.create_result_df().persist()
      try:
        dfGrpRoutes = calcHlyDelays.group_routes(dfHlyDelays)
        dfGrpStops = calcHlyDelays.group_stops(dfHlyDelays)
        dfGrpAll = calcHlyDelays.group_all(dfHlyDelays)
        # Route ids starting with a digit are treated as buses, all others
        # as trains
        dfHlyDelaysBus = dfHlyDelays.filter(
          dfHlyDelays.RouteId.rlike("^[0-9]"))
        dfHlyDelaysTrain = dfHlyDelays.filter(
          ~dfHlyDelays.RouteId.rlike("^[0-9]"))
        dfGrpStopsBus = calcHlyDelays.group_stops(dfHlyDelaysBus)
        dfGrpAllBus = calcHlyDelays.group_all(dfHlyDelaysBus)
        dfGrpStopsTrain = calcHlyDelays.group_stops(dfHlyDelaysTrain)
        dfGrpAllTrain = calcHlyDelays.group_all(dfHlyDelaysTrain)

        # Replace the HlyDelays rows of this date
        with DBConn() as conn:
          dbtables.HlyDelays.delete_for_parquet(conn, targetDate)
          conn.commit()

        calcHlyDelays.update_db(dfHlyDelays, targetDate)
        calcHlyDelays.update_db(dfGrpRoutes, targetDate)
        calcHlyDelays.update_db(dfGrpStops, targetDate)
        calcHlyDelays.update_db(dfGrpAll, targetDate)
        calcHlyDelays.update_db(dfGrpStopsBus, targetDate, "ALLBUSES")
        calcHlyDelays.update_db(dfGrpAllBus, targetDate, "ALLBUSES")
        calcHlyDelays.update_db(dfGrpStopsTrain, targetDate, "ALLTRAINS")
        calcHlyDelays.update_db(dfGrpAllTrain, targetDate, "ALLTRAINS")
      finally:
        # Release the cached data of this date so that it does not pile up
        # over the dates processed in one run
        dfHlyDelays.unpersist()