Example #1
def run(spark):
    """Indexes Protobuf files by updating the S3Prefixes and VehPosPb tables

  Args:
    spark: Spark Session object
  """

    log = utils.get_logger()

    with DBConnCommonQueries() as conn:
        dbtables.create_if_not_exists(conn, dbtables.S3Prefixes)
        dbtables.create_if_not_exists(conn, dbtables.VehPosPb)

    pfxs = explore_s3prefixes()
    s3Mgr = s3.S3Mgr()
    for pfx in pfxs:
        fullPfx = '/'.join(("pb", "VehiclePos", pfx))
        keys = s3Mgr.fetch_keys(fullPfx)
        if len(keys) > 0:
            log.info("PROCESSING %d KEYS FOR %s", len(keys), pfx)
            file_list = spark.sparkContext.parallelize(keys)
            file_list \
              .map(dbtables.VehPosPb.build_tuple_from_protobuf) \
              .foreachPartition(push_vehpospb_dbtpls)
            log.info("PROCESSED %d KEYS FOR %s", len(keys), pfx)
        tpl = (pfx, len(keys))

        with DBConn() as conn:
            dbtables.S3Prefixes.insert_values(conn, pfx, len(keys))
            conn.commit()
        log.info("PUSHED S3Prefix %s", str(tpl))
Example #2
def download_feed(dirName, url, *args):
  """Downloads a real-time vehicle positions file to the local storage
  and then uploads it to S3 and removes from the local storage

  Args:
    dirName: the local directory where the file will be saved initially
    url: the URL to download the file from
    *args: placeholder for any additional arguments, unused
  """

  fName = datetime.now().strftime("%Y%m%d-%H%M%S.pb")
  r = requests.get(url)
  fPath = os.path.join(Settings.ProjPath, "pb", dirName, fName)
  with open(fPath, "wb") as handle:
    handle.write(r.content)

  try:
    # always use '/' as path separator in S3
    objKey = '/'.join(["pb", dirName, fName.replace('-', '/')])
    s3Mgr = s3.S3Mgr()
    s3Mgr.upload_file(fPath, objKey)
    os.remove(fPath)

  except Exception: # pylint: disable=broad-except
    log = utils.get_logger()
    log.warning("Error while saving the file %s to S3 and/or DB", fPath)
    log.warning(traceback.format_exc())
    pass # do not interfere with other threads that might succeed
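
For illustration only (the timestamp is invented): the '-' to '/' replacement turns the flat local file name produced above into a date-partitioned S3 key.

fName = "20200101-063000.pb"  # as produced by strftime("%Y%m%d-%H%M%S.pb")
objKey = '/'.join(["pb", "VehiclePos", fName.replace('-', '/')])
# objKey == "pb/VehiclePos/20200101/063000.pb"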
Example #3
def read_feed_descs():
  """Retrieves a list of GTFS feed descriptions available on S3
  """

  objKey = '/'.join(["GTFS", "MBTA_archived_feeds.txt"])
  s3Mgr = s3.S3Mgr()
  content = s3Mgr.fetch_object_body(objKey)
  return gtfs.read_feed_descs(content)
Example #4
def _process_pqdate(pqDate):
    mxdstr = '0' if Settings.MaxAbsDelay <= 0 else str(Settings.MaxAbsDelay)
    s3_prefix = f"HlyDelays{mxdstr}/{pqDate.strftime('%Y%m%d')}.pq"
    rexpr = re.compile(r'.*part-.*\.parquet$')
    s3Mgr = s3.S3Mgr()
    for key in s3Mgr.fetch_keys(s3_prefix):
        if rexpr.match(key):
            key = f's3://{Settings.S3BucketName}/{key}'
            df = pd.read_parquet(key)
            _process_df(df, pqDate)
            logger.info(f'Processed entries from {key}')
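
A small, self-contained illustration of the key filtering above (the object keys are invented): only Parquet part files under the dated prefix are read, while markers such as _SUCCESS are skipped.

import re

rexpr = re.compile(r'.*part-.*\.parquet$')
print(bool(rexpr.match('HlyDelays0/20200101.pq/part-00000-abc.snappy.parquet')))  # True
print(bool(rexpr.match('HlyDelays0/20200101.pq/_SUCCESS')))                       # False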
Example #5
    def update_s3(self, dfHlyDelays, pqDate):
        """Saves a delays dataframe to S3 in Parquet format
        """

        s3Mgr = s3.S3Mgr()
        mxdstr = '0' if Settings.MaxAbsDelay <= 0 else str(
            Settings.MaxAbsDelay)
        pfx = f"HlyDelays{mxdstr}/{pqDate.strftime('%Y%m%d.pq')}"
        if s3Mgr.prefix_exists(pfx):
            s3Mgr.delete_prefix(pfx)
            time.sleep(5)

        dfHlyDelays = dfHlyDelays \
          .withColumn(
            'route_stop',
            F.concat(
              dfHlyDelays.RouteId, F.lit(':::'),
              F.lit('['), dfHlyDelays.StopName, F.lit(']')
            )
          )
        dfHlyDelays = dfHlyDelays \
          .groupBy(dfHlyDelays.route_stop) \
          .agg(
            F.collect_list(
              F.struct(
                dfHlyDelays.DateEST, dfHlyDelays.HourEST,
                dfHlyDelays.AvgDelay, dfHlyDelays.AvgDist, dfHlyDelays.Cnt
              )
            ).alias('vals_unsorted')
          )

        udf_ret_type = ArrayType(
            StructType([
                StructField("DateEST", DateType(), False),
                StructField("HourEST", IntegerType(), False),
                StructField("AvgDelay", DoubleType(), False),
                StructField("AvgDist", DoubleType(), False),
                StructField("Cnt", IntegerType(), False)
            ]))
        udf_sort_vals = F.udf(
            lambda vals: list(
                sorted(vals, key=lambda r: (r.DateEST, r.HourEST))),
            udf_ret_type)
        dfHlyDelays = dfHlyDelays \
          .withColumn('vals', udf_sort_vals(dfHlyDelays.vals_unsorted)) \
          .drop('vals_unsorted')

        dfHlyDelays.printSchema()

        s3_path = "s3a://%s/%s" % (Settings.S3BucketName, pfx)
        dfHlyDelays.write.mode("overwrite").parquet(s3_path)
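
A possible simplification, offered only as a sketch under the assumption that the built-in ordering is acceptable: pyspark.sql.functions.sort_array compares structs field by field from left to right, so with DateEST and HourEST as the leading struct fields it should reproduce the (DateEST, HourEST) ordering above without a Python UDF.

dfHlyDelays = dfHlyDelays \
  .withColumn('vals', F.sort_array(dfHlyDelays.vals_unsorted)) \
  .drop('vals_unsorted')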
Example #6
def upload_zips(feedDescs):
  """Checks whether all feed descriptions are available in S3, then
  retrieves and downloads the missing ones

  Args:
    feedDescs: a list of MBTA_ArchivedFeedDesc objects
  """

  s3Mgr = s3.S3Mgr()
  for feedDesc in feedDescs:
    s3Key = '/'.join(["GTFS", feedDesc.s3Key])
    archS3Key = '/'.join(["GTFS_Archived", feedDesc.s3Key])
    if not s3Mgr.prefix_exists(s3Key) and not s3Mgr.prefix_exists(archS3Key):
      upload_zip(s3Mgr, feedDesc, s3Key)
Example #7
def update_archive_txt():
  """Retrieves a text file describing the currently available archives from
  MBTA and stores it in S3 if it can be successfully parsed by the app
  """

  url = "https://cdn.mbta.com/archive/archived_feeds.txt"
  r = requests.get(url)

  # parse first to check if we can work with this
  feedDescs = gtfs.read_feed_descs(r.content)

  objKey = '/'.join(["GTFS", "MBTA_archived_feeds.txt"])
  s3Mgr = s3.S3Mgr()
  s3Mgr.put_object_body(objKey, r.content)

  return feedDescs
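
A hedged sketch of how the last two helpers could be chained (this calling order is an assumption, not taken from the original code): the freshly parsed feed descriptions are passed on to upload any GTFS zips still missing from S3.

feedDescs = update_archive_txt()
upload_zips(feedDescs)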
Example #8
def main():
    """Compresses text files from obsolete feeds in S3, uploads archives to S3,
    and removes the text files

    Uses the tar.bz2 format
    """
    s3Mgr = s3.S3Mgr()
    objKey = '/'.join(["GTFS", "MBTA_archived_feeds.txt"])
    content = s3Mgr.fetch_object_body(objKey)
    feedDescs = gtfs.read_feed_descs(content)

    with DBConn() as conn:
        dtNow = dbtables.PqDates.select_latest_processed(conn)
    if not dtNow:
        return
    logger.info('Latest processed parquet date is %s', dtNow)

    for fd in feedDescs:
        daysDiff = (dtNow - fd.endDate).total_seconds() / (24 * 3600)
        if daysDiff > Settings.GTFS_ObsoleteAfterDays:
            archive_gtfs_files(s3Mgr, fd)
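
A small worked example of the obsolescence check (the dates and the 30-day threshold are illustrative, not the project's actual Settings values):

from datetime import datetime

dtNow = datetime(2020, 3, 1)       # latest processed Parquet date
endDate = datetime(2020, 1, 16)    # feed's endDate
daysDiff = (dtNow - endDate).total_seconds() / (24 * 3600)  # 45.0
print(daysDiff > 30)  # True: this feed would be archived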
Example #9
def _rollback_2020():
    s3Mgr = s3.S3Mgr()
    seasons = ['Winter', 'Spring', 'Summer', 'Fall']
    pfxs20 = [x + ' 2020' for x in seasons]
    pfx20_lens = [len(x) for x in pfxs20]
    for objKey in s3Mgr.fetch_keys('GTFS_Archived'):
        m = re.match(r'GTFS_Archived/(?P<feedname>.*)\.tar\.bz2', objKey)
        if not m:
            continue
        feedname = m['feedname']
        for pfx, pfx_len in zip(pfxs20, pfx20_lens):
            if feedname[0:pfx_len] == pfx:
                logger.info(f'Proceeding with {objKey}')
                data = s3Mgr.fetch_object_body(objKey)
                buffer = BytesIO(data)
                with tarfile.open(mode="r:bz2", fileobj=buffer) as tbz2:
                    for member in tbz2.getmembers():
                        extr_key = '/'.join(['GTFS', feedname, member.name])
                        if s3Mgr.prefix_exists(extr_key):
                            logger.warning(f'Prefix {extr_key} already exists')
                            continue
                        s3Mgr.put_object_body(
                            extr_key, tbz2.extractfile(member).read())
                s3Mgr.delete_key(objKey)
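
For illustration (the feed name and archive member are invented): the rollback maps one archived object back into per-file GTFS keys.

objKey = 'GTFS_Archived/Fall 2020 v1.tar.bz2'   # hypothetical archived key
feedname = 'Fall 2020 v1'                       # what the regex would capture
member_name = 'stops.txt'                       # an assumed member of the tarball
extr_key = '/'.join(['GTFS', feedname, member_name])
# extr_key == 'GTFS/Fall 2020 v1/stops.txt'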