def main(execution_date, **kwargs):
    fs = get_fs()
    bucket = get_bucket()

    successes = get_successfully_downloaded_feeds(execution_date)

    gtfs_file = []
    for ii, row in successes.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        agency_url = f"{bucket}/schedule/{execution_date}/{agency_folder}"

        dir_files = [x for x in fs.listdir(agency_url) if x["type"] == "file"]

        for x in dir_files:
            gtfs_file.append(
                {
                    "calitp_itp_id": row["itp_id"],
                    "calitp_url_number": row["url_number"],
                    "calitp_extracted_at": execution_date.to_date_string(),
                    "full_path": x["name"],
                    "name": x["name"].split("/")[-1],
                    "size": x["size"],
                    "md5_hash": x["md5Hash"],
                }
            )

    res = pd.DataFrame(gtfs_file)

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )

def main(execution_date, **kwargs):
    logger = structlog.get_logger()

    # run for both the execution date and the day before;
    # this ensures that ALL files are eventually picked up
    dates = [execution_date, execution_date.subtract(days=1)]

    fs = get_fs()

    for day in dates:
        date_string = day.to_date_string()
        glob_daily_files(date_string, fs, logger)

def validate_extract(
    zip_path: Path,
    output_dir: Path,
    jar_path: Path = os.environ.get(SCHEDULE_VALIDATOR_JAR_LOCATION_ENV_KEY),
    verbose=False,
) -> None:
    """Run the GTFS Schedule validator against a zipped extract, writing reports to output_dir."""
    execute_schedule_validator(
        fs=get_fs(),
        zip_path=zip_path,
        output_dir=output_dir,
        jar_path=jar_path,
        verbose=verbose,
    )

def get_schedule_extract_for_day_and_url(
    dt: pendulum.Date, url: str
) -> GTFSScheduleFeedExtract:
    file = get_latest_file(
        GTFSScheduleFeedExtract.bucket,
        GTFSScheduleFeedExtract.table,
        prefix_partitions={
            "dt": dt,
            "base64_url": url,
        },
        partition_types={
            "ts": pendulum.DateTime,
        },
    )
    return parse_obj_as(
        GTFSScheduleFeedExtract,
        json.loads(get_fs().getxattr(file.name, PARTITIONED_ARTIFACT_METADATA_KEY)),
    )

def identify_files(glob, rt_file_type: RTFileType, progress=False) -> List[RTFile]:
    fs = get_fs()
    typer.secho("Globbing rt bucket {}".format(glob), fg=typer.colors.MAGENTA)

    before = pendulum.now()
    ticks = fs.glob(glob)
    typer.echo(
        f"globbing {len(ticks)} ticks took {(pendulum.now() - before).in_words(locale='en')}"
    )

    files = []
    if progress:
        ticks = tqdm(ticks, desc="")
    for tick in ticks:
        # each tick directory is named with the ISO timestamp at which it was fetched
        tick_dt = pendulum.parse(Path(tick).name)
        files_in_tick = [
            filename for filename in fs.ls(tick) if rt_file_type.name in filename
        ]

        for fname in files_in_tick:
            # fragile: assumes the itp_id and url number are the two path segments
            # immediately preceding the filename
            itp_id, url = fname.split("/")[-3:-1]
            files.append(
                RTFile(
                    file_type=rt_file_type,
                    path=fname,
                    itp_id=itp_id,
                    url=url,
                    tick=tick_dt,
                )
            )

    # for some reason the fs.glob command takes up a fair amount of memory here,
    # and does not seem to free it after the function returns, so we manually clear
    # its caches (at least the ones I could find)
    fs.dircache.clear()

    typer.secho(
        f"found {len(files)} {rt_file_type} files in glob {glob}",
        fg=typer.colors.GREEN,
    )
    return files

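# A hedged sketch of the path convention the slice above assumes: a file is expected to
# live at .../<tick timestamp>/<itp_id>/<url_number>/<file type>, so the two segments
# before the final component are the ITP ID and URL number. The bucket and path below
# are invented for illustration only.
def _example_split_rt_path():
    fname = "gs://example-rt-bucket/rt/2022-09-01T00:00:00/278/0/vehicle_positions"
    itp_id, url = fname.split("/")[-3:-1]
    assert (itp_id, url) == ("278", "0")
    return itp_id, url
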
def get_airtable_gtfs_records_for_day(
    dt: pendulum.Date,
) -> Dict[str, AirtableGTFSDataRecord]:
    file = get_latest_file(
        AirtableGTFSDataExtract.bucket,
        AirtableGTFSDataExtract.table,
        prefix_partitions={
            "dt": dt,
        },
        partition_types={
            "ts": pendulum.DateTime,
        },
    )

    with get_fs().open(file.name, "rb") as f:
        content = gzip.decompress(f.read())

    records = [
        AirtableGTFSDataRecord(**json.loads(row))
        for row in content.decode().splitlines()
    ]

    return {record.id: record for record in records}

def validation_notice_fields():
    bucket = get_bucket()
    print(f"Globbing: {bucket}/schedule/processed/*/validation_report.json")

    fs = get_fs()
    reports = fs.glob(f"{bucket}/schedule/processed/*/validation_report.json")

    code_fields = defaultdict(lambda: set())

    print(f"Iterating through {len(reports)} reports")
    for fname in reports:
        report = json.load(fs.open(fname))

        # one entry per code (e.g. the code: invalid phone number)
        for notice in report["notices"]:
            # one entry per specific code violation (e.g. each invalid phone number)
            for entry in notice["notices"]:
                # map each code to the fields in its notice
                # (e.g. duplicate_route_name has a duplicatedField field)
                for field_name, value in entry.items():
                    if isinstance(value, dict):
                        # handle the few cases where there's one level of nesting
                        sub_fields = [field_name + "." + v for v in value]
                        code_fields[notice["code"]].update(sub_fields)
                    else:
                        # handle the common case of no sub-objects
                        code_fields[notice["code"]].update(entry.keys())

    validation_json_fields = pd.DataFrame(
        {
            "code": code_fields.keys(),
            "field": list(map(list, code_fields.values())),
        }
    ).explode("field")

    write_table(
        validation_json_fields, "gtfs_schedule_history.validation_notice_fields"
    )

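# A minimal, self-contained sketch of the report shape the loop above assumes; the
# notice code and field names here are invented for illustration and mirror only the
# field-collection logic, not the real validator output.
def _example_collect_notice_fields():
    from collections import defaultdict

    report = {
        "notices": [
            {
                "code": "duplicate_route_name",
                "notices": [
                    {"duplicatedField": "route_long_name", "csvRowNumber": 7},
                ],
            },
        ],
    }

    code_fields = defaultdict(set)
    for notice in report["notices"]:
        for entry in notice["notices"]:
            for field_name, value in entry.items():
                if isinstance(value, dict):
                    code_fields[notice["code"]].update(
                        field_name + "." + v for v in value
                    )
                else:
                    code_fields[notice["code"]].update(entry.keys())

    assert code_fields["duplicate_route_name"] == {"duplicatedField", "csvRowNumber"}
    return code_fields
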
def main(execution_date, ti, **kwargs):
    fs = get_fs()
    bucket = get_bucket()

    successes = get_successfully_downloaded_feeds(execution_date)

    ttl_feeds_copied = 0
    for k, row in successes.iterrows():
        date_string = execution_date.to_date_string()

        # only handle today's updated data (backfill dag to run all) ----

        # copy processed validator results ----
        id_and_url = f"{row['itp_id']}_{row['url_number']}"

        src_validator = "/".join(
            [
                bucket,
                "schedule",
                str(execution_date),
                id_and_url,
                "processed",
                constants.VALIDATION_REPORT,
            ]
        )
        dst_validator = "/".join(
            [
                bucket,
                "schedule",
                "processed",
                f"{date_string}_{id_and_url}",
                constants.VALIDATION_REPORT,
            ]
        )

        print(f"Copying from {src_validator} to {dst_validator}")
        fs.copy(src_validator, dst_validator)

        ttl_feeds_copied += 1

    print("total feeds copied:", ttl_feeds_copied)

def main(
    step: RTProcessingStep,
    feed_type: GTFSFeedType,
    hour: datetime.datetime,
    limit: int = 0,
    progress: bool = typer.Option(
        False,
        help="If true, display progress bar; useful for development but not in production.",
    ),
    threads: int = 4,
    jar_path: Path = JAR_DEFAULT,
    verbose: bool = False,
):
    pendulum_hour = pendulum.instance(hour, tz="Etc/UTC")
    files: List[GTFSRTFeedExtract] = fetch_all_in_partition(
        cls=GTFSRTFeedExtract,
        fs=get_fs(),
        partitions={
            "dt": pendulum_hour.date(),
            "hour": pendulum_hour,
        },
        table=feed_type,
        verbose=True,
        progress=progress,
    )

    rt_aggs: Dict[Tuple[pendulum.DateTime, str], List[GTFSRTFeedExtract]] = defaultdict(
        list
    )

    for file in files:
        rt_aggs[(file.hour, file.base64_url)].append(file)

    aggregations_to_process = [
        RTHourlyAggregation(
            step=step,
            filename=f"{feed_type}{JSONL_GZIP_EXTENSION}",
            feed_type=feed_type,
            hour=hour,
            base64_url=url,
            extracts=files,
        )
        for (hour, url), files in rt_aggs.items()
    ]

    typer.secho(
        f"found {len(files)} {feed_type} files in {len(aggregations_to_process)} aggregations to process",
        fg=typer.colors.MAGENTA,
    )

    if limit:
        typer.secho(f"limit of {limit} feeds was set", fg=typer.colors.YELLOW)
        aggregations_to_process = list(
            sorted(aggregations_to_process, key=lambda feed: feed.path)
        )[:limit]

    pbar = tqdm(total=len(aggregations_to_process)) if progress else None

    outcomes: List[RTFileProcessingOutcome] = []
    exceptions = []

    # from https://stackoverflow.com/a/55149491
    # could be cleaned up a bit with a namedtuple

    # gcsfs does not seem to play nicely with multiprocessing right now, so use threads :(
    # https://github.com/fsspec/gcsfs/issues/379
    with tempfile.TemporaryDirectory() as tmp_dir:
        with ThreadPoolExecutor(max_workers=threads) as pool:
            futures: Dict[Future, RTHourlyAggregation] = {
                pool.submit(
                    parse_and_validate,
                    hour=hour,
                    jar_path=jar_path,
                    tmp_dir=tmp_dir,
                    verbose=verbose,
                    pbar=pbar,
                ): hour
                for hour in aggregations_to_process
            }

            for future in concurrent.futures.as_completed(futures):
                hour = futures[future]
                if pbar:
                    pbar.update(1)
                try:
                    outcomes.extend(future.result())
                except KeyboardInterrupt:
                    raise
                except Exception as e:
                    log(
                        f"WARNING: exception {type(e)} {str(e)} bubbled up to top for {hour.path}\n{traceback.format_exc()}",
                        err=True,
                        fg=typer.colors.RED,
                        pbar=pbar,
                    )
                    exceptions.append((e, hour.path, traceback.format_exc()))

    if pbar:
        del pbar

    assert len(outcomes) == len(
        files
    ), f"we ended up with {len(outcomes)} outcomes from {len(files)}"

    result = GTFSRTJobResult(
        # TODO: these seem weird...
        hour=aggregations_to_process[0].hour,
        filename=aggregations_to_process[0].filename.removesuffix(".gz"),
        step=step,
        feed_type=feed_type,
        outcomes=outcomes,
    )
    save_job_result(get_fs(), result)

    if exceptions:
        exc_str = "\n".join(str(tup) for tup in exceptions)
        msg = f"got {len(exceptions)} exceptions from processing {len(aggregations_to_process)} feeds:\n{exc_str}"
        typer.secho(msg, err=True, fg=typer.colors.RED)
        raise RuntimeError(msg)

    typer.secho("fin.", fg=typer.colors.MAGENTA)

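# Sketch of the (hour, base64_url) grouping above, using a namedtuple as a stand-in
# for GTFSRTFeedExtract; the timestamps and base64 strings are fabricated, and only the
# defaultdict grouping itself mirrors the real code.
def _example_group_extracts_by_hour_and_url():
    from collections import defaultdict, namedtuple
    import pendulum

    FakeExtract = namedtuple("FakeExtract", ["hour", "base64_url"])
    hour = pendulum.datetime(2022, 9, 1, 0, tz="Etc/UTC")
    files = [
        FakeExtract(hour, "aHR0cHM6Ly9hLmV4YW1wbGUuY29t"),
        FakeExtract(hour, "aHR0cHM6Ly9hLmV4YW1wbGUuY29t"),
        FakeExtract(hour, "aHR0cHM6Ly9iLmV4YW1wbGUuY29t"),
    ]

    rt_aggs = defaultdict(list)
    for file in files:
        rt_aggs[(file.hour, file.base64_url)].append(file)

    # two distinct (hour, url) aggregations, one of which holds two extracts
    assert len(rt_aggs) == 2
    return rt_aggs
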
def parse_and_validate(
    hour: RTHourlyAggregation,
    jar_path: Path,
    tmp_dir: str,
    verbose: bool = False,
    pbar=None,
) -> List[RTFileProcessingOutcome]:
    fs = get_fs()
    dst_path_rt = f"{tmp_dir}/rt_{hour.name_hash}/"
    get_with_retry(
        fs,
        rpath=[file.path for file in hour.extracts],
        lpath=[
            os.path.join(dst_path_rt, file.timestamped_filename)
            for file in hour.extracts
        ],
    )

    if hour.step == RTProcessingStep.validate:
        if not hour.extracts[0].config.schedule_to_use_for_rt_validation:
            return [
                RTFileProcessingOutcome(
                    step=hour.step,
                    success=False,
                    extract=extract,
                    exception=NoScheduleDataSpecified(),
                )
                for extract in hour.extracts
            ]
        try:
            return validate_and_upload(
                fs=fs,
                jar_path=jar_path,
                dst_path_rt=dst_path_rt,
                tmp_dir=tmp_dir,
                hour=hour,
                verbose=verbose,
                pbar=pbar,
            )
        except (ScheduleDataNotFound, subprocess.CalledProcessError) as e:
            if verbose:
                log(
                    f"{str(e)} thrown for {hour.path}",
                    fg=typer.colors.RED,
                    pbar=pbar,
                )
            return [
                RTFileProcessingOutcome(
                    step=hour.step,
                    success=False,
                    extract=extract,
                    exception=e,
                )
                for extract in hour.extracts
            ]

    if hour.step == RTProcessingStep.parse:
        return parse_and_upload(
            fs=fs,
            dst_path_rt=dst_path_rt,
            tmp_dir=tmp_dir,
            hour=hour,
            verbose=verbose,
            pbar=pbar,
        )

    raise RuntimeError("we should not be here")

def main(execution_date, ti, **kwargs):
    tables = get_table(f"{DATASET}.calitp_included_gtfs_tables", as_df=True)

    # TODO: replace w/ pybigquery pulling schemas directly from tables
    # pull schemas from external table tasks. these tasks only run once, so their
    # xcom data is stored as a prior date.
    schemas = [
        get_table(f"{DATASET}.{t}").columns.keys() for t in tables.table_name
    ]
    # ti.xcom_pull(
    #     dag_id="gtfs_schedule_history", task_ids=tables, include_prior_dates=True
    # )

    # fetch latest feeds that need loading from warehouse ----
    date_string = execution_date.to_date_string()

    tbl_feed = get_table(f"{DATASET}.calitp_feed_updates")
    q_today = tbl_feed.select().where(tbl_feed.c.calitp_extracted_at == date_string)

    df_latest_updates = (
        pd.read_sql(q_today, q_today.bind)
        .rename(columns=lambda s: s.replace("calitp_", ""))
        .convert_dtypes()
    )

    # this zip needs to be converted to a list in order to be iterated through multiple
    # times in an inner loop per each feed update below. This resolves a regression as
    # described in https://github.com/cal-itp/data-infra/issues/848.
    table_details = list(zip(tables.file_name, tables.is_required, schemas))

    fs = get_fs()
    bucket = get_bucket()

    # load new feeds ----
    print(f"Number of feeds being loaded: {df_latest_updates.shape[0]}")

    ttl_feeds_copied = 0
    feed_tables_process_results = []
    feed_process_results = []
    for k, row in df_latest_updates.iterrows():
        # track whether a parsing error occurred for any table in this feed
        parse_error_encountered_in_this_feed = False

        id_and_url = f"{row['itp_id']}_{row['url_number']}"

        # process and copy over tables into external table folder ----
        for table_file, is_required, colnames in table_details:
            # the validation report is handled in a separate task, since it is in a
            # subfolder and should be run separately in case the feed is unparseable.
            if table_file == constants.VALIDATION_REPORT:
                continue

            src_path = "/".join(
                ["schedule", str(execution_date), id_and_url, table_file]
            )
            dst_path = "/".join(
                ["schedule", "processed", f"{date_string}_{id_and_url}", table_file]
            )

            print(f"Copying from {src_path} to {dst_path}")

            if not is_required and not fs.exists(f"{bucket}/{src_path}"):
                print(f"Skipping missing optional file: {src_path}")
            else:
                parse_error_encountered = False
                try:
                    _keep_columns(
                        src_path,
                        dst_path,
                        colnames,
                        row["itp_id"],
                        row["url_number"],
                        date_string,
                    )
                except ParserError:
                    print(
                        f"Fatal parsing error encountered in {table_file} for id and "
                        f"URL: {id_and_url}."
                    )
                    parse_error_encountered = True
                    parse_error_encountered_in_this_feed = True

                feed_tables_process_results.append(
                    {
                        "calitp_itp_id": row["itp_id"],
                        "calitp_url_number": row["url_number"],
                        "calitp_extracted_at": execution_date.to_date_string(),
                        "filename": table_file,
                        "parse_error_encountered": parse_error_encountered,
                    }
                )

        # note the parse result for this feed
        feed_process_results.append(
            {
                "calitp_itp_id": row["itp_id"],
                "calitp_url_number": row["url_number"],
                "calitp_extracted_at": execution_date.to_date_string(),
                "parse_error_encountered": parse_error_encountered_in_this_feed,
            }
        )

        ttl_feeds_copied += 1

    print("total feeds copied:", ttl_feeds_copied)

    # save feed and feed table process results to external tables
    save_to_gcfs(
        pd.DataFrame(feed_process_results).to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/feed_parse_result.csv",
        use_pipe=True,
    )
    save_to_gcfs(
        pd.DataFrame(feed_tables_process_results).to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/feed_tables_parse_result.csv",
        use_pipe=True,
    )

def download_all(task_instance, execution_date, **kwargs):
    start = pendulum.now()

    # https://stackoverflow.com/a/61808755
    with create_session() as session:
        auth_dict = {var.key: var.val for var in session.query(Variable)}

    records = [
        record
        for record in AirtableGTFSDataExtract.get_latest().records
        if record.data_quality_pipeline and record.data == GTFSFeedType.schedule
    ]
    outcomes: List[AirtableGTFSDataRecordProcessingOutcome] = []

    logging.info(f"processing {len(records)} records")

    for i, record in enumerate(records, start=1):
        logging.info(f"attempting to fetch {i}/{len(records)} {record.uri}")

        try:
            # this is a bit hacky but we need this until we split off auth query params from the URI itself
            jinja_pattern = r"(?P<param_name>\w+)={{\s*(?P<param_lookup_key>\w+)\s*}}"
            match = re.search(jinja_pattern, record.uri)
            if match:
                record.auth_query_param = {
                    match.group("param_name"): match.group("param_lookup_key")
                }
                record.uri = re.sub(jinja_pattern, "", record.uri)

            extract, content = download_feed(
                record,
                auth_dict=auth_dict,
                ts=start,
            )

            extract.save_content(fs=get_fs(), content=content)

            outcomes.append(
                AirtableGTFSDataRecordProcessingOutcome(
                    success=True,
                    airtable_record=record,
                    extract=extract,
                )
            )
        except Exception as e:
            logging.error(
                f"exception occurred while attempting to download feed {record.uri}: {str(e)}\n{traceback.format_exc()}"
            )
            outcomes.append(
                AirtableGTFSDataRecordProcessingOutcome(
                    success=False,
                    exception=e,
                    airtable_record=record,
                )
            )

    # TODO: save the outcomes somewhere

    print(
        f"took {humanize.naturaltime(pendulum.now() - start)} to process {len(records)} records"
    )

    assert len(records) == len(
        outcomes
    ), f"we somehow ended up with {len(outcomes)} outcomes from {len(records)} records"

    result = DownloadFeedsResult(
        ts=start,
        end=pendulum.now(),
        outcomes=outcomes,
        filename="results.jsonl",
    )

    result.save(get_fs())

    print(f"successfully fetched {len(result.successes)} of {len(records)}")

    if result.failures:
        print(
            "Failures:\n",
            "\n".join(
                str(f.exception) or str(type(f.exception)) for f in result.failures
            ),
        )
        # use pandas begrudgingly for email HTML since the old task used it
        html_report = pd.DataFrame(f.dict() for f in result.failures).to_html(
            border=False
        )

        html_content = f"""\
NOTE: These failures come from the v2 of the GTFS Schedule downloader.

The following agency GTFS feeds could not be extracted on {start.to_iso8601_string()}:

{html_report}
"""
    else:
        html_content = "All feeds were downloaded successfully!"

    if is_development():
        print(
            f"Skipping since in development mode! Would have emailed {len(result.failures)} failures."
        )
    else:
        send_email(
            to=[
                "*****@*****.**",
                "*****@*****.**",
            ],
            html_content=html_content,
            subject=(
                f"Operator GTFS Errors for {datetime.datetime.now().strftime('%Y-%m-%d')}"
            ),
        )

    success_rate = len(result.successes) / len(records)
    if success_rate < GTFS_FEED_LIST_ERROR_THRESHOLD:
        raise RuntimeError(
            f"Success rate: {success_rate:.3f} was below error threshold: {GTFS_FEED_LIST_ERROR_THRESHOLD}"
        )

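# Self-contained sketch of the auth-query-param extraction above; the URI and parameter
# name are invented to show how the Jinja-style placeholder is split out of the feed URL
# before downloading.
def _example_split_auth_query_param():
    import re

    jinja_pattern = r"(?P<param_name>\w+)={{\s*(?P<param_lookup_key>\w+)\s*}}"
    uri = "https://example.com/gtfs.zip?api_key={{ EXAMPLE_AGENCY_KEY }}"

    match = re.search(jinja_pattern, uri)
    assert match is not None
    auth_query_param = {match.group("param_name"): match.group("param_lookup_key")}
    cleaned_uri = re.sub(jinja_pattern, "", uri)

    assert auth_query_param == {"api_key": "EXAMPLE_AGENCY_KEY"}
    assert cleaned_uri == "https://example.com/gtfs.zip?"
    return auth_query_param, cleaned_uri
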
def validate_day(
    day: datetime = typer.Argument(
        ...,
        help="The date of data to validate.",
        formats=["%Y-%m-%d"],
    ),
) -> None:
    day = pendulum.instance(day).date()
    extracts: List[GTFSFeedExtractInfo] = fetch_all_in_partition(
        cls=GTFSFeedExtractInfo,
        bucket=SCHEDULE_RAW_BUCKET,
        table=GTFSFeedType.schedule,
        fs=get_fs(),
        partitions={
            "dt": day,
        },
        verbose=True,
    )

    if not extracts:
        typer.secho(
            "WARNING: found 0 extracts to process, exiting",
            fg=typer.colors.YELLOW,
        )
        return

    typer.secho(
        f"found {len(extracts)} extracts to process for {day}",
        fg=typer.colors.MAGENTA,
    )

    fs = get_fs()
    outcomes = []
    for i, extract in enumerate(extracts, start=1):
        typer.secho(f"processing {i} of {len(extracts)}")
        try:
            with tempfile.TemporaryDirectory() as tmp_dir:
                zip_path = os.path.join(tmp_dir, extract.filename)
                typer.secho(
                    f"downloading {extract.path} to {zip_path}",
                    fg=typer.colors.GREEN,
                )
                fs.get_file(extract.path, zip_path)
                report, system_errors = execute_schedule_validator(
                    fs=fs,
                    zip_path=Path(zip_path),
                    output_dir=tmp_dir,
                )

            validation = GTFSScheduleFeedValidation(
                filename=f"validation_notices{JSONL_GZIP_EXTENSION}",
                extract=extract,
                system_errors=system_errors,
            )
            notices = report["notices"]
            typer.secho(
                f"saving {len(notices)} validation notices to {validation.path}",
                fg=typer.colors.GREEN,
            )
            validation.save_content(
                content=gzip.compress(
                    "\n".join(json.dumps(notice) for notice in notices).encode()
                ),
                fs=fs,
            )
            outcomes.append(
                GTFSScheduleFeedExtractValidationOutcome(
                    success=True,
                    extract=extract,
                    validation=validation,
                )
            )
        except Exception as e:
            typer.secho(
                f"encountered exception on extract {extract.path}: {e}\n{traceback.format_exc()}",
                fg=typer.colors.RED,
            )
            outcomes.append(
                GTFSScheduleFeedExtractValidationOutcome(
                    success=False,
                    extract=extract,
                    exception=e,
                )
            )

    result = ScheduleValidationJobResult(
        filename="results.jsonl",
        dt=day,
        outcomes=outcomes,
    )
    typer.secho(
        f"got {len(result.successes)} successes and {len(result.failures)} failures",
        fg=typer.colors.MAGENTA,
    )
    typer.secho(
        f"saving {len(outcomes)} outcomes to {result.path}",
        fg=typer.colors.GREEN,
    )
    result.save(fs)

    assert len(extracts) == len(
        result.outcomes
    ), f"ended up with {len(outcomes)} outcomes from {len(extracts)} extracts"

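# Minimal sketch of the gzipped JSONL payload built above, with a fabricated notices
# list standing in for the validator report; it round-trips the content to show the
# jsonl.gz structure being written.
def _example_gzip_jsonl_notices():
    import gzip
    import json

    notices = [{"code": "invalid_phone_number"}, {"code": "duplicate_route_name"}]
    content = gzip.compress("\n".join(json.dumps(n) for n in notices).encode())

    lines = gzip.decompress(content).decode().splitlines()
    assert [json.loads(line) for line in lines] == notices
    return content
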
def parse_and_validate(
    hour: RTHourlyAggregation,
    jar_path: Path,
    tmp_dir: str,
    verbose: bool = False,
    parse: bool = False,
    validate: bool = False,
    pbar=None,
):
    if not parse and not validate:
        raise ValueError("skipping both parsing and validation does nothing for us!")

    fs = get_fs()
    suffix = hour.suffix
    dst_path_gtfs = f"{tmp_dir}/gtfs_{suffix}/"
    os.mkdir(dst_path_gtfs)
    dst_path_rt = f"{tmp_dir}/rt_{suffix}/"
    get_with_retry(
        fs,
        rpath=[file.path for file in hour.source_files],
        lpath=[
            os.path.join(dst_path_rt, file.timestamped_filename)
            for file in hour.source_files
        ],
    )

    if validate:
        try:
            outcomes = validate_and_upload(
                fs=fs,
                jar_path=jar_path,
                dst_path_gtfs=dst_path_gtfs,
                dst_path_rt=dst_path_rt,
                tmp_dir=tmp_dir,
                hour=hour,
                verbose=verbose,
                pbar=pbar,
            )
        except (ScheduleDataNotFound, subprocess.CalledProcessError) as e:
            if verbose:
                log(
                    f"{str(e)} thrown for {hour.source_files[0].schedule_path}",
                    fg=typer.colors.RED,
                    pbar=pbar,
                )
            outcomes = [
                RTFileProcessingOutcome(
                    step="validate",
                    success=False,
                    file=file,
                    exception=e,
                    hive_path=hour.data_hive_path,
                )
                for file in hour.source_files
            ]
        assert len(outcomes) == len(hour.source_files)
        upload_if_records(
            fs,
            tmp_dir,
            out_path=hour.validation_outcomes_hive_path,
            records=outcomes,
            pbar=pbar,
            verbose=verbose,
        )

    if parse:
        outcomes = parse_and_upload(
            fs=fs,
            dst_path_rt=dst_path_rt,
            tmp_dir=tmp_dir,
            hour=hour,
            verbose=verbose,
            pbar=pbar,
        )
        assert len(outcomes) == len(hour.source_files)
        upload_if_records(
            fs,
            tmp_dir,
            out_path=hour.outcomes_hive_path,
            records=outcomes,
            pbar=pbar,
            verbose=verbose,
        )

def preprocess_littlepay_provider_bucket(
    execution_date,
    aws_user,
    src_url_template=DEFAULT_SRC_URL_TEMPLATE,
    stg_dir_template=DEFAULT_STG_DIR_TEMPLATE,
    dst_dir_template=DEFAULT_DST_DIR_TEMPLATE,
):
    fs = get_fs()

    # Get high level data on tables we are pre-processing ----
    tables = get_table("payments.calitp_included_payments_tables", as_df=True)
    schemas = [
        get_table(f"{DATASET}.{t}").columns.keys() for t in tables.table_name
    ]

    # We save dates in YYYY-MM-DD format, but littlepay uses YYYYMMDD, so we also
    # keep the narrow format for globbing all of the data files for a specific day
    date_string = execution_date.to_date_string()
    date_string_narrow = date_string.replace("-", "")

    # process data for each table ----
    for table_name, columns in zip(tables.table_name, schemas):
        stg_dir = stg_dir_template.format(aws_user=aws_user, table_name=table_name)
        dst_dir = dst_dir_template.format(aws_user=aws_user, table_name=table_name)

        src_files = fs.glob(
            src_url_template.format(
                aws_user=aws_user,
                table_name=table_name.replace("_", "-"),
                date_string_narrow=date_string_narrow,
            )
        )

        print(f"\n\nTable {table_name} has {len(src_files)} new files =========")

        # remove previously processed data, in case any data files were removed ----
        dst_old_url = f"{get_bucket()}/{dst_dir}/*_{aws_user}_*"
        dst_old_files = fs.glob(dst_old_url)

        if dst_old_files:
            print(
                f"Deleting {len(dst_old_files)} old file(s) for "
                f"{aws_user} {table_name}."
            )
            fs.rm(dst_old_files)

        # copy and process each file ----
        for fname in src_files:
            basename = fname.split("/")[-1]

            stg_fname = f"{stg_dir}/{aws_user}_{basename}"
            dst_fname = f"{dst_dir}/{date_string}_{aws_user}_{basename}"

            print(f"copying from payments bucket: {stg_fname} -> {dst_fname}")

            fs.cp(fname, f"{get_bucket()}/{stg_fname}")

            _keep_columns(
                stg_fname,
                dst_fname,
                colnames=columns,
                extracted_at=date_string,
                delimiter="|",
            )

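# Hedged example of the date handling above: the warehouse paths use YYYY-MM-DD while
# Littlepay filenames use YYYYMMDD, so the dashes are stripped before globbing. The
# bucket and template below are hypothetical, not the real DEFAULT_SRC_URL_TEMPLATE.
def _example_littlepay_date_strings():
    # execution_date.to_date_string() yields the wide form used in output paths
    date_string = "2022-09-01"
    date_string_narrow = date_string.replace("-", "")  # "20220901", used for globbing

    src_url_template = (
        "gs://example-payments-bucket/{aws_user}/{table_name}/"
        "*{date_string_narrow}*.psv"
    )
    src_glob = src_url_template.format(
        aws_user="example-user",
        table_name="micropayments",
        date_string_narrow=date_string_narrow,
    )

    assert date_string_narrow == "20220901"
    assert src_glob.endswith("*20220901*.psv")
    return src_glob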