Example #1
def main(execution_date, **kwargs):
    fs = get_fs()
    bucket = get_bucket()

    successes = get_successfully_downloaded_feeds(execution_date)

    gtfs_file = []
    for ii, row in successes.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        agency_url = f"{bucket}/schedule/{execution_date}/{agency_folder}"

        dir_files = [x for x in fs.listdir(agency_url) if x["type"] == "file"]

        for x in dir_files:
            gtfs_file.append(
                {
                    "calitp_itp_id": row["itp_id"],
                    "calitp_url_number": row["url_number"],
                    "calitp_extracted_at": execution_date.to_date_string(),
                    "full_path": x["name"],
                    "name": x["name"].split("/")[-1],
                    "size": x["size"],
                    "md5_hash": x["md5Hash"],
                }
            )

    res = pd.DataFrame(gtfs_file)

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
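Every example on this page relies on a handful of helpers (`get_fs`, `get_bucket`, `save_to_gcfs`) that are defined elsewhere in the project. As rough orientation only, here is a minimal sketch of what they might look like, assuming `get_fs` returns an fsspec GCS filesystem and the bucket name comes from configuration; the environment variable name and the `use_pipe` behaviour are assumptions, not the project's actual code:

import os

import gcsfs


def get_fs(**kwargs) -> gcsfs.GCSFileSystem:
    # Assumption: a single fsspec-compatible filesystem object for GCS.
    return gcsfs.GCSFileSystem(**kwargs)


def get_bucket() -> str:
    # Assumption: the target bucket is configured via an environment variable.
    return os.environ.get("CALITP_BUCKET", "gs://example-calitp-bucket")


def save_to_gcfs(content: bytes, path: str, use_pipe: bool = False) -> None:
    # Assumption: writes raw bytes under the configured bucket;
    # fs.pipe() uploads an in-memory payload without a local temp file.
    fs = get_fs()
    full_path = f"{get_bucket()}/{path}"
    if use_pipe:
        fs.pipe(full_path, content)
    else:
        with fs.open(full_path, "wb") as f:
            f.write(content)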
Example #2
def main(execution_date, **kwargs):
    logger = structlog.get_logger()
    # run for both execution date and the day before
    # this ensures that ALL files are eventually picked up
    dates = [execution_date, execution_date.subtract(days=1)]
    fs = get_fs()
    for day in dates:
        date_string = day.to_date_string()
        glob_daily_files(date_string, fs, logger)
Example #3
def validate_extract(
    zip_path: Path,
    output_dir: Path,
    jar_path: Path = os.environ.get(SCHEDULE_VALIDATOR_JAR_LOCATION_ENV_KEY),
    verbose=False,
) -> None:
    """"""
    execute_schedule_validator(
        fs=get_fs(),
        zip_path=zip_path,
        output_dir=output_dir,
        jar_path=jar_path,
        verbose=verbose,
    )
Example #4
def get_schedule_extract_for_day_and_url(dt: pendulum.Date,
                                         url: str) -> GTFSScheduleFeedExtract:
    file = get_latest_file(
        GTFSScheduleFeedExtract.bucket,
        GTFSScheduleFeedExtract.table,
        prefix_partitions={
            "dt": dt,
            "base64_url": url,
        },
        partition_types={
            "ts": pendulum.DateTime,
        },
    )
    return parse_obj_as(
        GTFSScheduleFeedExtract,
        json.loads(get_fs().getxattr(file.name,
                                     PARTITIONED_ARTIFACT_METADATA_KEY)),
    )
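Example #4 stores the extract's description as JSON in the GCS object's custom metadata (read back with `getxattr`) and re-hydrates it with pydantic v1's `parse_obj_as`. A minimal sketch of that re-hydration step with a stand-in model; the field names here are invented for illustration:

import json

from pydantic import BaseModel, parse_obj_as  # pydantic v1 API


class FeedExtract(BaseModel):  # stand-in for GTFSScheduleFeedExtract
    base64_url: str
    ts: str


metadata_json = json.dumps(
    {"base64_url": "aHR0cHM6Ly9leGFtcGxlLmNvbQ==", "ts": "2022-09-01T00:00:00+00:00"}
)
extract = parse_obj_as(FeedExtract, json.loads(metadata_json))
print(extract.ts)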
Example #5
def identify_files(glob,
                   rt_file_type: RTFileType,
                   progress=False) -> List[RTFile]:
    fs = get_fs()
    typer.secho("Globbing rt bucket {}".format(glob), fg=typer.colors.MAGENTA)

    before = pendulum.now()
    ticks = fs.glob(glob)
    typer.echo(
        f"globbing {len(ticks)} ticks took {(pendulum.now() - before).in_words(locale='en')}"
    )

    files = []
    if progress:
        ticks = tqdm(ticks, desc="")
    for tick in ticks:
        tick_dt = pendulum.parse(Path(tick).name)  # I love this
        files_in_tick = [
            filename for filename in fs.ls(tick)
            if rt_file_type.name in filename
        ]

        for fname in files_in_tick:
            # This is bad
            itp_id, url = fname.split("/")[-3:-1]
            files.append(
                RTFile(
                    file_type=rt_file_type,
                    path=fname,
                    itp_id=itp_id,
                    url=url,
                    tick=tick_dt,
                ))

    # for some reason the fs.glob command takes up a fair amount of memory here,
    # and does not seem to free it after the function returns, so we manually clear
    # its caches (at least the ones I could find)
    fs.dircache.clear()

    typer.secho(f"found {len(files)} {rt_file_type} files in glob {glob}",
                fg=typer.colors.GREEN)
    return files
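Example #5 recovers the feed identity purely from object-name components: `Path(tick).name` is parsed as a timestamp and `fname.split("/")[-3:-1]` yields the ITP ID and URL number, which implies a layout roughly like `<bucket>/<prefix>/<tick timestamp>/<itp_id>/<url_number>/<file>`. That layout is an inference from the slicing, illustrated here on a made-up path:

from pathlib import Path

import pendulum

# Hypothetical object name; the real bucket layout may differ.
fname = "example-rt-bucket/rt/2022-09-01T00:00:00+00:00/200/0/vehicle_positions"
tick = "/".join(fname.split("/")[:-3])

itp_id, url = fname.split("/")[-3:-1]
tick_dt = pendulum.parse(Path(tick).name)

print(itp_id, url, tick_dt)  # 200 0 2022-09-01T00:00:00+00:00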
Example #6
def get_airtable_gtfs_records_for_day(
    dt: pendulum.Date,
) -> Dict[str, AirtableGTFSDataRecord]:
    file = get_latest_file(
        AirtableGTFSDataExtract.bucket,
        AirtableGTFSDataExtract.table,
        prefix_partitions={
            "dt": dt,
        },
        partition_types={
            "ts": pendulum.DateTime,
        },
    )

    with get_fs().open(file.name, "rb") as f:
        content = gzip.decompress(f.read())
    records = [
        AirtableGTFSDataRecord(**json.loads(row))
        for row in content.decode().splitlines()
    ]

    return {record.id: record for record in records}
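The decoding in Example #6 assumes the Airtable extract is stored as gzipped JSONL: one JSON document per line, compressed as a single blob. A self-contained round trip of that format (the record fields are made up):

import gzip
import json

records = [
    {"id": "rec1", "uri": "https://example.com/gtfs.zip"},
    {"id": "rec2", "uri": "https://example.org/gtfs.zip"},
]

# Write: one JSON object per line, gzip the whole payload.
content = gzip.compress("\n".join(json.dumps(r) for r in records).encode())

# Read it back the same way the example does.
decoded = [json.loads(row) for row in gzip.decompress(content).decode().splitlines()]
assert decoded == records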
Example #7
def validation_notice_fields():
    bucket = get_bucket()

    print(f"Globbing: {bucket}/schedule/processed/*/validation_report.json")

    fs = get_fs()
    reports = fs.glob(f"{bucket}/schedule/processed/*/validation_report.json")

    code_fields = defaultdict(lambda: set())

    print(f"Iterating through {len(reports)} reports")
    for fname in reports:
        report = json.load(fs.open(fname))
        # one entry per code (e.g. the code: invalid phone number)
        for notice in report["notices"]:
            # one entry per specific code violation (e.g. each invalid phone number)
            for entry in notice["notices"]:
                # map each code to the fields in its notice
                # (e.g. duplicate_route_name has a duplicatedField field)
                for field_name, value in entry.items():
                    if isinstance(value, dict):
                        # handle the few cases where there's one level of nesting
                        sub_fields = [field_name + "." + v for v in value]
                        code_fields[notice["code"]].update(sub_fields)
                    else:
                        # handle the common case of no sub-objects
                        code_fields[notice["code"]].update(entry.keys())

    validation_json_fields = pd.DataFrame(
        {
            "code": code_fields.keys(),
            "field": list(map(list, code_fields.values())),
        }
    ).explode("field")

    write_table(validation_json_fields,
                "gtfs_schedule_history.validation_notice_fields")
Example #8
def main(execution_date, ti, **kwargs):
    fs = get_fs()
    bucket = get_bucket()
    successes = get_successfully_downloaded_feeds(execution_date)

    ttl_feeds_copied = 0
    for k, row in successes.iterrows():
        date_string = execution_date.to_date_string()

        # only handle today's updated data (backfill dag to run all) ----

        # copy processed validator results ----
        id_and_url = f"{row['itp_id']}_{row['url_number']}"
        src_validator = "/".join([
            bucket,
            "schedule",
            str(execution_date),
            id_and_url,
            "processed",
            constants.VALIDATION_REPORT,
        ])
        dst_validator = "/".join([
            bucket,
            "schedule",
            "processed",
            f"{date_string}_{id_and_url}",
            constants.VALIDATION_REPORT,
        ])

        print(f"Copying from {src_validator} to {dst_validator}")

        fs.copy(src_validator, dst_validator)

        ttl_feeds_copied += 1

    print("total feeds copied:", ttl_feeds_copied)
Example #9
def main(
    step: RTProcessingStep,
    feed_type: GTFSFeedType,
    hour: datetime.datetime,
    limit: int = 0,
    progress: bool = typer.Option(
        False,
        help="If true, display progress bar; useful for development but not in production.",
    ),
    threads: int = 4,
    jar_path: Path = JAR_DEFAULT,
    verbose: bool = False,
):
    pendulum_hour = pendulum.instance(hour, tz="Etc/UTC")
    files: List[GTFSRTFeedExtract] = fetch_all_in_partition(
        cls=GTFSRTFeedExtract,
        fs=get_fs(),
        partitions={
            "dt": pendulum_hour.date(),
            "hour": pendulum_hour,
        },
        table=feed_type,
        verbose=True,
        progress=progress,
    )

    rt_aggs: Dict[Tuple[pendulum.DateTime, str],
                  List[GTFSRTFeedExtract]] = defaultdict(list)

    for file in files:
        rt_aggs[(file.hour, file.base64_url)].append(file)

    aggregations_to_process = [
        RTHourlyAggregation(
            step=step,
            filename=f"{feed_type}{JSONL_GZIP_EXTENSION}",
            feed_type=feed_type,
            hour=hour,
            base64_url=url,
            extracts=files,
        ) for (hour, url), files in rt_aggs.items()
    ]

    typer.secho(
        f"found {len(files)} {feed_type} files in {len(aggregations_to_process)} aggregations to process",
        fg=typer.colors.MAGENTA,
    )

    if limit:
        typer.secho(f"limit of {limit} feeds was set", fg=typer.colors.YELLOW)
        aggregations_to_process = list(
            sorted(aggregations_to_process,
                   key=lambda feed: feed.path))[:limit]

    pbar = tqdm(total=len(aggregations_to_process)) if progress else None

    outcomes: List[RTFileProcessingOutcome] = []
    exceptions = []

    # from https://stackoverflow.com/a/55149491
    # could be cleaned up a bit with a namedtuple

    # gcsfs does not seem to play nicely with multiprocessing right now, so use threads :(
    # https://github.com/fsspec/gcsfs/issues/379

    with tempfile.TemporaryDirectory() as tmp_dir:
        with ThreadPoolExecutor(max_workers=threads) as pool:
            futures: Dict[Future, RTHourlyAggregation] = {
                pool.submit(
                    parse_and_validate,
                    hour=hour,
                    jar_path=jar_path,
                    tmp_dir=tmp_dir,
                    verbose=verbose,
                    pbar=pbar,
                ): hour
                for hour in aggregations_to_process
            }

            for future in concurrent.futures.as_completed(futures):
                hour = futures[future]
                if pbar:
                    pbar.update(1)
                try:
                    outcomes.extend(future.result())
                except KeyboardInterrupt:
                    raise
                except Exception as e:
                    log(
                        f"WARNING: exception {type(e)} {str(e)} bubbled up to top for {hour.path}\n{traceback.format_exc()}",
                        err=True,
                        fg=typer.colors.RED,
                        pbar=pbar,
                    )
                    exceptions.append((e, hour.path, traceback.format_exc()))

    if pbar:
        del pbar

    assert len(outcomes) == len(
        files
    ), f"we ended up with {len(outcomes)} outcomes from {len(files)} files"
    result = GTFSRTJobResult(
        # TODO: these seem weird...
        hour=aggregations_to_process[0].hour,
        filename=aggregations_to_process[0].filename.removesuffix(".gz"),
        step=step,
        feed_type=feed_type,
        outcomes=outcomes,
    )
    save_job_result(get_fs(), result)

    if exceptions:
        exc_str = "\n".join(str(tup) for tup in exceptions)
        msg = f"got {len(exceptions)} exceptions from processing {len(aggregations_to_process)} feeds:\n{exc_str}"
        typer.secho(msg, err=True, fg=typer.colors.RED)
        raise RuntimeError(msg)

    typer.secho("fin.", fg=typer.colors.MAGENTA)
Example #10
def parse_and_validate(
    hour: RTHourlyAggregation,
    jar_path: Path,
    tmp_dir: str,
    verbose: bool = False,
    pbar=None,
) -> List[RTFileProcessingOutcome]:
    fs = get_fs()
    dst_path_rt = f"{tmp_dir}/rt_{hour.name_hash}/"
    get_with_retry(
        fs,
        rpath=[file.path for file in hour.extracts],
        lpath=[
            os.path.join(dst_path_rt, file.timestamped_filename)
            for file in hour.extracts
        ],
    )

    if hour.step == RTProcessingStep.validate:
        if not hour.extracts[0].config.schedule_to_use_for_rt_validation:
            return [
                RTFileProcessingOutcome(
                    step=hour.step,
                    success=False,
                    extract=extract,
                    exception=NoScheduleDataSpecified(),
                ) for extract in hour.extracts
            ]

        try:
            return validate_and_upload(
                fs=fs,
                jar_path=jar_path,
                dst_path_rt=dst_path_rt,
                tmp_dir=tmp_dir,
                hour=hour,
                verbose=verbose,
                pbar=pbar,
            )
        except (ScheduleDataNotFound, subprocess.CalledProcessError) as e:
            if verbose:
                log(
                    f"{str(e)} thrown for {hour.path}",
                    fg=typer.colors.RED,
                    pbar=pbar,
                )

            return [
                RTFileProcessingOutcome(
                    step=hour.step,
                    success=False,
                    extract=extract,
                    exception=e,
                ) for extract in hour.extracts
            ]

    if hour.step == RTProcessingStep.parse:
        return parse_and_upload(
            fs=fs,
            dst_path_rt=dst_path_rt,
            tmp_dir=tmp_dir,
            hour=hour,
            verbose=verbose,
            pbar=pbar,
        )

    raise RuntimeError("we should not be here")
Example #11
def main(execution_date, ti, **kwargs):
    tables = get_table(f"{DATASET}.calitp_included_gtfs_tables", as_df=True)

    # TODO: replace w/ pybigquery pulling schemas directly from tables
    # pull schemas from external table tasks. these tasks only run once, so their
    # xcom data is stored as a prior date.
    schemas = [
        get_table(f"{DATASET}.{t}").columns.keys() for t in tables.table_name
    ]
    # ti.xcom_pull(
    #     dag_id="gtfs_schedule_history", task_ids=tables, include_prior_dates=True
    # )

    # fetch latest feeds that need loading  from warehouse ----
    date_string = execution_date.to_date_string()

    tbl_feed = get_table(f"{DATASET}.calitp_feed_updates")
    q_today = tbl_feed.select().where(
        tbl_feed.c.calitp_extracted_at == date_string)

    df_latest_updates = (pd.read_sql(q_today, q_today.bind).rename(
        columns=lambda s: s.replace("calitp_", "")).convert_dtypes())

    # this zip needs to be converted to a list in order to be iterated through multiple
    # times in an inner loop per each feed update below. This resolves a regression as
    # described in https://github.com/cal-itp/data-infra/issues/848.
    table_details = list(zip(tables.file_name, tables.is_required, schemas))
    fs = get_fs()
    bucket = get_bucket()

    # load new feeds ----
    print(f"Number of feeds being loaded: {df_latest_updates.shape[0]}")

    ttl_feeds_copied = 0
    feed_tables_process_results = []
    feed_process_results = []
    for k, row in df_latest_updates.iterrows():
        # initialize variable to track whether a parsing error occurred and which tables
        # were loaded so far
        parse_error_encountered_in_this_feed = False
        id_and_url = f"{row['itp_id']}_{row['url_number']}"

        # process and copy over tables into external table folder ----
        for table_file, is_required, colnames in table_details:
            # validation report handled in a separate task, since it is in a subfolder
            # and should be run separately in case the feed is unparseable.
            if table_file == constants.VALIDATION_REPORT:
                continue

            src_path = "/".join(
                ["schedule",
                 str(execution_date), id_and_url, table_file])
            dst_path = "/".join([
                "schedule", "processed", f"{date_string}_{id_and_url}",
                table_file
            ])

            print(f"Copying from {src_path} to {dst_path}")

            if not is_required and not fs.exists(f"{bucket}/{src_path}"):
                print(f"Skipping missing optional file: {src_path}")
            else:
                parse_error_encountered = False
                try:
                    _keep_columns(
                        src_path,
                        dst_path,
                        colnames,
                        row["itp_id"],
                        row["url_number"],
                        date_string,
                    )
                except ParserError:
                    print(
                        f"Fatal parsing error encountered in {table_file} for id and "
                        f"URL: {id_and_url}."
                    )
                    parse_error_encountered = True
                    parse_error_encountered_in_this_feed = True

                feed_tables_process_results.append(
                    {
                        "calitp_itp_id": row["itp_id"],
                        "calitp_url_number": row["url_number"],
                        "calitp_extracted_at": execution_date.to_date_string(),
                        "filename": table_file,
                        "parse_error_encountered": parse_error_encountered,
                    }
                )

        # note the parse result for this feed
        feed_process_results.append(
            {
                "calitp_itp_id": row["itp_id"],
                "calitp_url_number": row["url_number"],
                "calitp_extracted_at": execution_date.to_date_string(),
                "parse_error_encountered": parse_error_encountered_in_this_feed,
            }
        )

        ttl_feeds_copied += 1

        print("total feeds copied:", ttl_feeds_copied)

    # save feed and feed table process results to external tables
    save_to_gcfs(
        pd.DataFrame(feed_process_results).to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/feed_parse_result.csv",
        use_pipe=True,
    )
    save_to_gcfs(
        pd.DataFrame(feed_tables_process_results).to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/feed_tables_parse_result.csv",
        use_pipe=True,
    )
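The comment above `table_details` in Example #11 points at a real Python gotcha: `zip()` returns a one-shot iterator, so reusing it in the inner per-feed loop would silently yield nothing from the second feed onward. A two-assert demonstration:

pairs = zip(["agency.txt", "routes.txt"], [True, False])
assert list(pairs) == [("agency.txt", True), ("routes.txt", False)]
assert list(pairs) == []  # exhausted: a second pass over the iterator sees nothing

pairs = list(zip(["agency.txt", "routes.txt"], [True, False]))  # materialize once
assert list(pairs) == list(pairs)  # now it can be iterated any number of times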
Example #12
def download_all(task_instance, execution_date, **kwargs):
    start = pendulum.now()
    # https://stackoverflow.com/a/61808755
    with create_session() as session:
        auth_dict = {var.key: var.val for var in session.query(Variable)}

    records = [
        record
        for record in AirtableGTFSDataExtract.get_latest().records
        if record.data_quality_pipeline and record.data == GTFSFeedType.schedule
    ]
    outcomes: List[AirtableGTFSDataRecordProcessingOutcome] = []

    logging.info(f"processing {len(records)} records")

    for i, record in enumerate(records, start=1):
        logging.info(f"attempting to fetch {i}/{len(records)} {record.uri}")

        try:
            # this is a bit hacky but we need this until we split off auth query params from the URI itself
            jinja_pattern = r"(?P<param_name>\w+)={{\s*(?P<param_lookup_key>\w+)\s*}}"
            match = re.search(jinja_pattern, record.uri)
            if match:
                record.auth_query_param = {
                    match.group("param_name"): match.group("param_lookup_key")
                }
                record.uri = re.sub(jinja_pattern, "", record.uri)

            extract, content = download_feed(
                record,
                auth_dict=auth_dict,
                ts=start,
            )

            extract.save_content(fs=get_fs(), content=content)

            outcomes.append(
                AirtableGTFSDataRecordProcessingOutcome(
                    success=True,
                    airtable_record=record,
                    extract=extract,
                ))
        except Exception as e:
            logging.error(
                f"exception occurred while attempting to download feed {record.uri}: {str(e)}\n{traceback.format_exc()}"
            )
            outcomes.append(
                AirtableGTFSDataRecordProcessingOutcome(
                    success=False,
                    exception=e,
                    airtable_record=record,
                ))

    # TODO: save the outcomes somewhere

    print(
        f"took {humanize.naturaltime(pendulum.now() - start)} to process {len(records)} records"
    )

    assert len(records) == len(
        outcomes
    ), f"we somehow ended up with {len(outcomes)} outcomes from {len(records)} records"

    result = DownloadFeedsResult(
        ts=start,
        end=pendulum.now(),
        outcomes=outcomes,
        filename="results.jsonl",
    )

    result.save(get_fs())

    print(f"successfully fetched {len(result.successes)} of {len(records)}")

    if result.failures:
        print(
            "Failures:\n",
            "\n".join(
                str(f.exception) or str(type(f.exception))
                for f in result.failures),
        )
        # use pandas begrudgingly for email HTML since the old task used it
        html_report = pd.DataFrame(
            f.dict() for f in result.failures).to_html(border=False)

        html_content = f"""\
    NOTE: These failures come from the v2 of the GTFS Schedule downloader.

    The following agency GTFS feeds could not be extracted on {start.to_iso8601_string()}:

    {html_report}
    """
    else:
        html_content = "All feeds were downloaded successfully!"

    if is_development():
        print(
            f"Skipping since in development mode! Would have emailed {len(result.failures)} failures."
        )
    else:
        send_email(
            to=[
                "*****@*****.**",
                "*****@*****.**",
            ],
            html_content=html_content,
            subject=(
                f"Operator GTFS Errors for {datetime.datetime.now().strftime('%Y-%m-%d')}"
            ),
        )

    success_rate = len(result.successes) / len(records)
    if success_rate < GTFS_FEED_LIST_ERROR_THRESHOLD:
        raise RuntimeError(
            f"Success rate: {success_rate:.3f} was below error threshold: {GTFS_FEED_LIST_ERROR_THRESHOLD}"
        )
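The regex in Example #12 peels an auth query parameter of the form `key={{ VAR }}` off the stored URI so the secret can be injected separately at download time. A quick, self-contained check of that pattern against a made-up URI:

import re

jinja_pattern = r"(?P<param_name>\w+)={{\s*(?P<param_lookup_key>\w+)\s*}}"
uri = "https://example.com/gtfs.zip?api_key={{ MY_API_KEY }}"  # hypothetical URI

match = re.search(jinja_pattern, uri)
if match:
    auth_query_param = {match.group("param_name"): match.group("param_lookup_key")}
    stripped_uri = re.sub(jinja_pattern, "", uri)
    print(auth_query_param)  # {'api_key': 'MY_API_KEY'}
    print(stripped_uri)      # https://example.com/gtfs.zip?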
Example #13
def validate_day(
    day: datetime = typer.Argument(
        ...,
        help="The date of data to validate.",
        formats=["%Y-%m-%d"],
    ),
) -> None:
    day = pendulum.instance(day).date()

    extracts: List[GTFSFeedExtractInfo] = fetch_all_in_partition(
        cls=GTFSFeedExtractInfo,
        bucket=SCHEDULE_RAW_BUCKET,
        table=GTFSFeedType.schedule,
        fs=get_fs(),
        partitions={
            "dt": day,
        },
        verbose=True,
    )

    if not extracts:
        typer.secho(
            "WARNING: found 0 extracts to process, exiting",
            fg=typer.colors.YELLOW,
        )
        return

    typer.secho(
        f"found {len(extracts)} to process for {day}",
        fg=typer.colors.MAGENTA,
    )
    fs = get_fs()
    outcomes = []

    for i, extract in enumerate(extracts):
        typer.secho(f"processing {i} of {len(extracts)}")
        try:
            with tempfile.TemporaryDirectory() as tmp_dir:
                zip_path = os.path.join(tmp_dir, extract.filename)
                typer.secho(
                    f"downloading {extract.path} to {zip_path}",
                    fg=typer.colors.GREEN,
                )
                fs.get_file(extract.path, zip_path)
                report, system_errors = execute_schedule_validator(
                    fs=fs,
                    zip_path=Path(zip_path),
                    output_dir=tmp_dir,
                )
            validation = GTFSScheduleFeedValidation(
                filename=f"validation_notices{JSONL_GZIP_EXTENSION}",
                extract=extract,
                system_errors=system_errors,
            )
            notices = report["notices"]
            typer.secho(
                f"saving {len(notices)} validation notices to {validation.path}",
                fg=typer.colors.GREEN,
            )
            validation.save_content(
                content=gzip.compress("\n".join(
                    json.dumps(notice) for notice in notices).encode()),
                fs=fs,
            )
            outcomes.append(
                GTFSScheduleFeedExtractValidationOutcome(
                    success=True,
                    extract=extract,
                    validation=validation,
                ))
        except Exception as e:
            typer.secho(
                f"encountered exception on extract {extract.path}: {e}\n{traceback.format_exc()}",
                fg=typer.colors.RED,
            )
            outcomes.append(
                GTFSScheduleFeedExtractValidationOutcome(
                    success=False,
                    extract=extract,
                    exception=e,
                ))
    result = ScheduleValidationJobResult(
        filename="results.jsonl",
        dt=day,
        outcomes=outcomes,
    )
    typer.secho(
        f"got {len(result.successes)} successes and {len(result.failures)} failures",
        fg=typer.colors.MAGENTA,
    )
    typer.secho(
        f"saving {len(outcomes)} to {result.path}",
        fg=typer.colors.GREEN,
    )
    result.save(fs)
    assert len(extracts) == len(
        result.outcomes
    ), f"ended up with {len(outcomes)} outcomes from {len(extracts)} extracts"
Example #14
def parse_and_validate(
    hour: RTHourlyAggregation,
    jar_path: Path,
    tmp_dir: str,
    verbose: bool = False,
    parse: bool = False,
    validate: bool = False,
    pbar=None,
):
    if not parse and not validate:
        raise ValueError(
            "skipping both parsing and validation does nothing for us!")

    fs = get_fs()
    suffix = hour.suffix
    dst_path_gtfs = f"{tmp_dir}/gtfs_{suffix}/"
    os.mkdir(dst_path_gtfs)
    dst_path_rt = f"{tmp_dir}/rt_{suffix}/"
    get_with_retry(
        fs,
        rpath=[file.path for file in hour.source_files],
        lpath=[
            os.path.join(dst_path_rt, file.timestamped_filename)
            for file in hour.source_files
        ],
    )

    if validate:
        try:
            outcomes = validate_and_upload(
                fs=fs,
                jar_path=jar_path,
                dst_path_gtfs=dst_path_gtfs,
                dst_path_rt=dst_path_rt,
                tmp_dir=tmp_dir,
                hour=hour,
                verbose=verbose,
                pbar=pbar,
            )
        except (ScheduleDataNotFound, subprocess.CalledProcessError) as e:
            if verbose:
                log(
                    f"{str(e)} thrown for {hour.source_files[0].schedule_path}",
                    fg=typer.colors.RED,
                    pbar=pbar,
                )

            outcomes = [
                RTFileProcessingOutcome(
                    step="validate",
                    success=False,
                    file=file,
                    exception=e,
                    hive_path=hour.data_hive_path,
                ) for file in hour.source_files
            ]
        assert len(outcomes) == len(hour.source_files)
        upload_if_records(
            fs,
            tmp_dir,
            out_path=hour.validation_outcomes_hive_path,
            records=outcomes,
            pbar=pbar,
            verbose=verbose,
        )

    if parse:
        outcomes = parse_and_upload(
            fs=fs,
            dst_path_rt=dst_path_rt,
            tmp_dir=tmp_dir,
            hour=hour,
            verbose=verbose,
            pbar=pbar,
        )
        assert len(outcomes) == len(hour.source_files)
        upload_if_records(
            fs,
            tmp_dir,
            out_path=hour.outcomes_hive_path,
            records=outcomes,
            pbar=pbar,
            verbose=verbose,
        )
Example #15
def preprocess_littlepay_provider_bucket(
    execution_date,
    aws_user,
    src_url_template=DEFAULT_SRC_URL_TEMPLATE,
    stg_dir_template=DEFAULT_STG_DIR_TEMPLATE,
    dst_dir_template=DEFAULT_DST_DIR_TEMPLATE,
):
    fs = get_fs()

    # Get high level data on tables we are pre-processing ----
    tables = get_table("payments.calitp_included_payments_tables", as_df=True)
    schemas = [
        get_table(f"{DATASET}.{t}").columns.keys() for t in tables.table_name
    ]

    # We'll save date in YYYY-MM-DD format, but littlepay uses YYYYMMDD
    # so we keep the original format for globbing all of the data files for a
    # specific day
    date_string = execution_date.to_date_string()
    date_string_narrow = date_string.replace("-", "")

    # process data for each table ----

    for table_name, columns in zip(tables.table_name, schemas):
        stg_dir = stg_dir_template.format(aws_user=aws_user,
                                          table_name=table_name)
        dst_dir = dst_dir_template.format(aws_user=aws_user,
                                          table_name=table_name)
        src_files = fs.glob(
            src_url_template.format(
                aws_user=aws_user,
                table_name=table_name.replace("_", "-"),
                date_string_narrow=date_string_narrow,
            ))

        print(
            f"\n\nTable {table_name} has {len(src_files)} new files =========")

        # remove previously processed data, in case they remove any data files ---

        dst_old_url = f"{get_bucket()}/{dst_dir}/*_{aws_user}_*"
        dst_old_files = fs.glob(dst_old_url)

        if dst_old_files:
            print(f"Deleting {len(dst_old_files)} old file(s) for "
                  f"{aws_user} {table_name}.")
            fs.rm(dst_old_files)

        # copy and process each file ----

        for fname in src_files:
            basename = fname.split("/")[-1]

            stg_fname = f"{stg_dir}/{aws_user}_{basename}"
            dst_fname = f"{dst_dir}/{date_string}_{aws_user}_{basename}"

            print(f"copying from payments bucket: {stg_fname} -> {dst_fname}")
            fs.cp(fname, f"{get_bucket()}/{stg_fname}")

            _keep_columns(
                stg_fname,
                dst_fname,
                colnames=columns,
                extracted_at=date_string,
                delimiter="|",
            )