Example #1

def main(execution_date, **kwargs):
    in_path = f"schedule/{execution_date}"
    print(in_path)

    status = pd.read_csv(read_gcfs(f"{in_path}/status.csv"))
    success = status[status.status == "success"]

    agency_errors = []
    loadable_agencies = []
    for ii, row in success.iterrows():
        path_agency = f"{in_path}/{row['itp_id']}_{row['url_number']}"
        path_validation = f"{path_agency}/{VALIDATION_FILE}"

        print(f"reading validation file: {path_validation}")
        validation = json.load(read_gcfs(path_validation))

        unique_codes = get_notice_codes(validation)

        if ERROR_MISSING_FILE not in unique_codes:
            loadable_agencies.append(path_agency)
        else:
            agency = dict(itp_id=row["itp_id"], url_number=row["url_number"])
            agency_errors.append(agency)

    errors_df = pd.DataFrame(agency_errors)
    errors_str = errors_df.to_csv(index=False).encode()
    save_to_gcfs(errors_str,
                 f"{in_path}/processed/agency_load_errors.csv",
                 use_pipe=True)

    return loadable_agencies
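
VALIDATION_FILE, ERROR_MISSING_FILE, and get_notice_codes are module-level names defined elsewhere in the project and not shown in these examples. A minimal sketch of what they might look like, assuming the validator report keeps its notices under data -> report -> notices with a "code" field on each entry (a layout inferred from Example #2 below, not confirmed by the source):

# Hypothetical sketch only -- the real module-level definitions are not shown here.
VALIDATION_FILE = "validation.json"            # filename matches Example #2
ERROR_MISSING_FILE = "missing_required_file"   # assumed notice code for a missing file

def get_notice_codes(validation):
    # Collect the distinct notice codes reported for one feed; the
    # data -> report -> notices layout is an assumption.
    notices = validation["data"]["report"].get("notices", [])
    return {notice["code"] for notice in notices}
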
Example #2

def validator_process(execution_date, **kwargs):
    base_path = f"schedule/{execution_date}"

    status = pd.read_csv(read_gcfs(f"{base_path}/status.csv"))
    success = status[lambda d: d.status == "success"]

    # hold on to notices, so we can infer the schema afterward
    # note that I've commented out the schema-inference code below,
    # but it was useful for generating a schema, then hand-tweaking it
    # to load into BigQuery
    # notice_entries = []
    for k, row in success.iterrows():
        agency_path = f"{base_path}/{row['itp_id']}_{row['url_number']}"
        url = f"{agency_path}/validation.json"
        dst_path = f"{agency_path}/processed/validation_report.json"

        validation = json.load(read_gcfs(url))

        # copy code-level notices, and add internal ids
        raw_codes = {**validation["data"]["report"]}
        raw_codes["calitp_itp_id"] = row["itp_id"]
        raw_codes["calitp_url_number"] = row["url_number"]
        raw_codes["calitp_gtfs_validated_by"] = validation["version"]

        json_codes = json.dumps(raw_codes).encode()
        # df_notices = process_notices(row["itp_id"], row["url_number"], validation)
        # csv_string = df_notices.to_csv(index=None).encode()
        # notice_entries.extend(df_notices.notices.tolist())

        save_to_gcfs(json_codes, dst_path, use_pipe=True)
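
The commented-out lines reference a process_notices helper that is not shown. A speculative sketch, inferred only from how those lines use its result (a DataFrame carrying the cal-itp ids and a "notices" column per code-level entry):

# Speculative sketch; the real process_notices is not shown in the source.
def process_notices(itp_id, url_number, validation):
    # One row per code-level entry in the validator report, tagged with ids.
    df = pd.DataFrame(validation["data"]["report"]["notices"])
    return df.assign(calitp_itp_id=itp_id, calitp_url_number=url_number)
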
Example #3

def validator_process(execution_date, **kwargs):
    base_path = f"schedule/{execution_date}"
    successes = get_successfully_downloaded_feeds(execution_date)

    # hold on to notices, so we can infer the schema afterward
    # note that I've commented out the schema-inference code below,
    # but it was useful for generating a schema, then hand-tweaking it
    # to load into BigQuery
    # notice_entries = []
    for k, row in successes.iterrows():
        agency_path = f"{base_path}/{row['itp_id']}_{row['url_number']}"
        url = f"{agency_path}/validation.json"
        dst_path = f"{agency_path}/processed/validation_report.json"

        validation = json.load(read_gcfs(url))

        # copy code-level notices, and add internal ids
        raw_codes = {**validation["data"]["report"]}
        raw_codes["calitp_itp_id"] = row["itp_id"]
        raw_codes["calitp_url_number"] = row["url_number"]
        raw_codes["calitp_extracted_at"] = execution_date.to_date_string()
        raw_codes["calitp_gtfs_validated_by"] = validation["version"]

        # coerce types labeled "string" to a string
        coerce_notice_values_to_str(raw_codes, COERCE_TO_STRING)

        json_codes = json.dumps(raw_codes).encode()
        # df_notices = process_notices(row["itp_id"], row["url_number"], validation)
        # csv_string = df_notices.to_csv(index=None).encode()
        # notice_entries.extend(df_notices.notices.tolist())

        save_to_gcfs(json_codes, dst_path, use_pipe=True)
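
COERCE_TO_STRING and coerce_notice_values_to_str are defined elsewhere in the module. A minimal sketch, assuming COERCE_TO_STRING is simply a list of top-level keys whose values should be cast to str so BigQuery sees a consistent STRING column (the example entry below is an assumption):

# Hypothetical sketch; the real COERCE_TO_STRING contents are not shown here.
COERCE_TO_STRING = ["calitp_gtfs_validated_by"]  # assumed example entry

def coerce_notice_values_to_str(raw_codes, keys):
    # Cast the listed top-level values to str in place.
    for key in keys:
        if key in raw_codes and raw_codes[key] is not None:
            raw_codes[key] = str(raw_codes[key])
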
Example #4

def main(execution_date, **kwargs):
    # TODO: remove hard-coded project string
    fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")

    bucket = get_bucket()

    f = read_gcfs(f"schedule/{execution_date}/status.csv")
    status = pd.read_csv(f)

    success = status[lambda d: d.status == "success"]

    gtfs_files = []
    for ii, row in success.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        gtfs_url = f"{bucket}/schedule/{execution_date}/{agency_folder}/*"

        gtfs_files.append(fs.glob(gtfs_url))

    res = (
        success[["itp_id", "url_number"]]
        .assign(gtfs_file=gtfs_files)
        .explode("gtfs_file")
        .loc[lambda d: d.gtfs_file != "processed"]
    )

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
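
Every example here goes through the project's read_gcfs and save_to_gcfs wrappers, whose real implementations are not shown. A minimal sketch of how they might wrap gcsfs, consistent with the use_pipe=True calls above (the bucket name is an assumption; the project string is taken from the hard-coded value in this example):

import gcsfs

# Hypothetical wrappers; the real helpers live elsewhere in the project.
_fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")
_BUCKET = "gs://example-calitp-bucket"  # assumed; cf. get_bucket() above

def read_gcfs(path):
    # Return a readable file-like object for a path relative to the bucket.
    return _fs.open(f"{_BUCKET}/{path}", "rb")

def save_to_gcfs(content, path, use_pipe=False):
    # Upload raw bytes; fsspec's pipe() writes the whole payload in one call.
    if use_pipe:
        _fs.pipe(f"{_BUCKET}/{path}", content)
    else:
        with _fs.open(f"{_BUCKET}/{path}", "wb") as f:
            f.write(content)
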
Example #5

def get_successfully_downloaded_feeds(execution_date):
    """Get a list of feeds that were successfully downloaded (as noted in a
    `schedule/{execution_date}/status.csv/` file) for a given execution date.
    """
    f = read_gcfs(f"schedule/{execution_date}/status.csv")
    status = pd.read_csv(f)

    return status[lambda d: d.status == "success"]
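
An illustrative call (not from the source), mirroring how Example #3 iterates over the returned rows:

# Illustrative usage only.
successes = get_successfully_downloaded_feeds(execution_date)
for _, row in successes.iterrows():
    print(row["itp_id"], row["url_number"])
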
Example #6

def _keep_columns(gcs_dirs,
                  dst_dir,
                  filename,
                  required_cols,
                  optional_cols,
                  prepend_ids=True):
    for path in gcs_dirs:
        full_src_path = f"{path}/{filename}"
        full_dst_path = f"{path}/{dst_dir}/{filename}"

        final_header = [*required_cols, *optional_cols]

        # read csv using object dtype, so pandas does not coerce data
        df = pd.read_csv(read_gcfs(full_src_path), dtype="object")

        # preprocess data to include cal-itp id columns ---
        # column names: calitp_itp_id, calitp_url_number
        if prepend_ids:
            # hacky, but parse the trailing {itp_id}_{url_number} directory name
            basename = path.split("/")[-1]
            itp_id, url_number = map(int, basename.split("_"))

            df = df.assign(calitp_itp_id=itp_id, calitp_url_number=url_number)

        # get specified columns, inserting NA columns where needed ----
        df_cols = set(df.columns)
        opt_cols_present = [x for x in optional_cols if x in df_cols]

        df_select = df[[*required_cols, *opt_cols_present]]

        # fill in missing columns ----
        for ii, colname in enumerate(final_header):
            if colname not in df_select:
                print("INSERTING MISSING COLUMN")
                df_select.insert(ii, colname, pd.NA)
            print("SHAPE: ", df_select.shape)

        # save result ----
        csv_result = df_select

        encoded = csv_result.to_csv(index=False).encode()
        save_to_gcfs(encoded, full_dst_path, use_pipe=True)

Example #7

def _keep_columns(
    src_path,
    dst_path,
    colnames,
    itp_id=None,
    url_number=None,
    extracted_at=None,
    **kwargs,
):
    """Save a CSV file with only the needed columns for a particular table.

    Args:
        src_path (string): Location of the input CSV file
        dst_path (string): Location of the output CSV file
        colnames (list): List of the colnames that should be included in output CSV
            file.
        itp_id (string, optional): itp_id to use when saving record. Defaults to None.
        url_number (string, optional): url_number to use when saving record. Defaults to
            None.
        extracted_at (string, optional): date string of extraction time. Defaults to
            None.

    Raises:
        pandas.errors.ParserError: Can be thrown when the given input file is not a
            valid CSV file. Ex: a single row could have too many columns.
    """

    # Read the csv using object dtype, so pandas does not coerce data.
    # The read_csv call inside the try block can raise a
    # pandas.errors.ParserError; catching that error is left to the caller.
    try:
        df = pd.read_csv(
            read_gcfs(src_path), dtype="object", encoding_errors="replace", **kwargs
        )
    except EmptyDataError:
        # in the rare case of a totally empty data file, create a DataFrame
        # with no rows, and the target columns
        df = pd.DataFrame({k: [] for k in colnames})

    if itp_id is not None:
        df["calitp_itp_id"] = itp_id

    if url_number is not None:
        df["calitp_url_number"] = url_number

    # get specified columns, inserting NA columns where needed ----
    df_cols = set(df.columns)
    cols_present = [x for x in colnames if x in df_cols]

    df_select = df.loc[:, cols_present]

    # fill in missing columns ----
    print("DataFrame missing columns: ", set(df_select.columns) - set(colnames))

    for ii, colname in enumerate(colnames):
        if colname not in df_select:
            df_select.insert(ii, colname, pd.NA)

    if extracted_at is not None:
        df_select["calitp_extracted_at"] = extracted_at

    # save result ----
    csv_result = df_select.to_csv(index=False).encode()

    save_to_gcfs(csv_result, dst_path, use_pipe=True)
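
An illustrative call (not from the source): the GTFS trips.txt column names are standard, while agency_path, row, and execution_date are assumed to come from a loop like the one in Example #3:

# Illustrative usage only; paths and row values are assumed.
_keep_columns(
    src_path=f"{agency_path}/trips.txt",
    dst_path=f"{agency_path}/processed/trips.txt",
    colnames=["route_id", "service_id", "trip_id", "trip_headsign"],
    itp_id=row["itp_id"],
    url_number=row["url_number"],
    extracted_at=execution_date.to_date_string(),
)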