Example #1
def test_get_dir(tmp_path):
    # note: tmp_path uses pathlib, prefer it to tmpdir
    root, project, kind, source = "data", "pertinent", "external", "peerless"
    p = tmp_path.joinpath(root, project, kind, source)
    p.mkdir(parents=True, exist_ok=True)

    assert p.is_dir()
    actual = utils.get_dir(tmp_path.joinpath(root), project, kind, source)
    assert actual.is_dir()
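
A minimal sketch of what utils.get_dir might look like, inferred from the test above. This is an assumption, not the project's actual implementation: it joins the path segments into a pathlib.Path and ensures the directory exists.

# Sketch only: utils.get_dir is assumed to join segments and mkdir.
from pathlib import Path

def get_dir(root, *parts):
    """Return root/parts as a Path, creating the directory if needed."""
    p = Path(root).joinpath(*parts)
    p.mkdir(parents=True, exist_ok=True)
    return p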
Example #2
def load_glc_codes():
    """Returns the Geographic Locator Codes (GLCs) for the U.S."""

    codes_path = (utils.get_dir(
        os.getenv("DATA_ROOT"),
        os.getenv("PROJECT_KEY"),
        "external",
        "FRPP_GLC",
    ) / "FRPP_GLC_United_States.csv")
    print(f"codes path = {codes_path}", file=sys.stderr)
    return SparkSession.builder.getOrCreate().read.csv(str(codes_path),
                                                       inferSchema=True,
                                                       header=True)
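
A hedged usage sketch of load_glc_codes. The environment variable values and call site below are illustrative assumptions, not part of the project:

# Illustrative only: the env values are hypothetical placeholders.
import os

os.environ["DATA_ROOT"] = "data"
os.environ["PROJECT_KEY"] = "my_project"

glc_df = load_glc_codes()
glc_df.printSchema()  # column types derived via inferSchema=True

Note that inferSchema=True makes Spark scan the CSV an extra time to derive column types; supplying an explicit schema avoids that pass on large files.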
Example #3
def main():
    """Extracts, transforms and loads the traffic accident data."""

    cmd = get_command()
    if cmd == "Q":
        print("\nExiting")
        sys.exit(0)

    utils.load_env()
    root, project = (os.getenv("DATA_ROOT"), os.getenv("PROJECT_KEY"))
    spark = utils.create_spark_session()
    fars_data_path = utils.get_dir(root, project, "external",
                                   os.getenv("FARS_KEY"))
    make_filenames_case_consistent(fars_data_path)

    if cmd == "A":
        print(f"\nRunning FARS Accident Pipeline\n")
        accident_pipeline(root, project)

    elif cmd == "P":
        print(f"\nRunning FARS Person Pipeline\n")
        person_pipeline(root, project)
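
get_command is not shown in these examples. A plausible sketch, assuming it prompts for the single-letter choices main() branches on (A, P, or Q):

# Hypothetical helper, inferred from main(); the real prompt may differ.
def get_command():
    prompt = "[A]ccident pipeline, [P]erson pipeline, or [Q]uit? "
    cmd = ""
    while cmd not in {"A", "P", "Q"}:
        cmd = input(prompt).strip().upper()
    return cmd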
Example #4
def person_pipeline(root, project):
    """Run the person-level data pipeline."""
    print("Running the Person-level pipeline")
    fars_data_path = utils.get_dir(root, project, "external",
                                   os.getenv("FARS_KEY"))
    print(f"\nRunning locally using FARS data from {fars_data_path}\n")

    files = [
        "ACCIDENT.CSV",
        "PERSON.CSV",
        # "VEHICLE.CSV",
        # "PBTYPE.CSV",
        # "VIOLATN.CSV",
        # "NMCRASH.CSV",
    ]

    dir_yr_dict = build_dir_year_dict(
        find_dirs_containing(files, fars_data_path))
    dfs_year_file = defaultdict(dict)
    for _dir, year in dir_yr_dict.items():
        for _file in files:
            df = read_csv(_dir, _file)
            df = df.toDF(*fix_spaces_in_column_names(df.columns))
            dfs_year_file[year][_file] = df

    files_found = reduce(
        set.intersection,
        [set(dfs_year_file[y].keys()) for y in sorted(dfs_year_file.keys())],
    )

    all_years_by_file, common_cols = {}, {}
    for _file in files_found:
        # get all dataframes for a given datafile
        dfs = [dfs_year_file[yr][_file] for yr in dfs_year_file.keys()]
        common_cols[_file] = find_common_set_of_column_names(dfs)
        # select common columns in all dataframes
        dfs = [df.select(*common_cols[_file]) for df in dfs]
        # union the dataframes together by column name
        all_years_by_file[_file] = reduce(DataFrame.unionByName, dfs)

    # prepare to join by removing duplicate column names
    dup_cols = reduce(set.intersection,
                      map(set, [common_cols[f] for f in files_found]))
    JOIN_ON_COLUMN = "ST_CASE"
    dup_cols.discard(JOIN_ON_COLUMN)
    keep_cols = set(all_years_by_file["ACCIDENT.CSV"].columns) - set(dup_cols)
    all_years_by_file["ACCIDENT.CSV"] = all_years_by_file[
        "ACCIDENT.CSV"].select(*keep_cols)

    # join files
    person_df = all_years_by_file["ACCIDENT.CSV"].join(
        all_years_by_file["PERSON.CSV"], on=[JOIN_ON_COLUMN])

    # extract ped and cyclists (i.e., people not in vehicles)
    # Non-Occupants are identified by vehicle number 0 and are
    # numbered consecutively starting with 1 for each non-motorist.
    ped_cyclist_df = person_df.filter(person_df.VEH_NO == 0)

    # data quality check (compute each count once; count() is a Spark job)
    ped_cyclist_count = ped_cyclist_df.count()
    person_count = person_df.count()
    assert (ped_cyclist_count <
            person_count), "ped_cyclist_df contains too many records"
    assert ped_cyclist_count > 0, "ped_cyclist_df dataframe is empty!"

    # save resulting dataframe for analysis
    output_path = str(
        utils.get_dir(root, project, "interim",
                      "FARS").joinpath("ped_cyclist_df.csv"))
    print(f"\nWriting data to folder {output_path}")
    ped_cyclist_df.write.csv(output_path, mode="overwrite", header=True)
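
Two helpers carry this pipeline but are not shown. The sketches below are assumptions matched to the call sites above: fix_spaces_in_column_names normalizes header whitespace so unionByName can align columns across years, and find_common_set_of_column_names intersects the column sets of several DataFrames.

# Sketches only: signatures match the calls above, bodies are assumptions.
from functools import reduce

def fix_spaces_in_column_names(columns):
    """Strip and collapse whitespace in column names."""
    return ["_".join(c.split()) for c in columns]

def find_common_set_of_column_names(dfs):
    """Return the set of column names present in every DataFrame."""
    return reduce(set.intersection, (set(df.columns) for df in dfs))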
Example #5
def accident_pipeline(root, project):
    """Run the accident pipeline and extract Denver and Seattle specifics."""

    # loop over directories with accident.csv and acc_aux.csv files
    fars_data_path = utils.get_dir(root, project, "external",
                                   os.getenv("FARS_KEY"))
    print(f"\nRunning locally using FARS data from {fars_data_path}\n")

    dir_yr_dict = build_dir_year_dict(
        find_dirs_containing(["ACCIDENT.CSV", "ACC_AUX.CSV"], fars_data_path))
    accident_dfs, acc_aux_dfs, acc_dfs = [], [], []

    # join each pair together
    print("\nProcessing Directories")
    # TODO use map reduce rather than looping
    for count, (_dir, year) in enumerate(dir_yr_dict.items(), start=1):
        # read in csv data and keep columns common to all years
        accident_df = utils.read_csv(
            Path(_dir).joinpath("ACCIDENT.CSV")).select(
                "ST_CASE",
                "CITY",
                "MONTH",
                "DAY",
                "HOUR",
                "MINUTE",
                "DAY_WEEK",
                "LGT_COND",
            )
        # fix minor year to year differences in column naming
        accident_df = accident_df.toDF(
            *fix_spaces_in_column_names(accident_df.columns))
        accident_dfs.append(accident_df)

        # data quality check #1
        assert (accident_df.count() >
                0), f"accident_df dataframe from {_dir} is empty!"

        # read in csv and only keep columns common to all years
        acc_aux_df = utils.read_csv(Path(_dir).joinpath("ACC_AUX.CSV")).select(
            "ST_CASE",
            "FATALS",
            "YEAR",
            "STATE",
            "COUNTY",
            "A_DROWSY",
            "A_PEDAL",
            "A_ROLL",
            "A_MANCOL",
            "A_TOD",
            "A_CRAINJ",
            "A_RD",
            "A_RELRD",
            "A_LT",
            "A_POSBAC",
            "A_CT",
            "A_INTSEC",
            "A_HR",
            "A_DOW",
            "A_SPCRA",
            "A_REGION",
            "A_INTER",
            "A_DIST",
            "A_JUNC",
            "A_PED",
            "A_POLPUR",
            "A_MC",
            "A_RU",
            "A_ROADFC",
            "A_D15_19",
            "A_D15_20",
            "A_D16_19",
            "A_D16_20",
            "A_D16_24",
            "A_D21_24",
            "A_D65PLS",
        )
        acc_aux_df = acc_aux_df.toDF(
            *fix_spaces_in_column_names(acc_aux_df.columns))
        acc_aux_dfs.append(acc_aux_df)

        # data quality check #2
        assert (acc_aux_df.count() >
                0), f"acc_aux_df dataframe from {_dir} is empty!"

        # join dataframes and drop duplicated columns after merge
        acc_df = accident_df.join(acc_aux_df, on="ST_CASE")
        acc_dfs.append(acc_df)

        print(
            f"{count} dir:{_dir}  rows: {acc_df.count():,}  cols: {len(acc_df.columns):,}"
        )

    try:
        # find columns common to all years
        accident_common_cols = find_common_set_of_column_names(accident_dfs)
        print("Common ACCIDENT.CSV Columns:", accident_common_cols)

        acc_aux_common_cols = find_common_set_of_column_names(acc_aux_dfs)
        print("Common ACC_AUX.CSV Columns:", acc_aux_common_cols)

        # append combined accident files for all years
        all_acc_df = reduce(DataFrame.unionByName, acc_dfs)

    except Exception:
        # unionByName raises when column sets differ across years; report
        # the common columns, then re-raise rather than continue with an
        # undefined all_acc_df
        print(f"Use only common ACCIDENT.CSV columns:\n{accident_common_cols}")
        print(f"Use only common ACC_AUX.CSV columns:\n{acc_aux_common_cols}")
        raise

    # show the number of records (count once; each count() is a Spark job)
    all_acc_count = all_acc_df.count()
    print(f"\nNumber of motor vehicle accidents (1982-2018): {all_acc_count:,}")

    # data quality check #3
    assert (
        all_acc_count > 0
    ), "Combined accident and acc_aux table (all_acc_df) dataframe is empty!"

    # save resulting dataframe for analysis
    output_path = str(
        utils.get_dir(root, project, "interim",
                      "FARS").joinpath("all_fatal_accidents_1982_to_2018.csv"))
    print(f"output_path={output_path}")
    all_acc_df.write.csv(output_path, mode="overwrite", header=True)
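
Finally, both pipelines rely on find_dirs_containing and build_dir_year_dict to locate one directory of CSVs per FARS year. A minimal sketch, assuming each year's directory name contains the four-digit year (the real traversal and naming rules may differ):

# Assumed layout: one directory per year, e.g. .../FARS/FARS1982/ACCIDENT.CSV
import re
from pathlib import Path

def find_dirs_containing(filenames, root):
    """Yield directories under root containing every named file."""
    wanted = {name.upper() for name in filenames}
    for d in Path(root).rglob("*"):
        if d.is_dir():
            present = {p.name.upper() for p in d.iterdir() if p.is_file()}
            if wanted <= present:
                yield d

def build_dir_year_dict(dirs):
    """Map each directory path to the four-digit year in its name."""
    return {str(d): int(re.search(r"(19|20)\d{2}", d.name).group())
            for d in dirs}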