예제 #1
0
def test_write_csv_extra_columns_dropped():
    """Columns outside CommonFields are dropped (and logged) before the CSV is written."""
    columns = [CommonFields.DATE, CommonFields.FIPS, "extra1", CommonFields.CASES, "extra2"]
    empty_frame = pd.DataFrame([], columns=columns).set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    expected_events = [
        "Dropping columns not in CommonFields",
        "Writing DataFrame",
    ]

    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        log = structlog.get_logger()
        trimmed = only_common_columns(empty_frame, log)
        common_df.write_csv(trimmed, tmp.path, log)
        # Only the header row is expected: the frame has no rows and the extra columns are gone.
        written = tmp.file.read()
        assert "fips,date,cases\n" == written

    events = [entry["event"] for entry in logs]
    assert events == expected_events
예제 #2
0
    def to_csv(self, path: pathlib.Path):
        """Write this object's data to a CSV file indexed by FIPS.

        Args:
            path: Destination of the CSV file.
        """
        # common_df.write_csv needs a date index field, which this data lacks,
        # so the CSV is produced here directly.
        indexed = self.data.set_index(CommonFields.FIPS)
        normalized = indexed.replace({pd.NA: np.nan}).convert_dtypes()
        logger = structlog.get_logger()
        # Restricting to the common columns also drops `index`.
        common_only = common_df.only_common_columns(normalized, logger)
        ordered = common_df.sort_common_field_columns(common_only)
        ordered = ordered.sort_index()
        ordered.to_csv(
            path,
            index=True,
            float_format="%.12g",
            date_format="%Y-%m-%d",
        )
            *sorted(rename.items(), key=lambda f_c: common_order[f_c[1]]))
        # Copy only columns in `rename.keys()` to a new DataFrame and rename.
        data = data.loc[:, list(names_in)].rename(columns=rename)
        if col_not_in_fields_or_common:
            self.log.warning("Removing columns not in CommonFields",
                             columns=col_not_in_fields_or_common)

        return data


def remove_duplicate_city_data(data):
    """Fill early county values from city data and drop later city-level rows.

    Rows dated before 2020-03-23 get their "county" column overwritten in place
    on `data` with the result of `fill_missing_county_with_city`. The returned
    frame is a copy holding every row before that date plus the later rows
    whose "city" is null.
    """
    # City data before 3-23 was not duplicated, so the county field is filled from the city.
    before_march_23 = data.date < "2020-03-23"
    county_values = data.loc[before_march_23].apply(fill_missing_county_with_city, axis=1)
    data.loc[before_march_23, "county"] = county_values

    # Later city rows are duplicated in the county rows, so only rows without a city are kept.
    later_without_city = ~before_march_23 & data["city"].isnull()
    rows_to_keep = before_march_23 | later_without_city
    return data.loc[rows_to_keep].copy()


if __name__ == "__main__":
    # Script entry point: transform the data under DATA_ROOT and write the
    # common-columns result to cases-cds/timeseries-common.csv.
    common_init.configure_logging()
    log = structlog.get_logger()
    transformer = CovidDataScraperTransformer.make_with_data_root(DATA_ROOT)
    common_only = only_common_columns(transformer.transform(), log)
    output_path = DATA_ROOT / "cases-cds" / "timeseries-common.csv"
    write_df_as_csv(common_only, output_path, log)