def test_compare():
    """Compare two frames that differ by one column and one point each.

    Both frames share ``metric_a`` and ``metric_b``; each also carries a
    column the other lacks (``only_1`` / ``only_2``) and one value that is
    ``None`` on the opposite side.
    """
    left_rows = [("99", "2020-04-01", None, 1, 3), ("99", "2020-04-02", 1.1, 2.2, 3.3)]
    right_rows = [("99", "2020-04-01", 1, 2, 3), ("99", "2020-04-02", 1.1, None, 3.3)]

    left_frame = pd.DataFrame(
        left_rows, columns="fips date metric_a metric_b only_1".split()
    )
    left_frame = left_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    right_frame = pd.DataFrame(
        right_rows, columns="fips date metric_a metric_b only_2".split()
    )
    right_frame = right_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    left = DatasetDiff.make(left_frame)
    right = DatasetDiff.make(right_frame)
    left.compare(right)

    # The column that exists on a single side is reported by that side only.
    assert left.my_ts.to_list() == [("only_1", "99")]
    assert right.my_ts.to_list() == [("only_2", "99")]

    # Each side reports the point whose counterpart is None in the other frame.
    expected_left_points = [("metric_b", "99", pd.Timestamp("2020-04-02"))]
    expected_right_points = [("metric_a", "99", pd.Timestamp("2020-04-01"))]
    assert left.my_ts_points.index.to_list() == expected_left_points
    assert right.my_ts_points.index.to_list() == expected_right_points

    # Each shared metric has exactly one date with a value on both sides.
    expected_diffs = {
        ("metric_a", "99"): {"diff": 0, "has_overlap": True, "points_overlap": 1},
        ("metric_b", "99"): {"diff": 1 / 3, "has_overlap": True, "points_overlap": 1},
    }
    assert left.ts_diffs.to_dict(orient="index") == expected_diffs
def test_compare():
    """Compare two frames that differ by one column and one point each.

    Both frames share ``metric_a`` and ``metric_b``; each also carries a
    column the other lacks (``only_1`` / ``only_2``) and one value that is
    ``None`` on the opposite side.
    """
    metric_a = FieldName("metric_a")
    metric_b = FieldName("metric_b")
    columns = [CommonFields.LOCATION_ID, CommonFields.DATE, metric_a, metric_b]

    left_rows = [("99", "2020-04-01", None, 1, 3), ("99", "2020-04-02", 1.1, 2.2, 3.3)]
    right_rows = [("99", "2020-04-01", 1, 2, 3), ("99", "2020-04-02", 1.1, None, 3.3)]

    left_frame = pd.DataFrame(left_rows, columns=columns + ["only_1"])
    left_frame = left_frame.set_index(common_df_diff.TIMESERIES_KEYS)
    right_frame = pd.DataFrame(right_rows, columns=columns + ["only_2"])
    right_frame = right_frame.set_index(common_df_diff.TIMESERIES_KEYS)

    left = DatasetDiff.make(left_frame)
    right = DatasetDiff.make(right_frame)
    left.compare(right)

    # The column that exists on a single side is reported by that side only.
    assert left.my_ts.to_list() == [("only_1", "99")]
    assert right.my_ts.to_list() == [("only_2", "99")]

    # Each side reports the point whose counterpart is None in the other frame.
    expected_left_points = [(metric_b, "99", pd.Timestamp("2020-04-02"))]
    expected_right_points = [(metric_a, "99", pd.Timestamp("2020-04-01"))]
    assert left.my_ts_points.index.to_list() == expected_left_points
    assert right.my_ts_points.index.to_list() == expected_right_points

    # Each shared metric has exactly one date with a value on both sides.
    expected_diffs = {
        (metric_a, "99"): {"diff": 0, "has_overlap": True, "points_overlap": 1},
        (metric_b, "99"): {"diff": 1 / 3, "has_overlap": True, "points_overlap": 1},
    }
    assert left.ts_diffs.to_dict(orient="index") == expected_diffs
def test_drop_duplicates():
    """Check that rows sharing an index key are recorded as dropped duplicates.

    The left frame repeats the key ("99", "2020-04-01") with two different
    values; the test expects both rows in ``duplicates_dropped`` and only
    the 2020-04-03 point to count as overlap with the right frame.
    """
    left_rows = [("99", "2020-04-01", 1), ("99", "2020-04-01", 1.1), ("99", "2020-04-03", 3)]
    right_rows = [("99", "2020-04-02", 2), ("99", "2020-04-03", 3)]

    left_frame = pd.DataFrame(left_rows, columns="fips date metric_a".split())
    left_frame = left_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    right_frame = pd.DataFrame(right_rows, columns="fips date metric_a".split())
    right_frame = right_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    left = DatasetDiff.make(left_frame)
    right = DatasetDiff.make(right_frame)

    # Checked before compare(): the duplicates are already recorded by make().
    expected_dropped = [
        (("99", "2020-04-01"), 1.0),
        (("99", "2020-04-01"), 1.1),
    ]
    assert list(left.duplicates_dropped.itertuples()) == expected_dropped

    left.compare(right)

    assert left.my_ts.to_list() == []
    assert right.my_ts.to_list() == []

    expected_diffs = {
        ("metric_a", "99"): {"diff": 0, "has_overlap": True, "points_overlap": 1},
    }
    assert left.ts_diffs.to_dict(orient="index") == expected_diffs
def test_drop_duplicates():
    """Check that rows sharing an index key are recorded as dropped duplicates.

    The left frame repeats the key ("99", "2020-04-01") with two different
    values; the test expects both rows in ``duplicates_dropped`` and only
    the 2020-04-03 point to count as overlap with the right frame.
    """
    metric_a = FieldName("metric_a")
    columns = [CommonFields.LOCATION_ID, CommonFields.DATE, metric_a]

    left_rows = [("99", "2020-04-01", 1), ("99", "2020-04-01", 1.1), ("99", "2020-04-03", 3)]
    right_rows = [("99", "2020-04-02", 2), ("99", "2020-04-03", 3)]

    left_frame = pd.DataFrame(left_rows, columns=columns)
    left_frame = left_frame.set_index(common_df_diff.TIMESERIES_KEYS)
    right_frame = pd.DataFrame(right_rows, columns=columns)
    right_frame = right_frame.set_index(common_df_diff.TIMESERIES_KEYS)

    left = DatasetDiff.make(left_frame)
    right = DatasetDiff.make(right_frame)

    # Checked before compare(): the duplicates are already recorded by make().
    expected_dropped = [
        (("99", "2020-04-01"), 1.0),
        (("99", "2020-04-01"), 1.1),
    ]
    assert list(left.duplicates_dropped.itertuples()) == expected_dropped

    left.compare(right)

    assert left.my_ts.to_list() == []
    assert right.my_ts.to_list() == []

    expected_diffs = {
        (metric_a, "99"): {"diff": 0, "has_overlap": True, "points_overlap": 1},
    }
    assert left.ts_diffs.to_dict(orient="index") == expected_diffs
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    """Round-trip the NYC subset through persist/load and diff the two copies.

    Args:
        tmp_path: Directory the dataset is persisted into.
        nyc_fips: FIPS code used to select the subset.
    """
    full_dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = full_dataset.get_subset(None, fips=nyc_fips)

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)
    reloaded = pointer.load_dataset()

    reloaded_diff = DatasetDiff.make(reloaded.data)
    original_diff = DatasetDiff.make(timeseries_nyc.data)
    reloaded_diff.compare(original_diff)

    # The reloaded copy must not contain any time series the original lacks.
    assert len(reloaded_diff.my_ts) == 0
def csv_diff(csv_path_left, csv_path_right):
    """Diff two CSV files and print the result for each side.

    Args:
        csv_path_left: Path of the first CSV file.
        csv_path_right: Path of the second CSV file.
    """
    left_diff = DatasetDiff.make(common_df.read_csv(csv_path_left))
    right_diff = DatasetDiff.make(common_df.read_csv(csv_path_right))
    left_diff.compare(right_diff)

    # Print each file name followed by that side's diff summary.
    for path, diff in ((csv_path_left, left_diff), (csv_path_right, right_diff)):
        print(f"File: {path}")
        print(diff)
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    """Round-trip the NYC region through persist/load and diff the two copies.

    Args:
        tmp_path: Directory the dataset is persisted into.
        nyc_fips: FIPS code used to build the region that is selected.
    """
    nyc_region = Region.from_fips(nyc_fips)
    full_dataset = combined_datasets.load_us_timeseries_dataset()
    nyc_data = full_dataset.get_one_region(nyc_region).data
    timeseries_nyc = TimeseriesDataset(nyc_data)

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)
    reloaded = pointer.load_dataset()

    reloaded_diff = DatasetDiff.make(reloaded.data)
    original_diff = DatasetDiff.make(timeseries_nyc.data)
    reloaded_diff.compare(original_diff)

    # The reloaded copy must not contain any time series the original lacks.
    assert len(reloaded_diff.my_ts) == 0
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    """Round-trip the NYC region through persist/read and diff the two copies.

    Args:
        tmp_path: Directory the dataset is persisted into.
        nyc_fips: FIPS code used to build the region that is selected.
    """
    nyc_region = Region.from_fips(nyc_fips)
    full_dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = full_dataset.get_regions_subset([nyc_region])

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)
    reloaded = MultiRegionDataset.read_from_pointer(pointer)

    reloaded_diff = DatasetDiff.make(reloaded.timeseries)
    original_diff = DatasetDiff.make(timeseries_nyc.timeseries)
    reloaded_diff.compare(original_diff)

    # The reloaded copy must not contain any time series the original lacks.
    assert len(reloaded_diff.my_ts) == 0
def test_zero_value():
    """Check that two identical all-zero series compare with a diff of 0."""
    location_id = pipeline.Region.from_state("TX").location_id
    metric_a = FieldName("metric_a")

    zero_rows = [(location_id, "2020-04-01", 0), (location_id, "2020-04-02", 0)]
    left_frame = pd.DataFrame(
        zero_rows, columns=[CommonFields.LOCATION_ID, CommonFields.DATE, metric_a]
    )
    left_frame = left_frame.set_index(common_df_diff.TIMESERIES_KEYS)
    right_frame = left_frame.copy()

    left = DatasetDiff.make(left_frame)
    right = DatasetDiff.make(right_frame)
    left.compare(right)

    assert left.my_ts.to_list() == []
    assert right.my_ts.to_list() == []

    # Both dates overlap and both values are zero on each side.
    expected_diffs = {
        (metric_a, location_id): {"diff": 0, "has_overlap": True, "points_overlap": 2},
    }
    assert left.ts_diffs.to_dict(orient="index") == expected_diffs
def test_zero_value():
    """Check that two identical all-zero series compare with a diff of 0."""
    left_frame = pd.DataFrame(
        [("99", "2020-04-01", 0), ("99", "2020-04-02", 0)],
        columns="fips date metric_a".split(),
    )
    left_frame = left_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    right_frame = pd.DataFrame(
        [("99", "2020-04-01", 0), ("99", "2020-04-02", 0)],
        columns="fips date metric_a".split(),
    )
    right_frame = right_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    left = DatasetDiff.make(left_frame)
    right = DatasetDiff.make(right_frame)
    left.compare(right)

    assert left.my_ts.to_list() == []
    assert right.my_ts.to_list() == []

    # Both dates overlap and both values are zero on each side.
    expected_diffs = {
        ("metric_a", "99"): {"diff": 0, "has_overlap": True, "points_overlap": 2},
    }
    assert left.ts_diffs.to_dict(orient="index") == expected_diffs
def csv_diff(csv_path_or_rev_left, csv_path_right):
    """Diff a CSV file against another file or against a git revision.

    Args:
        csv_path_or_rev_left: Path of an existing CSV file. When no file
            exists at that path, the value is handed to ``repo.commit`` and
            the bytes for ``csv_path_right`` at that commit are fetched with
            ``read_data_for_commit``.
        csv_path_right: Path of the CSV file on the right side of the diff.
    """
    left_path = pathlib.Path(csv_path_or_rev_left)
    right_path = pathlib.Path(csv_path_right)

    if not left_path.exists():
        # Not a file on disk: treat the argument as a revision of the right file.
        repo = git.Repo(dataset_utils.REPO_ROOT)
        commit = repo.commit(csv_path_or_rev_left)
        left_data = read_data_for_commit(repo, right_path, commit)
    else:
        left_data = left_path.read_bytes()

    left_diff = DatasetDiff.make(common_df.read_csv(BytesIO(left_data)))
    right_diff = DatasetDiff.make(common_df.read_csv(csv_path_right))
    left_diff.compare(right_diff)

    # Print each source label followed by that side's diff summary.
    for label, diff in ((csv_path_or_rev_left, left_diff), (csv_path_right, right_diff)):
        print(f"File: {label}")
        print(diff)