예제 #1
0
def test_compare():
    """Compare two frames that differ by one column and by two data points.

    The left frame has an ``only_1`` column and no ``metric_a`` value on
    2020-04-01; the right frame has an ``only_2`` column and no ``metric_b``
    value on 2020-04-02. The assertions pin what ``compare`` reports for
    each side in ``my_ts``, ``my_ts_points`` and ``ts_diffs``.
    """
    left_rows = [
        ("99", "2020-04-01", None, 1, 3),
        ("99", "2020-04-02", 1.1, 2.2, 3.3),
    ]
    left_frame = pd.DataFrame(
        left_rows, columns="fips date metric_a metric_b only_1".split()
    )
    left_frame = left_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    right_rows = [
        ("99", "2020-04-01", 1, 2, 3),
        ("99", "2020-04-02", 1.1, None, 3.3),
    ]
    right_frame = pd.DataFrame(
        right_rows, columns="fips date metric_a metric_b only_2".split()
    )
    right_frame = right_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    left_differ = DatasetDiff.make(left_frame)
    right_differ = DatasetDiff.make(right_frame)

    left_differ.compare(right_differ)

    # Each side reports the column that the other side lacks.
    assert left_differ.my_ts.to_list() == [("only_1", "99")]
    assert right_differ.my_ts.to_list() == [("only_2", "99")]

    # Each side reports the single point that is missing on the other side.
    left_only_points = left_differ.my_ts_points.index.to_list()
    assert left_only_points == [("metric_b", "99", pd.Timestamp("2020-04-02"))]
    right_only_points = right_differ.my_ts_points.index.to_list()
    assert right_only_points == [("metric_a", "99", pd.Timestamp("2020-04-01"))]

    # Both shared metrics overlap in exactly one point.
    expected_diffs = {
        ("metric_a", "99"): {"diff": 0, "has_overlap": True, "points_overlap": 1},
        ("metric_b", "99"): {"diff": 1 / 3, "has_overlap": True, "points_overlap": 1},
    }
    assert left_differ.ts_diffs.to_dict(orient="index") == expected_diffs
예제 #2
0
def test_compare():
    """Compare two frames that differ by one column and by two data points.

    Both frames share the location, date, ``metric_a`` and ``metric_b``
    columns; the left one adds ``only_1`` and the right one adds ``only_2``.
    The assertions pin what ``compare`` reports for each side in ``my_ts``,
    ``my_ts_points`` and ``ts_diffs``.
    """
    metric_a = FieldName("metric_a")
    metric_b = FieldName("metric_b")
    shared_columns = [CommonFields.LOCATION_ID, CommonFields.DATE, metric_a, metric_b]

    left_rows = [
        ("99", "2020-04-01", None, 1, 3),
        ("99", "2020-04-02", 1.1, 2.2, 3.3),
    ]
    left_frame = pd.DataFrame(left_rows, columns=[*shared_columns, "only_1"])
    left_frame = left_frame.set_index(common_df_diff.TIMESERIES_KEYS)

    right_rows = [
        ("99", "2020-04-01", 1, 2, 3),
        ("99", "2020-04-02", 1.1, None, 3.3),
    ]
    right_frame = pd.DataFrame(right_rows, columns=[*shared_columns, "only_2"])
    right_frame = right_frame.set_index(common_df_diff.TIMESERIES_KEYS)

    left_differ = DatasetDiff.make(left_frame)
    right_differ = DatasetDiff.make(right_frame)

    left_differ.compare(right_differ)

    # Each side reports the column that the other side lacks.
    assert left_differ.my_ts.to_list() == [("only_1", "99")]
    assert right_differ.my_ts.to_list() == [("only_2", "99")]

    # Each side reports the single point that is missing on the other side.
    left_only_points = left_differ.my_ts_points.index.to_list()
    assert left_only_points == [(metric_b, "99", pd.Timestamp("2020-04-02"))]
    right_only_points = right_differ.my_ts_points.index.to_list()
    assert right_only_points == [(metric_a, "99", pd.Timestamp("2020-04-01"))]

    # Both shared metrics overlap in exactly one point.
    expected_diffs = {
        (metric_a, "99"): {"diff": 0, "has_overlap": True, "points_overlap": 1},
        (metric_b, "99"): {"diff": 1 / 3, "has_overlap": True, "points_overlap": 1},
    }
    assert left_differ.ts_diffs.to_dict(orient="index") == expected_diffs
예제 #3
0
def test_drop_duplicates():
    """Check that rows sharing an index key are reported and left out of the diff.

    The left frame holds two rows for 2020-04-01. The test expects both of
    them in ``duplicates_dropped`` and expects the later comparison to find
    a single overlapping point (2020-04-03) with no difference.
    """
    column_names = "fips date metric_a".split()

    left_rows = [
        ("99", "2020-04-01", 1),
        ("99", "2020-04-01", 1.1),
        ("99", "2020-04-03", 3),
    ]
    left_frame = pd.DataFrame(left_rows, columns=column_names)
    left_frame = left_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    right_rows = [("99", "2020-04-02", 2), ("99", "2020-04-03", 3)]
    right_frame = pd.DataFrame(right_rows, columns=column_names)
    right_frame = right_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    left_differ = DatasetDiff.make(left_frame)
    right_differ = DatasetDiff.make(right_frame)

    # Both rows of the duplicated key are recorded, not just the extra one.
    dropped_rows = list(left_differ.duplicates_dropped.itertuples())
    assert dropped_rows == [
        (("99", "2020-04-01"), 1.0),
        (("99", "2020-04-01"), 1.1),
    ]

    left_differ.compare(right_differ)

    assert left_differ.my_ts.to_list() == []
    assert right_differ.my_ts.to_list() == []

    expected_diffs = {
        ("metric_a", "99"): {"diff": 0, "has_overlap": True, "points_overlap": 1},
    }
    assert left_differ.ts_diffs.to_dict(orient="index") == expected_diffs
예제 #4
0
def test_drop_duplicates():
    """Check that rows sharing an index key are reported and left out of the diff.

    The left frame holds two rows for 2020-04-01. The test expects both of
    them in ``duplicates_dropped`` and expects the later comparison to find
    a single overlapping point (2020-04-03) with no difference.
    """
    metric_a = FieldName("metric_a")
    column_names = [CommonFields.LOCATION_ID, CommonFields.DATE, metric_a]

    left_rows = [
        ("99", "2020-04-01", 1),
        ("99", "2020-04-01", 1.1),
        ("99", "2020-04-03", 3),
    ]
    left_frame = pd.DataFrame(left_rows, columns=column_names)
    left_frame = left_frame.set_index(common_df_diff.TIMESERIES_KEYS)

    right_rows = [("99", "2020-04-02", 2), ("99", "2020-04-03", 3)]
    right_frame = pd.DataFrame(right_rows, columns=column_names)
    right_frame = right_frame.set_index(common_df_diff.TIMESERIES_KEYS)

    left_differ = DatasetDiff.make(left_frame)
    right_differ = DatasetDiff.make(right_frame)

    # Both rows of the duplicated key are recorded, not just the extra one.
    dropped_rows = list(left_differ.duplicates_dropped.itertuples())
    assert dropped_rows == [
        (("99", "2020-04-01"), 1.0),
        (("99", "2020-04-01"), 1.1),
    ]

    left_differ.compare(right_differ)

    assert left_differ.my_ts.to_list() == []
    assert right_differ.my_ts.to_list() == []

    expected_diffs = {
        (metric_a, "99"): {"diff": 0, "has_overlap": True, "points_overlap": 1},
    }
    assert left_differ.ts_diffs.to_dict(orient="index") == expected_diffs
예제 #5
0
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    """Persist the NYC subset, load it back and diff it against the original.

    The test passes when the reloaded side reports no entries in ``my_ts``.
    """
    full_dataset = combined_datasets.load_us_timeseries_dataset()
    nyc_subset = full_dataset.get_subset(None, fips=nyc_fips)

    dataset_pointer = combined_dataset_utils.persist_dataset(nyc_subset, tmp_path)
    reloaded = dataset_pointer.load_dataset()

    reloaded_differ = DatasetDiff.make(reloaded.data)
    original_differ = DatasetDiff.make(nyc_subset.data)
    reloaded_differ.compare(original_differ)

    assert len(reloaded_differ.my_ts) == 0
예제 #6
0
def csv_diff(csv_path_left, csv_path_right):
    """Diff two CSV files and print a report for each of them.

    Both files are read with ``common_df.read_csv``, wrapped in a
    ``DatasetDiff`` and compared; each file name is then printed, followed
    by its differ.
    """
    paths = (csv_path_left, csv_path_right)

    # Read both files before building any differ, left side first.
    frames = [common_df.read_csv(path) for path in paths]
    left_differ, right_differ = [DatasetDiff.make(frame) for frame in frames]
    left_differ.compare(right_differ)

    for path, differ in zip(paths, (left_differ, right_differ)):
        print(f"File: {path}")
        print(differ)
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    """Persist the NYC timeseries, load it back and diff it against the original.

    The test passes when the reloaded side reports no entries in ``my_ts``.
    """
    nyc_region = Region.from_fips(nyc_fips)
    full_dataset = combined_datasets.load_us_timeseries_dataset()
    nyc_data = full_dataset.get_one_region(nyc_region).data
    nyc_timeseries = TimeseriesDataset(nyc_data)

    dataset_pointer = combined_dataset_utils.persist_dataset(nyc_timeseries, tmp_path)
    reloaded = dataset_pointer.load_dataset()

    reloaded_differ = DatasetDiff.make(reloaded.data)
    original_differ = DatasetDiff.make(nyc_timeseries.data)
    reloaded_differ.compare(original_differ)

    assert len(reloaded_differ.my_ts) == 0
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    """Persist the NYC region subset, read it back and diff it against the original.

    The test passes when the reloaded side reports no entries in ``my_ts``.
    """
    nyc_region = Region.from_fips(nyc_fips)
    full_dataset = combined_datasets.load_us_timeseries_dataset()
    nyc_subset = full_dataset.get_regions_subset([nyc_region])

    dataset_pointer = combined_dataset_utils.persist_dataset(nyc_subset, tmp_path)
    reloaded = MultiRegionDataset.read_from_pointer(dataset_pointer)

    reloaded_differ = DatasetDiff.make(reloaded.timeseries)
    original_differ = DatasetDiff.make(nyc_subset.timeseries)
    reloaded_differ.compare(original_differ)

    assert len(reloaded_differ.my_ts) == 0
예제 #9
0
def test_zero_value():
    """Diff a frame whose only metric is all zeros against a copy of itself.

    The expected result is no one-sided series and a ``diff`` of 0 across
    both overlapping points.
    """
    location_id = pipeline.Region.from_state("TX").location_id
    metric_a = FieldName("metric_a")

    zero_rows = [
        (location_id, "2020-04-01", 0),
        (location_id, "2020-04-02", 0),
    ]
    column_names = [CommonFields.LOCATION_ID, CommonFields.DATE, metric_a]
    left_frame = pd.DataFrame(zero_rows, columns=column_names)
    left_frame = left_frame.set_index(common_df_diff.TIMESERIES_KEYS)

    right_frame = left_frame.copy()

    left_differ = DatasetDiff.make(left_frame)
    right_differ = DatasetDiff.make(right_frame)
    left_differ.compare(right_differ)

    assert left_differ.my_ts.to_list() == []
    assert right_differ.my_ts.to_list() == []

    expected_diffs = {
        (metric_a, location_id): {"diff": 0, "has_overlap": True, "points_overlap": 2},
    }
    assert left_differ.ts_diffs.to_dict(orient="index") == expected_diffs
예제 #10
0
def test_zero_value():
    """Diff two separately built frames whose only metric is all zeros.

    The expected result is no one-sided series and a ``diff`` of 0 across
    both overlapping points.
    """
    left_rows = [("99", "2020-04-01", 0), ("99", "2020-04-02", 0)]
    left_frame = pd.DataFrame(left_rows, columns="fips date metric_a".split())
    left_frame = left_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    right_rows = [("99", "2020-04-01", 0), ("99", "2020-04-02", 0)]
    right_frame = pd.DataFrame(right_rows, columns="fips date metric_a".split())
    right_frame = right_frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    left_differ = DatasetDiff.make(left_frame)
    right_differ = DatasetDiff.make(right_frame)
    left_differ.compare(right_differ)

    assert left_differ.my_ts.to_list() == []
    assert right_differ.my_ts.to_list() == []

    expected_diffs = {
        ("metric_a", "99"): {"diff": 0, "has_overlap": True, "points_overlap": 2},
    }
    assert left_differ.ts_diffs.to_dict(orient="index") == expected_diffs
예제 #11
0
def csv_diff(csv_path_or_rev_left, csv_path_right):
    """Diff two CSV files and print a report for each side.

    When ``csv_path_or_rev_left`` names an existing path, its bytes are read
    directly. Otherwise the value is handed to ``repo.commit`` and the left
    side is fetched with ``read_data_for_commit`` using the right-hand path.
    """
    left_candidate = pathlib.Path(csv_path_or_rev_left)
    right_file = pathlib.Path(csv_path_right)

    if not left_candidate.exists():
        # Not a file on disk: treat the argument as a revision of the right file.
        repo = git.Repo(dataset_utils.REPO_ROOT)
        commit = repo.commit(csv_path_or_rev_left)
        left_bytes = read_data_for_commit(repo, right_file, commit)
    else:
        left_bytes = left_candidate.read_bytes()

    left_frame = common_df.read_csv(BytesIO(left_bytes))
    right_frame = common_df.read_csv(csv_path_right)

    left_differ = DatasetDiff.make(left_frame)
    right_differ = DatasetDiff.make(right_frame)
    left_differ.compare(right_differ)

    reports = (
        (csv_path_or_rev_left, left_differ),
        (csv_path_right, right_differ),
    )
    for label, differ in reports:
        print(f"File: {label}")
        print(differ)