def test_timestamp_distribution_blue(length):
    colors = (
        pd.read_csv("data/region_data.csv", usecols=["region", "color"])
        .set_index("region")
        .color.to_dict()
    )

    votes = generate_votes(length)
    votes["color"] = votes["region"].apply(lambda x: colors[x])
    blue_votes = votes[votes["color"] == "blue"].copy()
    blue_votes["cnt"] = 1
    blue_votes["hour"] = blue_votes["timestamp"].dt.hour

    expected = pd.DataFrame(
        {
            "hour": list(range(8, 20)),
            "weight": np.concatenate([np.ones(9), 3 * np.ones(3)]) / 18,
        }
    )
    actual = (
        blue_votes.groupby("hour").cnt.agg("count") / blue_votes.shape[0]
    ).reset_index()

    joined = pd.merge(expected, actual, on="hour")
    joined["diff"] = np.abs(joined["weight"] - joined["cnt"])

    assert joined["diff"].max() < 0.05
def test_votes_columns(length):

    data = generate_votes(length)
    assert (length == 0 and data.empty) or list(data.columns) == [
        "timestamp",
        "id",
        "region",
        "vote",
    ]
def test_regions_distribution(length):
    expected = pd.read_csv("data/region_data.csv", usecols=["region", "percent"])
    regions = pd.DataFrame(generate_votes(length)["region"])
    regions["cnt"] = 1
    actual = (regions.groupby("region").agg("count") / length).reset_index()

    joined = pd.merge(expected, actual, on="region")
    assert joined.shape == (51, 3)

    joined["diff"] = np.abs(joined["percent"] - joined["cnt"])
    assert joined["diff"].max() < 0.05
def test_vote_distribution_blue(length):

    colors = (
        pd.read_csv("data/region_data.csv", usecols=["region", "color"])
        .set_index("region")
        .color.to_dict()
    )
    votes = generate_votes(length)
    votes["color"] = votes["region"].apply(lambda x: colors[x])
    blue_votes = votes[votes["color"] == "blue"].copy()
    blue_votes["cnt"] = 1

    expected = pd.DataFrame(
        {"vote": ["yellow", "red", "blue"], "weight": [0.01, 0.47, 0.52]}
    )
    actual = (
        blue_votes.groupby("vote").cnt.agg("count") / blue_votes.shape[0]
    ).reset_index()

    joined = pd.merge(expected, actual, on="vote")
    joined["diff"] = np.abs(joined["weight"] - joined["cnt"])

    assert joined["diff"].max() < 0.05
def test_votes_have_three_colours(length):
    expected = {"yellow", "blue", "red"}
    actual = set(generate_votes(length)["vote"].unique())
    assert expected == actual
def test_all_regions_appear(length):
    expected_regions = set(pd.read_csv("data/region_data.csv").region)
    actual_regions = set(generate_votes(length)["region"].unique())
    assert expected_regions == actual_regions
def test_timestamps_have_hours_within_range(length):
    hours = generate_votes(length)["timestamp"].dt.hour.unique()
    assert all(hour in range(8, 21) for hour in hours)
def test_timestamps_have_constant_date(length):
    dates = list(generate_votes(length)["timestamp"].dt.date.unique())
    assert (length == 0 and not dates) or dates == [pd.Timestamp("2020-12-10")]
def test_ids_have_no_repetitions(length):
    assert generate_votes(length)["id"].drop_duplicates().shape[0] == length
def test_id_lengths(length):
    string_lengths = generate_votes(length)["id"].apply(lambda x: len(x))
    assert all(uid_len == 36 for uid_len in string_lengths)
def test_generated_as_many_votes_as_requested(length):
    votes = generate_votes(length)
    assert (length == 0 and votes.empty) or generate_votes(length).shape == (length, 4)
示例#12
0
def generate_one_file(index: int, n_rows: int, path_prefix: str) -> None:
    path = f"{path_prefix}_{index}.csv"
    tic = time.time()
    generate_votes(n_rows).to_csv(path, index=False)
    tac = time.time()
    print(f"Generated {n_rows} rows in file: {path}\t{tac - tic} seconds")