def test_make_histograms():
    """Check unit-binned histograms of the test frame against stored JSON.

    One- and two-dimensional features are filled with ``binning='unit'``;
    latitude and longitude get an explicit bin width of 5 with offset 0.
    Each resulting histogram's ``toJson()`` output must equal the matching
    reference attached to the ``pytest`` namespace.
    """
    single_columns = [
        'date', 'isActive', 'age', 'eyeColor',
        'gender', 'company', 'latitude', 'longitude',
    ]
    column_pairs = [['isActive', 'age'], ['latitude', 'longitude']]

    # A fresh dict per coordinate so the two specs never share one object.
    coordinate_specs = {
        axis: {'bin_width': 5, 'bin_offset': 0}
        for axis in ('longitude', 'latitude')
    }

    filled = make_histograms(
        pytest.test_df,
        features=single_columns + column_pairs,
        binning='unit',
        bin_specs=coordinate_specs,
    )

    # (histogram key, reference JSON); note the fixture is spelled ``eyesColor``.
    expectations = [
        ('age', pytest.age),
        ('company', pytest.company),
        ('date', pytest.date),
        ('eyeColor', pytest.eyesColor),
        ('gender', pytest.gender),
        ('isActive', pytest.isActive),
        ('isActive:age', pytest.isActive_age),
        ('latitude', pytest.latitude),
        ('longitude', pytest.longitude),
        ('latitude:longitude', pytest.latitude_longitude),
    ]
    for key, reference in expectations:
        assert filled[key].toJson() == reference
def test_make_histograms_with_time_axis():
    """Check histograms built with an automatically chosen time axis.

    With ``time_axis=True`` the test expects ``"date"`` to be picked as the
    time axis and every feature to be a two-dimensional ``date:<column>``
    histogram.  The bin specifications returned by ``make_histograms`` and
    those recovered through ``get_bin_specs`` must agree for ``date:age``.
    """
    date_bin_width = 751582381944448.0

    def check_age_spec(spec):
        # Expected binning of the ``age`` axis.
        assert spec["bin_width"] == 2.0
        assert spec["bin_offset"] == 9.5

    def check_date_age_specs(specs):
        # First entry describes the date axis, second the age axis.
        assert specs[0]["bin_width"] == date_bin_width
        check_age_spec(specs[1])

    outputs = make_histograms(pytest.test_df, time_axis=True, ret_specs=True)
    hists, features, bin_specs, time_axis, var_dtype = outputs

    assert len(hists) == 20
    assert len(features) == 20
    assert len(bin_specs) == 20
    assert len(var_dtype) == 21
    assert time_axis == "date"

    assert "date:age" in hists
    date_age_hist = hists["date:age"]
    assert date_age_hist.binWidth == date_bin_width

    # Every feature name is "date:<column>".
    for feature in features:
        parts = feature.split(":")
        assert len(parts) == 2 and parts[0] == "date"

    # Every feature carries one spec per axis.
    for axis_specs in bin_specs.values():
        assert len(axis_specs) == 2

    assert "date:age" in bin_specs
    check_date_age_specs(bin_specs["date:age"])

    # get_bin_specs 1: dict of histograms, all axes kept
    recovered = get_bin_specs(hists)
    assert "date:age" in recovered
    check_date_age_specs(recovered["date:age"])

    # get_bin_specs 2: dict of histograms, first (time) axis skipped
    without_time = get_bin_specs(hists, skip_first_axis=True)
    assert "age" in without_time
    check_age_spec(without_time["age"])

    # get_bin_specs 3: single histogram, all axes kept
    check_date_age_specs(get_bin_specs(hists["date:age"]))

    # get_bin_specs 4: single histogram, first (time) axis skipped
    check_age_spec(get_bin_specs(hists["date:age"], skip_first_axis=True))
def test_make_histograms_with_time_axis():
    """Verify time-axis histograms and the bin specs derived from them.

    ``make_histograms`` is called with ``time_axis=True`` and
    ``ret_specs=True``.  The test expects 'date' as the time axis, twenty
    ``date:<column>`` histograms, and identical ``date:age`` binning whether
    it is read from the returned specs or from ``get_bin_specs``.
    """
    expected_date_width = 751582381944448.0
    expected_age_width = 2.0
    expected_age_offset = 9.5

    def assert_two_axis_specs(axis_specs):
        # Index 0 is the date axis, index 1 the age axis.
        assert axis_specs[0]['bin_width'] == expected_date_width
        assert axis_specs[1]['bin_width'] == expected_age_width
        assert axis_specs[1]['bin_offset'] == expected_age_offset

    def assert_age_only_spec(axis_spec):
        assert axis_spec['bin_width'] == expected_age_width
        assert axis_spec['bin_offset'] == expected_age_offset

    hists, features, bin_specs, time_axis, var_dtype = make_histograms(
        pytest.test_df,
        time_axis=True,
        ret_specs=True,
    )

    assert len(hists) == 20
    assert len(features) == 20
    assert len(bin_specs) == 20
    assert len(var_dtype) == 21
    assert time_axis == 'date'

    assert 'date:age' in hists
    assert hists['date:age'].binWidth == expected_date_width

    for name in features:
        pieces = name.split(':')
        assert len(pieces) == 2 and pieces[0] == 'date'
    for per_axis in bin_specs.values():
        assert len(per_axis) == 2

    assert 'date:age' in bin_specs
    assert_two_axis_specs(bin_specs['date:age'])

    # test get_bin_specs 1: all histograms, time axis included
    specs_all = get_bin_specs(hists)
    assert 'date:age' in specs_all
    assert_two_axis_specs(specs_all['date:age'])

    # test get_bin_specs 2: all histograms, time axis dropped
    specs_no_time = get_bin_specs(hists, skip_first_axis=True)
    assert 'age' in specs_no_time
    assert_age_only_spec(specs_no_time['age'])

    # test get_bin_specs 3: one histogram, time axis included
    single_specs = get_bin_specs(hists['date:age'])
    assert_two_axis_specs(single_specs)

    # test get_bin_specs 4: one histogram, time axis dropped
    single_no_time = get_bin_specs(hists['date:age'], skip_first_axis=True)
    assert_age_only_spec(single_no_time)
def test_make_histograms():
    """Compare unit-binned histograms of the test frame with stored JSON.

    Besides the plain columns and two column pairs, the ``transaction``
    column is filled with an explicit ``num``/``low``/``high`` specification,
    while latitude and longitude use a bin width of 5 with offset 0.
    Every histogram's ``toJson()`` must equal its ``pytest.*`` reference.
    """
    one_dimensional = [
        "date", "isActive", "age", "eyeColor",
        "gender", "company", "latitude", "longitude",
    ]
    two_dimensional = [["isActive", "age"], ["latitude", "longitude"]]
    features = one_dimensional + two_dimensional + ["transaction"]

    bin_specs = {"transaction": {'num': 100, 'low': -2000, 'high': 2000}}
    for coordinate in ("longitude", "latitude"):
        # Separate dict per coordinate; the specs are not shared objects.
        bin_specs[coordinate] = {"bin_width": 5, "bin_offset": 0}

    filled = make_histograms(
        pytest.test_df,
        features=features,
        binning="unit",
        bin_specs=bin_specs,
    )

    # (histogram key, reference JSON); note the fixture is spelled ``eyesColor``.
    expectations = (
        ("age", pytest.age),
        ("company", pytest.company),
        ("date", pytest.date),
        ("eyeColor", pytest.eyesColor),
        ("gender", pytest.gender),
        ("isActive", pytest.isActive),
        ("isActive:age", pytest.isActive_age),
        ("latitude", pytest.latitude),
        ("longitude", pytest.longitude),
        ("latitude:longitude", pytest.latitude_longitude),
        ("transaction", pytest.transaction),
    )
    for key, reference in expectations:
        assert filled[key].toJson() == reference
def test_spark_make_histograms(spark_context):
    """Check histograms filled from a Spark DataFrame against stored JSON.

    The shared ``pytest.*`` references are reused, but this test expects the
    histogram name in the form ``b'<feature>'``.  The references are
    deep-copied before the name is adjusted, so the shared fixtures stay
    untouched and other tests do not depend on whether this one ran first.

    Args:
        spark_context: fixture providing ``createDataFrame`` (presumably a
            Spark session; confirm against the fixture definition).
    """
    from copy import deepcopy

    # (feature, shared reference JSON) for every histogram compared below.
    # Note the fixture for ``eyeColor`` is spelled ``eyesColor``.
    shared_references = [
        ("age", pytest.age),
        ("company", pytest.company),
        ("eyeColor", pytest.eyesColor),
        ("gender", pytest.gender),
        ("latitude", pytest.latitude),
        ("longitude", pytest.longitude),
        ("transaction", pytest.transaction),
    ]

    # Adjust private copies only; never write into the shared fixtures.
    expected = {}
    for feature, reference in shared_references:
        adjusted = deepcopy(reference)
        adjusted["data"]["name"] = "b'" + feature + "'"
        expected[feature] = adjusted

    spark_df = spark_context.createDataFrame(pytest.test_df)

    # test make_histograms() function call with spark df
    current_hists = make_histograms(
        spark_df,
        features=[
            "date",
            "isActive",
            "age",
            "eyeColor",
            "gender",
            "company",
            "latitude",
            "longitude",
            ["isActive", "age"],
            ["latitude", "longitude"],
            "transaction",
        ],
        bin_specs={
            "transaction": {"num": 100, "low": -2000, "high": 2000},
            "longitude": {"bin_width": 5.0, "bin_offset": 0.0},
            "latitude": {"bin_width": 5.0, "bin_offset": 0.0},
        },
        binning="unit",
    )

    for feature, _ in shared_references:
        assert current_hists[feature].toJson() == expected[feature]
def test_make_histograms_unit_binning():
    """Check unit binning without a time axis.

    With ``binning="unit"`` and ``time_axis=""`` the test expects 21
    one-dimensional histograms, no derived bin specifications, a ``date``
    bin width of 2592000000000000 and an ``age`` histogram with bin width
    1.0 and origin 0.0.
    """
    outputs = make_histograms(
        pytest.test_df,
        binning="unit",
        time_axis="",
        ret_specs=True,
    )
    hists, features, bin_specs, time_axis, var_dtype = outputs

    assert len(hists) == 21
    assert len(features) == 21
    assert len(bin_specs) == 0
    assert len(var_dtype) == 21
    assert time_axis == ""

    assert "date" in hists
    date_hist = hists["date"]
    assert date_hist.binWidth == 2592000000000000

    # No time axis, so no feature name contains a ":" separator.
    for feature in features:
        assert len(feature.split(":")) == 1

    # Any spec that is present must be a plain dict.
    for spec in bin_specs.values():
        assert isinstance(spec, dict)

    assert "age" in hists
    age_hist = hists["age"]
    assert age_hist.binWidth == 1.0
    assert age_hist.origin == 0.0
def test_make_histograms_no_time_axis():
    """Check default binning when no time axis is requested.

    With ``time_axis=""`` the test expects 21 one-dimensional histograms,
    six bin specifications (each a dict), the given ``date`` bin width, and
    an ``age`` specification of width 2.0 with offset 9.5.
    """
    outputs = make_histograms(pytest.test_df, time_axis="", ret_specs=True)
    hists, features, bin_specs, time_axis, var_dtype = outputs

    assert len(hists) == 21
    assert len(features) == 21
    assert len(bin_specs) == 6
    assert len(var_dtype) == 21
    assert time_axis == ""

    assert "date" in hists
    date_hist = hists["date"]
    assert date_hist.binWidth == 751582381944448.0

    # No time axis, so no feature name contains a ":" separator.
    for feature in features:
        assert len(feature.split(":")) == 1

    # One-dimensional features carry a single spec dict.
    for spec in bin_specs.values():
        assert isinstance(spec, dict)

    assert "age" in bin_specs
    age_spec = bin_specs["age"]
    assert age_spec["bin_width"] == 2.0
    assert age_spec["bin_offset"] == 9.5
def test_make_histograms_no_time_axis():
    """Verify histograms and bin specs produced with an empty time axis.

    ``make_histograms`` is called with ``time_axis=''`` and
    ``ret_specs=True``; every feature must then be one-dimensional and each
    returned bin specification a single dict.
    """
    hists, features, bin_specs, time_axis, var_dtype = make_histograms(
        pytest.test_df,
        time_axis='',
        ret_specs=True,
    )

    assert len(hists) == 21
    assert len(features) == 21
    assert len(bin_specs) == 6
    assert len(var_dtype) == 21
    assert time_axis == ''

    assert 'date' in hists
    assert hists['date'].binWidth == 751582381944448.0

    for name in features:
        pieces = name.split(':')
        assert len(pieces) == 1
    for single_spec in bin_specs.values():
        assert isinstance(single_spec, dict)

    assert 'age' in bin_specs
    age_binning = bin_specs['age']
    assert age_binning['bin_width'] == 2.0
    assert age_binning['bin_offset'] == 9.5
def test_make_histograms_unit_binning():
    """Verify unit binning with an empty time axis.

    ``make_histograms`` is called with ``binning='unit'``, ``time_axis=''``
    and ``ret_specs=True``.  The test expects no bin specifications to be
    returned, one-dimensional features only, and fixed bin widths on the
    ``date`` and ``age`` histograms.
    """
    hists, features, bin_specs, time_axis, var_dtype = make_histograms(
        pytest.test_df,
        binning='unit',
        time_axis='',
        ret_specs=True,
    )

    assert len(hists) == 21
    assert len(features) == 21
    assert len(bin_specs) == 0
    assert len(var_dtype) == 21
    assert time_axis == ''

    assert 'date' in hists
    assert hists['date'].binWidth == 2592000000000000

    for name in features:
        pieces = name.split(':')
        assert len(pieces) == 1
    for single_spec in bin_specs.values():
        assert isinstance(single_spec, dict)

    assert 'age' in hists
    age_hist = hists['age']
    assert age_hist.binWidth == 1.0
    assert age_hist.origin == 0.0
def test_histogram_stitching():
    """Exercise ``stitch_histograms`` across its argument combinations.

    Two histogram sets are built from the same test frame: one with a
    leading ``date`` axis and one without, the latter reusing the bin specs
    of the former minus its first axis.  They are then stitched with and
    without an explicit time axis, with explicit and automatic time-bin
    indices, and in the default and ``"replace"`` modes.
    """
    time_features = sorted(["date:isActive", "date:eyeColor", "date:latitude"])
    flat_features = sorted(["isActive", "eyeColor", "latitude", "age"])
    # Keys expected once a 'date' axis has been added to every flat histogram.
    stitched_flat_features = sorted(time_features + ["date:age"])

    def assert_keys(hist_dict, expected_keys):
        np.testing.assert_array_equal(sorted(hist_dict.keys()), expected_keys)

    time_hists = make_histograms(pytest.test_df, features=time_features)
    shared_specs = get_bin_specs(time_hists, skip_first_axis=True)
    flat_hists = make_histograms(
        pytest.test_df, features=flat_features, bin_specs=shared_specs
    )

    # Put the flat histograms at time bin 50 and stitch onto the time histograms.
    stitched = stitch_histograms(
        hists_basis=time_hists,
        hists_delta=flat_hists,
        time_axis="date",
        time_bin_idx=[50],
    )
    assert_keys(stitched, time_features)
    assert stitched["date:isActive"].entries == 800
    assert stitched["date:isActive"].bins[50].entries == 400

    # Flat onto flat: basis goes to time bin 50, delta to time bin 51.
    stitched = stitch_histograms(
        hists_basis=flat_hists,
        hists_delta=flat_hists,
        time_axis="date",
        time_bin_idx=[50, 51],
    )
    assert_keys(stitched, stitched_flat_features)
    assert stitched["date:age"].entries == 800
    assert stitched["date:age"].bins[50].entries == 400
    assert stitched["date:age"].bins[51].entries == 400

    # Flat onto flat without indices: bins 0 and 1 are assigned automatically.
    stitched = stitch_histograms(
        hists_basis=flat_hists, hists_delta=flat_hists, time_axis="date"
    )
    assert_keys(stitched, stitched_flat_features)
    assert stitched["date:age"].entries == 800
    assert 0 in stitched["date:age"].bins
    assert 1 in stitched["date:age"].bins

    # Flat onto flat with a single index: basis at 50, delta lands at 51.
    stitched = stitch_histograms(
        hists_basis=flat_hists,
        hists_delta=flat_hists,
        time_axis="date",
        time_bin_idx=50,
    )
    assert_keys(stitched, stitched_flat_features)
    assert stitched["date:age"].entries == 800
    assert 51 in stitched["date:age"].bins

    # No time axis given and no common keys: the basis comes back unchanged.
    stitched = stitch_histograms(hists_basis=time_hists, hists_delta=flat_hists)
    assert_keys(stitched, time_features)
    assert stitched["date:latitude"].entries == 400
    assert 50 not in stitched["date:latitude"].bins

    # Time histograms added to themselves.
    stitched = stitch_histograms(hists_basis=time_hists, hists_delta=time_hists)
    assert_keys(stitched, time_features)
    assert stitched["date:latitude"].entries == 800
    assert 50 not in stitched["date:latitude"].bins

    # Flat histograms added to themselves.
    stitched = stitch_histograms(hists_basis=flat_hists, hists_delta=flat_hists)
    assert_keys(stitched, flat_features)
    assert stitched["age"].entries == 800

    # Time histograms replaced by themselves: entry count stays the same.
    stitched = stitch_histograms(
        hists_basis=time_hists, hists_delta=time_hists, mode="replace"
    )
    assert_keys(stitched, time_features)
    assert stitched["date:latitude"].entries == 400

    # Put the flat histograms at time bin 1, replacing what the basis had there.
    stitched = stitch_histograms(
        hists_basis=time_hists,
        hists_delta=flat_hists,
        time_axis="date",
        time_bin_idx=[1],
        mode="replace",
    )
    assert_keys(stitched, time_features)
    assert stitched["date:isActive"].bins[1].entries == 400
    assert stitched["date:isActive"].entries == 777