def test_get_histograms(spark_co):
    pytest.age["data"]["name"] = "b'age'"
    pytest.company["data"]["name"] = "b'company'"
    pytest.eyesColor["data"]["name"] = "b'eyeColor'"
    pytest.gender["data"]["name"] = "b'gender'"
    pytest.isActive["data"]["name"] = "b'isActive'"
    pytest.latitude["data"]["name"] = "b'latitude'"
    pytest.longitude["data"]["name"] = "b'longitude'"
    pytest.transaction["data"]["name"] = "b'transaction'"
    pytest.latitude_longitude["data"]["name"] = "b'latitude:longitude'"
    pytest.latitude_longitude["data"]["bins:name"] = "unit_func"

    spark = spark_co
    spark_df = spark.createDataFrame(pytest.test_df)

    spark_filler = SparkHistogrammar(
        features=[
            "date",
            "isActive",
            "age",
            "eyeColor",
            "gender",
            "company",
            "latitude",
            "longitude",
            ["isActive", "age"],
            ["latitude", "longitude"],
            "transaction",
        ],
        bin_specs={
            "transaction": {"num": 100, "low": -2000, "high": 2000},
            "longitude": {"bin_width": 5.0, "bin_offset": 0.0},
            "latitude": {"bin_width": 5.0, "bin_offset": 0.0},
        },
        read_key="input",
        store_key="output",
    )

    # test get_histograms() function call
    current_hists = spark_filler.get_histograms(spark_df)
    # current_hists = make_histograms(spark_df, features, bin_specs)

    assert current_hists["age"].toJson() == pytest.age
    assert current_hists["company"].toJson() == pytest.company
    assert current_hists["eyeColor"].toJson() == pytest.eyesColor
    assert current_hists["gender"].toJson() == pytest.gender
    assert current_hists["latitude"].toJson() == pytest.latitude
    assert current_hists["longitude"].toJson() == pytest.longitude
    assert current_hists["transaction"].toJson() == pytest.transaction
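
# A minimal sketch of how the bin_specs above are assumed to map onto
# histogrammar primitives; the lambda quantities and the helper itself are
# illustrative only, not part of the tested API.
def _bin_spec_sketch():
    import histogrammar as hg

    # "transaction": {"num": 100, "low": -2000, "high": 2000}
    # -> fixed-range binning with 100 bins plus under/overflow
    fixed = hg.Bin(num=100, low=-2000, high=2000,
                   quantity=lambda row: row["transaction"])

    # "longitude"/"latitude": {"bin_width": 5.0, "bin_offset": 0.0}
    # -> open-ended sparse binning, 5-degree bins anchored at offset 0.0
    sparse = hg.SparselyBin(binWidth=5.0, origin=0.0,
                            quantity=lambda row: row["longitude"])
    return fixed, sparse
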
def test_get_histograms_timestamp(spark_co):
    from pyspark.sql.functions import to_timestamp

    spark = spark_co

    data_date = [
        "2018-12-10 00:00:00",
        "2018-12-10 00:00:00",
        "2018-12-10 00:00:00",
        "2018-12-10 00:00:00",
        "2018-12-10 00:00:00",
        "2018-12-17 00:00:00",
        "2018-12-17 00:00:00",
        "2018-12-17 00:00:00",
        "2018-12-17 00:00:00",
        "2018-12-19 00:00:00",
    ]

    df = pd.DataFrame(data_date, columns=["dt"])
    sdf = spark.createDataFrame(df).withColumn(
        "dt", to_timestamp("dt", "yyyy-MM-dd HH:mm:ss")
    )

    # sparse binning of the time axis, in nanoseconds since the epoch:
    # binWidth is 30 days in ns, origin is 2010-01-04 00:00:00 UTC
    expected = {
        "data": {
            "binWidth": 2592000000000000.0,
            "bins": {"108": 9.0, "109": 1.0},
            "bins:type": "Count",
            "entries": 10.0,
            "name": "b'dt'",
            "nanflow": 0.0,
            "nanflow:type": "Count",
            "origin": 1.2625632e18,
        },
        "type": "SparselyBin",
        "version": "1.0",
    }

    filler = SparkHistogrammar(features=["dt"])
    current_hists = filler.get_histograms(sdf)
    assert current_hists["dt"].toJson() == expected
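
# A sketch that rederives the expected sparse-bin indices by hand, assuming
# SparselyBin assigns a value v to index floor((v - origin) / binWidth) and
# timestamps are encoded as nanoseconds since the epoch, as in `expected`
# above. The helper and test below are illustrative additions.
import datetime as dt


def _sparse_bin_index(ts, bin_width_ns=2_592_000_000_000_000,
                      origin_ns=1.2625632e18):
    ns = ts.replace(tzinfo=dt.timezone.utc).timestamp() * 10**9
    return int((ns - origin_ns) // bin_width_ns)


def test_timestamp_bin_index_arithmetic():
    assert _sparse_bin_index(dt.datetime(2018, 12, 10)) == 108  # 5 rows
    assert _sparse_bin_index(dt.datetime(2018, 12, 17)) == 108  # 4 rows
    assert _sparse_bin_index(dt.datetime(2018, 12, 19)) == 109  # 1 row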