Пример #1
0
    def test_allequal(self):
        """Entropy must be exactly 0 for columns holding one repeated value."""
        rows = 100
        frame = pd.DataFrame({
            "c1": [chr(0)] * rows,
            "c2": [1] * rows,
            "c3": [0.7] * rows,
        })
        df = self.spark.createDataFrame(frame)

        # Same expectation for the string, integer and float column,
        # addressed by positional index.
        for column in range(3):
            score = entropy(column, df)[0]
            self.assertEqual(score, 0.)
Пример #2
0
    def test_allnull(self):
        """Entropy is expected to be exactly 0 after every column is nulled.

        The ``replace_every_*_with_null`` helpers are defined outside this
        block; judging by their names they turn every value of a column
        into null (TODO confirm against their definitions).
        """
        frame = pd.DataFrame({
            "c1": [chr(code) for code in range(100)],
            "c2": list(range(100)),
            "c3": [n / 0.7 for n in range(100)],
        })
        df = self.spark.createDataFrame(frame)

        # Each column is rewritten in turn, always reading from the
        # DataFrame produced by the previous replacement.
        nullifiers = (
            ("c1", replace_every_string_with_null),
            ("c2", replace_every_int_with_null),
            ("c3", replace_every_float_with_null),
        )
        for name, nullify in nullifiers:
            df = df.withColumn(name, nullify(df[name]))

        for column in range(3):
            score = entropy(column, df)[0]
            self.assertEqual(score, 0.)
Пример #3
0
    def test_halfhalf(self):
        """Two values, each filling half of a column, must give entropy 1.

        The expected score of 1 is consistent with a base-2 entropy of a
        50/50 split; it is checked within an absolute tolerance.
        """
        half = 50
        tolerance = 0.000001
        frame = pd.DataFrame({
            "c1": [chr(1)] * half + ["zz"] * half,
            "c2": [2] * half + [100] * half,
            "c3": [0.7] * half + [32.] * half,
        })
        df = self.spark.createDataFrame(frame)

        for column in range(3):
            score = entropy(column, df)[0]
            self.assertAlmostEqual(score, 1., delta=tolerance)
Пример #4
0
    def test_empty(self):
        """Entropy of the first column of an empty DataFrame must be 0."""
        data = pd.DataFrame()
        for name in ("c1", "c2"):
            data[name] = []

        # An explicit schema is passed along with the row-less frame.
        fields = [
            StructField("c1", IntegerType(), True),
            StructField("c2", StringType(), True),
        ]
        df = self.spark.createDataFrame(data, StructType(fields))

        # Only column 0 is checked by this test.
        score = entropy(0, df)[0]
        self.assertEqual(score, 0.)
Пример #5
0
    def test_halfnull_halfequal(self):
        """Half repeated value, half placeholder: entropy is expected to be 0.

        The second half of each column holds a placeholder ("" / 0 / 0.)
        that is passed through the ``replace_*_with_null`` helpers. Those
        helpers are defined outside this block; presumably they map the
        placeholder to null (TODO confirm against their definitions).
        """
        half = 50
        tolerance = 0.000001
        frame = pd.DataFrame({
            "c1": [chr(1)] * half + [""] * half,
            "c2": [2] * half + [0] * half,
            "c3": [0.7] * half + [0.] * half,
        })
        df = self.spark.createDataFrame(frame)

        replacements = (
            ("c1", replace_empty_with_null),
            ("c2", replace_0_with_null),
            ("c3", replace_0dot_with_null),
        )
        for name, nullify in replacements:
            df = df.withColumn(name, nullify(df[name]))

        for column in range(3):
            score = entropy(column, df)[0]
            self.assertAlmostEqual(score, 0., delta=tolerance)
Пример #6
0
    def test_mixed(self):
        """Check entropy on columns mixing unique and repeated values.

        Each column holds 10 distinct values plus one value repeated 20
        times. The same expected score is asserted a second time after 5
        placeholder entries are appended and passed through the
        ``replace_*_with_null`` helpers (defined outside this block;
        presumably they turn the placeholders into nulls -- TODO confirm).
        """
        expected = 2.025605199016944
        tolerance = 0.000001

        strings = [chr(code) for code in range(10)] + ["swwewww"] * 20
        integers = list(range(1, 11)) + [5000] * 20
        floats = [n / 0.7 for n in range(1, 11)] + [231321.23131] * 20

        frame = pd.DataFrame({"c1": strings, "c2": integers, "c3": floats})
        df = self.spark.createDataFrame(frame)

        for column in range(3):
            score = entropy(column, df)[0]
            self.assertAlmostEqual(score, expected, delta=tolerance)

        # Second round: same data followed by 5 placeholder entries
        # ("" / 0 / 0.) that go through the replacement helpers.
        frame = pd.DataFrame({
            "c1": strings + [""] * 5,
            "c2": integers + [0] * 5,
            "c3": floats + [0.] * 5,
        })
        df = self.spark.createDataFrame(frame)

        replacements = (
            ("c1", replace_empty_with_null),
            ("c2", replace_0_with_null),
            ("c3", replace_0dot_with_null),
        )
        for name, nullify in replacements:
            df = df.withColumn(name, nullify(df[name]))

        # The expected score is the same as in the first round.
        for column in range(3):
            score = entropy(column, df)[0]
            self.assertAlmostEqual(score, expected, delta=tolerance)
Пример #7
0
    def test_alldifferent(self):
        """Check entropy on columns whose 100 values are all distinct.

        The first expected score is approximately log2(100). In the second
        round the first 10 entries of every column are overwritten with a
        placeholder ("" / 0 / 0.) and passed through the
        ``replace_*_with_null`` helpers (defined outside this block;
        presumably they map the placeholder to null -- TODO confirm); the
        expected score then is approximately log2(90).
        """
        tolerance = 0.000001

        strings = [chr(code) for code in range(100)]
        integers = list(range(100))
        floats = [n / 0.7 for n in range(100)]

        frame = pd.DataFrame({"c1": strings, "c2": integers, "c3": floats})
        df = self.spark.createDataFrame(frame)

        expected = 6.6438561897747395
        for column in range(3):
            score = entropy(column, df)[0]
            self.assertAlmostEqual(score, expected, delta=tolerance)

        # Overwrite the leading 10 entries with the placeholder values.
        strings[:10] = [""] * 10
        integers[:10] = [0] * 10
        floats[:10] = [0.] * 10

        frame = pd.DataFrame({"c1": strings, "c2": integers, "c3": floats})
        df = self.spark.createDataFrame(frame)

        replacements = (
            ("c1", replace_empty_with_null),
            ("c2", replace_0_with_null),
            ("c3", replace_0dot_with_null),
        )
        for name, nullify in replacements:
            df = df.withColumn(name, nullify(df[name]))

        expected = 6.491853096329675
        for column in range(3):
            score = entropy(column, df)[0]
            self.assertAlmostEqual(score, expected, delta=tolerance)
Пример #8
0
#!/usr/bin/python3
from pyspark.sql import SparkSession

from haychecker.dhc.metrics import entropy

# Obtain (or reuse) a Spark session named after this example.
session = SparkSession.builder.appName("entropy_example").getOrCreate()

# Read the employees CSV with the "header" option enabled.
csv_reader = session.read.format("csv").option("header", "true")
employees = csv_reader.load("examples/resources/employees.csv")

employees.show()

# Direct form: pass the DataFrame and take the first element of the result.
first_name_entropy = entropy("firstName", employees)[0]

print("Entropy firstName: {}".format(first_name_entropy))

# Deferred form: without a DataFrame, entropy() returns an object that can
# be combined with another one via add() and executed later with run().
first_name_task = entropy("firstName")
salary_task = entropy("salary")
combined_task = first_name_task.add(salary_task)

outcome = combined_task.run(employees)

# One entry per combined task, in the order they were added; each entry
# exposes its values under the "scores" key.
first_name_score = outcome[0]["scores"][0]
salary_score = outcome[1]["scores"][0]

print("Entropy firstName: {}, entropy salary: {}".format(
    first_name_score, salary_score))