def test_operator_eq(self):
    """Exercise ``rule`` with conjunctions of ``eq`` conditions.

    The frame has 100 rows: ``c1`` holds ``chr(i)``, ``c2`` holds ``i`` and
    ``c3`` holds ``float(i)``.  Each case compares the first element of
    ``rule(conditions, frame)`` with the expected score.

    NOTE(review): the expected values are consistent with a score equal to
    the percentage of rows matching every condition (one row of 100 -> 1.0);
    confirm against the implementation of ``rule``.
    """
    def eq(column, value):
        # Build one equality condition in the dict layout passed to ``rule``.
        return {"column": column, "operator": "eq", "value": value}

    frame = pd.DataFrame()
    frame["c1"] = list(map(chr, range(100)))
    frame["c2"] = list(range(100))
    frame["c3"] = list(map(float, range(100)))

    cases = [
        # The same condition given twice still selects row 50 only.
        ([eq("c1", chr(50)), eq("c1", chr(50))], 1.0),
        # Two columns agreeing on row 50.
        ([eq("c1", chr(50)), eq("c2", 50)], 1.0),
        # Three columns agreeing on row 50.
        ([eq("c1", chr(50)), eq("c2", 50), eq("c3", 50.0)], 1.0),
        # The third condition points at a different row, so nothing matches.
        ([eq("c1", chr(50)), eq("c2", 50), eq("c3", 51.0)], 0.0),
    ]
    for conditions, expected in cases:
        score = rule(conditions, frame)[0]
        self.assertEqual(score, expected)
def test_operators_mixed(self):
    """Exercise ``rule`` with ``eq``, ``lt`` and ``gt`` conditions combined.

    The frame has 100 rows: ``c1`` holds ``chr(i)``, ``c2`` holds ``i`` and
    ``c3`` holds ``float(i)``.  Each case compares the first element of
    ``rule(conditions, frame)`` with the expected score.

    NOTE(review): the expected values are consistent with a score equal to
    the percentage of rows matching every condition; confirm against the
    implementation of ``rule``.
    """
    def cond(column, operator, value):
        # Build one condition in the dict layout passed to ``rule``.
        return {"column": column, "operator": operator, "value": value}

    def stacked_bounds(eighth):
        # Ten conditions over c2 and c3; only the eighth entry differs
        # between the two cases that use this list.
        return [
            cond("c2", "gt", 10),
            cond("c2", "lt", 100),
            cond("c2", "gt", 0),
            cond("c2", "lt", 50),
            cond("c2", "lt", 40),
            cond("c3", "gt", 20.0),
            cond("c3", "lt", 100),
            eighth,
            cond("c3", "lt", 25),
            cond("c3", "lt", 23),
        ]

    frame = pd.DataFrame()
    frame["c1"] = list(map(chr, range(100)))
    frame["c2"] = list(range(100))
    frame["c3"] = list(map(float, range(100)))

    cases = [
        # Only row 50 has c1 == chr(50); both upper bounds cover every row.
        (
            [
                cond("c1", "eq", chr(50)),
                cond("c2", "lt", 100.0),
                cond("c3", "lt", 100.0),
            ],
            1.0,
        ),
        # chr(100) is not among the c1 values, so nothing matches.
        (
            [
                cond("c1", "eq", chr(100)),
                cond("c2", "lt", 10),
                cond("c3", "lt", 10.0),
            ],
            0.0,
        ),
        # Rows 23..30 satisfy both bounds.
        ([cond("c2", "gt", 22), cond("c3", "lt", 31.0)], 8.0),
        # The stacked bounds leave rows 21 and 22.
        (stacked_bounds(cond("c3", "gt", 0)), 2.0),
        # c3 never equals -1, so the whole conjunction is empty.
        (stacked_bounds(cond("c3", "eq", -1)), 0.0),
    ]
    for conditions, expected in cases:
        score = rule(conditions, frame)[0]
        self.assertEqual(score, expected)
def test_empty(self):
    """Check the score ``rule`` reports for a frame that has no rows.

    Two empty columns are created and two bounds on ``c1`` are evaluated;
    the first element of the result is expected to be ``100.0``.
    """
    frame = pd.DataFrame()
    # Columns are assigned one at a time onto the empty frame.
    for name in ("c1", "c2"):
        frame[name] = []

    conditions = [
        {"column": "c1", "operator": "lt", "value": 1000},
        {"column": "c1", "operator": "gt", "value": 0},
    ]
    score = rule(conditions, frame)[0]
    self.assertEqual(score, 100.0)
def test_operator_lt(self):
    """Exercise ``rule`` with conjunctions of ``lt`` conditions.

    The frame has 100 rows: ``c1`` holds ``chr(i)``, ``c2`` holds ``i`` and
    ``c3`` holds ``float(i)``.  Every case is a list of ``(column, bound)``
    pairs that is expanded into ``lt`` condition dicts before ``rule`` is
    called; the first element of the result is compared with the expected
    score.

    NOTE(review): the expected values are consistent with a score equal to
    the percentage of rows matching every condition; confirm against the
    implementation of ``rule``.
    """
    frame = pd.DataFrame()
    frame["c1"] = list(map(chr, range(100)))
    frame["c2"] = list(range(100))
    frame["c3"] = list(map(float, range(100)))

    cases = [
        # No value of c2 is below 0.
        ([("c2", 0)], 0.0),
        # The tightest bound is c2 < 49.
        ([("c2", 50), ("c2", 49), ("c3", 100.0)], 49.0),
        # The tightest bound is 10 on both columns.
        ([("c2", 500), ("c2", 10), ("c3", 10.0)], 10.0),
        # c3 < 0 rules out every row.
        ([("c2", 21), ("c2", 22), ("c3", 0.0)], 0.0),
        # Every row is below 100.
        ([("c2", 100), ("c2", 100), ("c3", 100.0)], 100.0),
    ]
    for bounds, expected in cases:
        conditions = [
            {"column": column, "operator": "lt", "value": bound}
            for column, bound in bounds
        ]
        score = rule(conditions, frame)[0]
        self.assertEqual(score, expected)
def test_operators_mixed_and_nulls_inconditions(self):
    """Exercise ``rule`` when the tested columns contain missing values.

    The frame has 100 rows.  Rows 0-9 hold ``None`` in ``c1`` and ``c2`` and
    NaN in ``c3``; rows 10-99 hold ``chr(i)``, ``i`` and ``float(i)``.  Each
    case compares the first element of ``rule(conditions, frame)`` with the
    expected score.

    NOTE(review): the expected values are consistent with a score equal to
    the percentage of all 100 rows matching every condition, with the ten
    null rows never matching (``gt -1`` gives 90.0); confirm against the
    implementation of ``rule``.
    """
    def cond(column, operator, value):
        # Build one condition in the dict layout passed to ``rule``.
        return {"column": column, "operator": operator, "value": value}

    c1 = [chr(i) for i in range(100)]
    c2 = list(range(100))
    c3 = [float(i) for i in range(100)]
    for i in range(10):
        c1[i] = None
        c2[i] = None
        # ``np.nan`` is the supported spelling; the ``np.NaN`` alias was
        # removed in NumPy 2.0 and raises AttributeError there.
        c3[i] = np.nan

    df = pd.DataFrame()
    df["c1"] = c1
    df["c2"] = c2
    df["c3"] = c3

    cases = [
        # Row 10 is the first non-null row and the only one equal to 10.0.
        ([cond("c3", "eq", 10.0)], 1.0),
        # Every non-null row is above -1.
        (
            [
                cond("c2", "gt", -1),
                cond("c2", "gt", -1),
                cond("c3", "gt", -1),
            ],
            90.0,
        ),
        # Rows 51..99.
        (
            [
                cond("c2", "gt", -1),
                cond("c2", "gt", 50),
                cond("c3", "gt", 50),
            ],
            49.0,
        ),
        # Rows 51..99 again; the c3 bound is the looser one.
        (
            [
                cond("c2", "lt", 100),
                cond("c2", "gt", 50),
                cond("c3", "gt", 20),
            ],
            49.0,
        ),
        # Rows 10..19: the rows below 10 are null.
        ([cond("c2", "lt", 100), cond("c3", "lt", 20)], 10.0),
        # Only the null rows sit below 10, so nothing matches.
        ([cond("c2", "lt", 100), cond("c3", "lt", 10)], 0.0),
    ]
    for conditions, expected in cases:
        r = rule(conditions, df)[0]
        self.assertEqual(r, expected)
def test_groping_multile_columns(self):
    """Run several metrics in a single ``Task`` and check every score.

    A 13-row frame with five columns is built, nine metrics are added to
    one ``Task`` in a fixed order, and ``task.run(frame)`` is called once.
    The ``"scores"`` entry of each result is then compared with the
    expected value, in the order the metrics were added.
    """
    columns = {
        "c1": [0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5],
        "c2": ["a", "a", "b", "b", "c", "c", "d", "d", "d", "d", "a", "a", "a"],
        "c3": [0.0, 0.0, 0.1, 0.1, 2.2, 2.2, 2.2, 3.1, 3.2, 3.3, 40, 40, 50],
        # Alternating 10.0 / 20.0, thirteen values in total.
        "c4": [10.0, 20.0] * 6 + [10.0],
        # Ten rows with one timestamp followed by three with another.
        "c5": ["09:10:10"] * 10 + ["00:11:10"] * 3,
    }
    frame = pd.DataFrame()
    for name, values in columns.items():
        frame[name] = values

    task = Task()
    task.add(completeness())
    task.add(completeness([0, 1, 2]))
    task.add(deduplication([0, 1]))
    task.add(deduplication())
    # NOTE(review): the format string reads seconds:minutes:hours; this
    # looks intentional for the expected 10/13 score below - confirm.
    task.add(timeliness(["c5"], value="10:10:10", timeFormat="%S:%M:%H"))
    task.add(completeness())

    task.add(rule([
        {"column": "c3", "operator": "lt", "value": 50},
        {"column": "c3", "operator": "gt", "value": 1.0},
    ]))

    task.add(rule([
        {"column": "c5", "operator": "eq", "value": "00:11:10"},
    ]))

    group_conditions = [
        {"column": "c3", "operator": "lt", "value": 50},
        {"column": "c3", "operator": "gt", "value": 1.0},
    ]
    group_havings = [
        {"column": "*", "operator": "gt", "value": 1, "aggregator": "count"},
        {"column": "c4", "operator": "eq", "value": 50 / 3, "aggregator": "avg"},
    ]
    task.add(grouprule([0, "c2"], group_havings, group_conditions))

    result = task.run(frame)

    # First completeness metric (no arguments).
    score = result[0]["scores"][0]
    self.assertEqual(score, 100.0)

    # Completeness restricted to columns 0, 1 and 2: one score per column.
    first, second, third = result[1]["scores"]
    self.assertEqual(first, 100.0)
    self.assertEqual(second, 100.0)
    self.assertEqual(third, 100.0)

    # Deduplication on columns 0 and 1: one score per column.
    first, second = result[2]["scores"]
    self.assertEqual(first, (6 / 13) * 100)
    self.assertEqual(second, (4 / 13) * 100)

    # Deduplication with no arguments.
    score = result[3]["scores"][0]
    self.assertEqual(score, 100.0)

    # Timeliness on c5.
    score = result[4]["scores"][0]
    self.assertEqual(score, (10 / 13) * 100)

    # Second completeness metric (no arguments).
    score = result[5]["scores"][0]
    self.assertEqual(score, 100.0)

    # Rule: 1.0 < c3 < 50.
    score = result[6]["scores"][0]
    self.assertEqual(score, (8 / 13) * 100)

    # Rule: c5 == "00:11:10".
    score = result[7]["scores"][0]
    self.assertEqual(score, (3 / 13) * 100)

    # Group rule over column 0 and "c2".
    score = result[8]["scores"][0]
    self.assertEqual(score, 25.0)