def testSimple(self):
    """Smoke-test basic PanDat mechanics on the netflow schema.

    Covers: good_pan_dat_object checks (missing attribute, missing column),
    copy_pan_dat, _same_data sensitivity to data edits, and a '*'-schema
    (generic table) factory agreeing with the typed factory.
    """
    if not self.canRun:
        return
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # concat([df, other]) is the documented drop-in equivalent.
    from pandas import concat
    pdf = PanDatFactory(**netflowSchema())
    _dat = netflowPandasData()
    dat = pdf.PanDat(**{t: getattr(_dat, t) for t in pdf.all_tables})
    self.assertTrue(pdf.good_pan_dat_object(dat))
    dat2 = pdf.copy_pan_dat(dat)
    self.assertTrue(pdf._same_data(dat, dat2))
    self.assertTrue(pdf.good_pan_dat_object(dat2))
    # Removing a table attribute must be detected, with a specific message.
    delattr(dat2, "nodes")
    msg = []
    self.assertFalse(pdf.good_pan_dat_object(dat2, msg.append))
    self.assertTrue(msg[-1] == "nodes not an attribute.")
    # Dropping a required column must also be detected.
    dat3 = pdf.copy_pan_dat(dat)
    dat3.cost.drop("commodity", axis=1, inplace=True)
    self.assertFalse(pdf.good_pan_dat_object(dat3, msg.append))
    self.assertTrue("The following are (table, field) pairs missing from the data" in msg[-1])
    # A pure data change (no structural change) must break _same_data.
    dat4 = pdf.copy_pan_dat(dat)
    dat4.cost["cost"] += 1
    self.assertFalse(pdf._same_data(dat, dat4))
    # A generic ('*') schema factory should agree with the typed one.
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    dat5 = pdf2.copy_pan_dat(dat)
    self.assertTrue(pdf._same_data(dat, dat5))
    self.assertTrue(pdf2._same_data(dat, dat5))
    # Duplicating rows changes the data for both factories.
    dat.commodities = concat([dat.commodities, dat.commodities[dat.commodities["name"] == "Pencils"]])
    dat.arcs = concat([dat.arcs, dat.arcs[dat.arcs["destination"] == "Boston"]])
    self.assertFalse(pdf2._same_data(dat, dat5))
    self.assertFalse(pdf._same_data(dat, dat5))
def testFindDups(self):
    """Exercise PanDatFactory.find_duplicates in both its table and
    boolean-Series (as_table=False) forms, with keep defaulted and keep=False.
    """
    pan_factory = PanDatFactory(**sillyMeSchema())
    # Mirror the silly schema with every field treated as a data field, so
    # TicDat accepts duplicated "primary key" rows without collapsing them.
    tic_factory = TicDatFactory(
        **{name: [[], list(pks) + list(dfs)] for name, (pks, dfs) in sillyMeSchema().items()})

    def build_pan_dat(row_tuples):
        # Populate every table with the same rows and round-trip into a PanDat.
        tic_dat = tic_factory.TicDat(**{t: row_tuples for t in tic_factory.all_tables})
        return pan_factory.copy_pan_dat(copy_to_pandas_with_reset(tic_factory, tic_dat))

    pan_dat = build_pan_dat([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
    found = pan_factory.find_duplicates(pan_dat)
    self.assertTrue(set(found) == {'a'} and set(found['a']['aField']) == {1})
    found = pan_factory.find_duplicates(pan_dat, as_table=False, keep=False)
    self.assertTrue(set(found) == {'a'} and found['a'].value_counts()[True] == 2)
    found = pan_factory.find_duplicates(pan_dat, as_table=False)
    self.assertTrue(set(found) == {'a'} and found['a'].value_counts()[True] == 1)

    # A fourth row creates duplicates in table 'b' as well.
    pan_dat = build_pan_dat([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 2, 3, 40)])
    found = pan_factory.find_duplicates(pan_dat, keep=False)
    self.assertTrue(set(found) == {'a', 'b'} and set(found['a']['aField']) == {1})
    found = pan_factory.find_duplicates(pan_dat, as_table=False, keep=False)
    self.assertTrue({k: v.value_counts()[True] for k, v in found.items()} == {'a': 3, 'b': 2})
def testXToManyTwo(self):
    """Foreign keys from three differently-keyed child tables to one parent.

    First pass links on all three fields (yielding one-to-one and many-to-one
    cardinalities); second pass links on F1/F3 only. Both passes verify that
    find_foreign_key_failures flags injected orphan rows and that
    remove_foreign_key_failures restores the original data exactly.
    """
    input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                 child_one=[["F1", "F2", "F3"], []],
                                 child_two=[["F1", "F2"], ["F3"]],
                                 child_three=[[], ["F1", "F2", "F3"]])
    for t in ["child_one", "child_two", "child_three"]:
        input_schema.add_foreign_key(t, "parent", [["F1"] * 2, ["F2"] * 2, ["F3"] * 2])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} ==
                    {"one-to-one", "many-to-one"})
    rows = [[1, 2, 3], [1, 2.1, 3], [4, 5, 6], [4, 5.1, 6], [7, 8, 9]]
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(parent=rows, child_one=rows, child_two=rows, child_three=rows)
    self.assertTrue(all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    # Inject one orphan row into each child table.
    dat.child_one[1, 2, 4] = {}
    dat.child_two[1, 2.2] = 3
    dat.child_three.append([1, 2, 4])
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    self.assertTrue(len(fk_fails) == 3)
    # Removing the failures should bring us back to the original data.
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
    # Second pass: same tables, but the foreign keys link only F1 and F3.
    input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                 child_one=[["F1", "F2", "F3"], []],
                                 child_two=[["F1", "F2"], ["F3"]],
                                 child_three=[[], ["F1", "F2", "F3"]])
    for t in ["child_one", "child_two", "child_three"]:
        input_schema.add_foreign_key(t, "parent", [["F1"] * 2, ["F3"] * 2])
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(parent=rows, child_one=rows, child_two=rows, child_three=rows)
    self.assertTrue(all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    # Orphans again; note child_two's data field is 4 here (F3=4 has no parent).
    dat.child_one[1, 2, 4] = {}
    dat.child_two[1, 2.2] = 4
    dat.child_three.append([1, 2, 4])
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertTrue(len(input_schema.find_foreign_key_failures(new_pan_dat)) == 3)
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
def testDataTypes(self):
    """Exercise set_data_type / find_data_type_failures / replace_data_type_failures.

    Diet schema: null cost and out-of-range qty values must be flagged (as
    tables and as boolean Series) and replaceable, with an explicit replacement
    value for one (table, field) pair. Netflow schema: strings_allowed controls
    whether string capacities are failures, and replacement zeroes the bad one.
    """
    if not self.canRun:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticdat = tdf.TicDat()
    ticdat.foods["a"] = 12
    ticdat.foods["b"] = None  # null cost: becomes a failure once nullable=False
    ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
    ticdat.categories["2"] = [10,20]
    for f, p in itertools.product(ticdat.foods, ticdat.categories):
        ticdat.nutritionQuantities[f,p] = 5
    ticdat.nutritionQuantities['a', 2] = 12
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    # With no data types declared, nothing fails and "replacing" is a no-op.
    self.assertFalse(pdf.find_data_type_failures(pandat))
    pandat_copy = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat))
    self.assertTrue(pdf._same_data(pandat, pandat_copy, epsilon=0.00001))
    pdf = PanDatFactory(**dietSchema())
    pdf.set_data_type("foods", "cost", nullable=False)
    # qty must satisfy 5 < qty <= 12, so the qty=5 rows all fail.
    pdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False, max=12,
                      inclusive_max=True)
    failed = pdf.find_data_type_failures(pandat)
    self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
    self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
    self.assertTrue(set({(v["food"], v["category"])
                         for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                    {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
    # as_table=False returns boolean Series instead of row sub-tables.
    failed = pdf.find_data_type_failures(pandat, as_table=False)
    self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
    # Explicit replacement value for one (table, field); cost falls back to default.
    fixed = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat),
                                           {("nutritionQuantities", "qty"): 5.15})
    self.assertTrue(set(fixed.foods["cost"]) == {0.0, 12.0})
    self.assertTrue(set(fixed.nutritionQuantities["qty"]) == {5.15, 12.0})
    # Netflow: inject string capacities (node names) into the arcs table.
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    pdf = PanDatFactory(**netflowSchema())
    ticdat = tdf.copy_tic_dat(netflowData())
    for n in ticdat.nodes["Detroit"].arcs_source:
        ticdat.arcs["Detroit", n] = n
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_data_type_failures(pandat))
    pdf = PanDatFactory(**netflowSchema())
    pdf.set_data_type("arcs", "capacity", strings_allowed="*")
    self.assertFalse(pdf.find_data_type_failures(pandat))
    # Whitelist excludes "New York", so that one arc fails.
    pdf = PanDatFactory(**netflowSchema())
    pdf.set_data_type("arcs", "capacity", strings_allowed=["Boston", "Seattle", "lumberjack"])
    failed = pdf.find_data_type_failures(pandat)
    self.assertTrue(set(failed) == {('arcs', 'capacity')})
    self.assertTrue(set({(v["source"], v["destination"])
                         for v in failed['arcs', 'capacity'].T.to_dict().values()}) ==
                    {("Detroit", "New York")})
    pdf.replace_data_type_failures(pandat)
    self.assertTrue(set(pandat.arcs["capacity"]) == {120, 'Boston', 0, 'Seattle'})
def testDataTypes_two(self):
    """Data-type failures on primary-key fields (null food names).

    With pk columns kept in the DataFrames, null names are type failures by
    default; replacing them via set_default_value creates duplicate keys that
    find_duplicates must then report. Also checks the failure rows themselves
    via itertuples (nulls normalized to None for comparison).
    """
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**tdf.schema())
    def makeIt():
        # Deliberately insert a None primary key into foods (and thereby into
        # nutritionQuantities via the product loop below).
        rtn = tdf.TicDat()
        rtn.foods["a"] = 12
        rtn.foods["b"] = None
        rtn.foods[None] = 101
        rtn.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
        rtn.categories["2"] = [10, 20]
        for f, p in itertools.product(rtn.foods, rtn.categories):
            rtn.nutritionQuantities[f, p] = 5
        rtn.nutritionQuantities['a', 2] = 12
        return tdf.copy_to_pandas(rtn, drop_pk_columns=False)
    dat = makeIt()
    errs = pdf.find_data_type_failures(dat)
    self.assertTrue(len(errs) == 2 and not pdf.find_duplicates(dat))
    dat_copied = pdf.copy_pan_dat(dat)
    # Default replacement leaves the data unchanged (within epsilon).
    pdf.replace_data_type_failures(dat)
    self.assertTrue(pdf._same_data(dat, dat_copied, epsilon=0.00001))
    # Replacing the null names with "a" fixes type failures but creates dups.
    pdf2 = pdf.clone()
    pdf2.set_default_value("foods", "name", "a")
    pdf2.set_default_value("nutritionQuantities", "food", "a")
    pdf2.replace_data_type_failures(dat_copied)
    self.assertFalse(pdf._same_data(dat, dat_copied, epsilon=0.00001))
    self.assertFalse(pdf.find_data_type_failures(dat_copied))
    dups = pdf.find_duplicates(dat_copied)
    self.assertTrue(len(dups) == 2 and len(dups["foods"]) == 1 and
                    len(dups["nutritionQuantities"]) == 2)
    from pandas import isnull
    def noneify(iter_of_tuples):
        # Normalize pandas nulls (NaN/NaT) to None so tuples compare cleanly.
        return {tuple(None if isnull(_) else _ for _ in tuple_) for tuple_ in iter_of_tuples}
    self.assertTrue(noneify(errs['nutritionQuantities', 'food'].itertuples(index=False)) ==
                    {(None, "1", 5), (None, "2", 5)})
    self.assertTrue(noneify(errs['foods', 'name'].itertuples(index=False)) == {(None, 101)})
    # With nullable name fields the nulls are fine; a non-nullable cost is not.
    pdf = PanDatFactory(**tdf.schema())
    pdf.set_data_type("foods", "name", nullable=True, strings_allowed='*')
    pdf.set_data_type("nutritionQuantities", "food", nullable=True, strings_allowed='*')
    self.assertFalse(pdf.find_data_type_failures(dat))
    pdf.set_data_type("foods", "cost", nullable=False)
    errs = pdf.find_data_type_failures(dat)
    self.assertTrue(len(errs) == 1)
    self.assertTrue(noneify(errs['foods', 'cost'].itertuples(index=False)) == {('b', None)})
def testXToMany(self):
    """Many-to-many and one-to-many foreign keys.

    Little-league schema: position_constraints links to roster/positions/innings
    on non-pk fields (many-to-many). Verifies failure counts across the three
    verbosity/as_table forms agree, and that remove_foreign_key_failures
    restores the original data. Then a minimal one-to-many case.
    """
    input_schema = PanDatFactory (roster = [["Name"],
                                            ["Grade", "Arrival Inning", "Departure Inning",
                                             "Min Innings Played", "Max Innings Played"]],
                                  positions = [["Position"],
                                               ["Position Importance", "Position Group",
                                                "Consecutive Innings Only"]],
                                  innings = [["Inning"], ["Inning Group"]],
                                  position_constraints = [["Position Group", "Inning Group", "Grade"],
                                                          ["Min Players", "Max Players"]])
    input_schema.add_foreign_key("position_constraints", "roster", ["Grade", "Grade"])
    input_schema.add_foreign_key("position_constraints", "positions",
                                 ["Position Group", "Position Group"])
    input_schema.add_foreign_key("position_constraints", "innings",
                                 ["Inning Group", "Inning Group"])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} == {"many-to-many"})
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat()
    for i,p in enumerate(["bob", "joe", "fred", "alice", "lisa", "joean", "ginny"]):
        dat.roster[p]["Grade"] = (i%3)+1
    dat.roster["dummy"]["Grade"] = "whatevers"
    for i,p in enumerate(["pitcher", "catcher", "1b", "2b", "ss", "3b", "lf", "cf", "rf"]):
        dat.positions[p]["Position Group"] = "PG %s"%((i%4)+1)
    for i in range(1, 10):
        dat.innings[i]["Inning Group"] = "before stretch" if i < 7 else "after stretch"
    dat.innings[0] ={}
    # Every (group, inning-group, grade) combination has a valid parent row.
    for pg, ig, g in itertools.product(["PG %s"%i for i in range(1,5)],
                                       ["before stretch", "after stretch"], [1, 2, 3]):
        dat.position_constraints[pg, ig, g] = {}
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    # Two bad rows: one fails all three FKs, one fails only innings+positions.
    dat.position_constraints["no", "no", "no"] = dat.position_constraints[1, 2, 3] = {}
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema._same_data(orig_pan_dat, new_pan_dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    fk_fails_2 = input_schema.find_foreign_key_failures(new_pan_dat, verbosity="Low")
    fk_fails_3 = input_schema.find_foreign_key_failures(new_pan_dat, verbosity="Low",
                                                        as_table=False)
    # All three result forms must report the same per-FK failure counts.
    self.assertTrue({tuple(k)[:2] + (tuple(k[2]),): len(v) for k,v in fk_fails.items()} ==
                    {k:len(v) for k,v in fk_fails_2.items()} ==
                    {k:v.count(True) for k,v in fk_fails_3.items()} ==
                    {('position_constraints', 'innings', ("Inning Group", "Inning Group")): 2,
                     ('position_constraints', 'positions', ("Position Group", "Position Group")): 2,
                     ('position_constraints', 'roster', ("Grade", "Grade")): 1})
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
    # Minimal one-to-many case: compound pk parent, single-field child link.
    input_schema = PanDatFactory(table_one=[["One", "Two"], []],
                                 table_two=[["One"], ["Two"]])
    input_schema.add_foreign_key("table_two", "table_one", ["One", "One"])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} == {"one-to-many"})
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(table_one = [[1,2], [3,4], [5,6], [7,8]], table_two = {1:2, 3:4, 5:6})
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    dat.table_two[9]=10
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    self.assertTrue({tuple(k)[:2]:len(v) for k,v in fk_fails.items()} ==
                    {('table_two', 'table_one'): 1})
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
def testDataPredicates(self):
    # NOTE(review): this method NEVER RUNS — a second `testDataPredicates`
    # is defined later in this class and shadows this one. The later version
    # appears to be an evolved superset of this test (it handles the TypeError
    # that `find_data_row_failures(pandat_2)` now raises), so simply renaming
    # this one to revive it would likely produce a failing test. Recommend
    # deleting this dead definition after confirming the later one covers it.
    """Basic add_data_row_predicate / find_data_row_failures checks on the
    diet and netflow schemas (superseded by the later definition; see NOTE)."""
    if not self.canRun:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticdat = tdf.TicDat()
    ticdat.foods["a"] = 12
    ticdat.foods["b"] = None
    ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
    ticdat.categories["2"] = [21,20]  # max < min: violates the minmax predicate
    for f, p in itertools.product(ticdat.foods, ticdat.categories):
        ticdat.nutritionQuantities[f,p] = 5
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_duplicates(pandat))
    self.assertFalse(pdf.find_data_row_failures(pandat))
    ticdat.nutritionQuantities['a', 2] = 12
    ticdat.categories["3"] = ['a', 100]
    pandat_2 = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    def perform_predicate_checks(sch):
        # Run the same predicate assertions for a typed and a '*' schema.
        pdf = PanDatFactory(**sch)
        pdf.add_data_row_predicate("foods",
                                   lambda row: numericish(row["cost"]) and not isnan(row["cost"]),
                                   "cost")
        good_qty = lambda qty : 5 < qty <= 12
        pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
        pdf.add_data_row_predicate("categories",
                                   lambda row: row["maxNutrition"] >= row["minNutrition"],
                                   "minmax")
        failed = pdf.find_data_row_failures(pandat)
        self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty'),
                                        ('categories', 'minmax')})
        self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
        self.assertTrue(set({(v["food"], v["category"])
                             for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                        {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
        self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2'})
        failed = pdf.find_data_row_failures(pandat, as_table=False)
        self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
        failed = pdf.find_data_row_failures(pandat_2)
        self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})
    perform_predicate_checks(dietSchema())
    perform_predicate_checks({t:'*' for t in dietSchema()})
    # Netflow: string capacities, predicate-based validation.
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    pdf = PanDatFactory(**netflowSchema())
    ticdat = tdf.copy_tic_dat(netflowData())
    for n in ticdat.nodes["Detroit"].arcs_source:
        ticdat.arcs["Detroit", n] = n
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_duplicates(pandat))
    self.assertFalse(pdf.find_data_row_failures(pandat))
    pdf = PanDatFactory(**netflowSchema())
    pdf.add_data_row_predicate("arcs", lambda row: True, "capacity")
    self.assertFalse(pdf.find_data_row_failures(pandat))
    pdf = PanDatFactory(**netflowSchema())
    good_capacity = lambda capacity: numericish(capacity) or capacity in ["Boston", "Seattle",
                                                                          "lumberjack"]
    pdf.add_data_row_predicate("arcs", lambda row: good_capacity(row["capacity"]), "capacity")
    failed = pdf.find_data_row_failures(pandat)
    self.assertTrue(set(failed) == {('arcs', 'capacity')})
    self.assertTrue(set({(v["source"], v["destination"])
                         for v in failed['arcs', 'capacity'].T.to_dict().values()}) ==
                    {("Detroit", "New York")})
def testDataRowPredicatesTwo(self):
    """Predicates with predicate_kwargs_maker.

    Verifies the kwargs maker is called once per find_data_row_failures pass
    (shared across predicates via num_calls), that exceptions inside it either
    propagate or become failure rows under exception_handling="Handled as
    Failure", that clone() preserves the predicates, and that a predicate can
    return an error-message string under predicate_failure_response.
    """
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    num_calls=[0]  # mutable cell so the closure can count invocations
    mess_it_up=[]  # non-empty => pre_processor raises AttributeError
    def pre_processor(dat):
        num_calls[0] += 1
        if mess_it_up:
            dat.messing_it_up+=1  # deliberate AttributeError on a PanDat
        return {t:len(getattr(dat, t)) for t in tdf.all_tables}
    pdf.add_data_row_predicate("foods", lambda row, y: y==12,
                               predicate_kwargs_maker=lambda dat: {"y":12})
    # Both predicates below share pre_processor, so one pass = one call.
    pdf.add_data_row_predicate("categories",
                               lambda row, nutritionQuantities, foods, categories:
                                   row["name"] == "fat" or categories == 4,
                               predicate_name="catfat",
                               predicate_kwargs_maker=pre_processor)
    pdf.add_data_row_predicate("foods",
                               lambda row, nutritionQuantities, foods, categories:
                                   row["name"] == "pizza" or foods == 9,
                               predicate_name= "foodza",
                               predicate_kwargs_maker=pre_processor)
    def dummy_kwargs_maker(dat):
        if pdf.good_pan_dat_object(dat):
            return {"x":1}
    for t in tdf.all_tables:
        pdf.add_data_row_predicate(t, lambda row, x: x==1, predicate_name=f"dummy_{t}",
                                   predicate_kwargs_maker=dummy_kwargs_maker)
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, tdf.copy_tic_dat(dietData())))
    self.assertFalse(pdf.find_data_row_failures(pandat))
    self.assertTrue(num_calls[0] == 1)
    # Remove the rows that satisfied catfat/foodza trivially; counts change.
    pandat.foods = pandat.foods[pandat.foods["name"] != "pizza"].copy()
    pandat.categories = pandat.categories[pandat.categories["name"] != "fat"].copy()
    fails = pdf.find_data_row_failures(pandat)
    self.assertTrue(num_calls[0] == 2)
    self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
    self.assertTrue(set(fails['categories', 'catfat']["name"]) ==
                    set(dietData().categories).difference(["fat"]))
    self.assertTrue(set(fails['foods', 'foodza']["name"]) ==
                    set(dietData().foods).difference(["pizza"]))
    # Now make pre_processor raise: default handling propagates the exception.
    mess_it_up.append(1)
    ex = []
    try:
        pdf.find_data_row_failures(pandat)
    except Exception as e:
        ex[:] = [str(e.__class__)]
    self.assertTrue("AttributeError" in ex[0])
    # "Handled as Failure" converts the raising kwargs-maker into failure rows.
    fails = pdf.find_data_row_failures(pandat, exception_handling="Handled as Failure")
    self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
    self.assertTrue(num_calls[0] == 4)
    for v in fails.values():
        self.assertTrue(v.primary_key == '*' and "no attribute" in v.error_message)
    # clone() must carry the predicates (and their kwargs makers) along.
    pdf = pdf.clone()
    fails = pdf.find_data_row_failures(pandat, exception_handling="Handled as Failure")
    self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
    mess_it_up=[]
    def fail_on_bad_name(row, bad_name):
        if row["name"] == bad_name:
            return f"{bad_name} is bad"
        return True
    pdf.add_data_row_predicate("foods", fail_on_bad_name, predicate_name="baddy",
                               predicate_kwargs_maker=lambda dat:
                                   {"bad_name": sorted(dat.foods["name"])[0]},
                               predicate_failure_response="Error Message")
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, tdf.copy_tic_dat(dietData())))
    fails = pdf.find_data_row_failures(pandat)
    self.assertTrue(set(map(tuple, fails)) == {('foods', 'baddy')})
    self.assertTrue(len(fails['foods', 'baddy']) == 1)
    self.assertTrue(list(fails['foods', 'baddy']["Error Message"])[0] == "chicken is bad")
def testDataPredicates(self):
    # this test won't run properly if the -O flag is applied
    # NOTE(review): this is the second definition of testDataPredicates in this
    # class; it shadows (and extends) the earlier one, which therefore never runs.
    """Row predicates: boolean form vs "Error Message" form.

    For each predicate, a mirrored pdf2 wraps it so a failure returns the
    string "<name> failed!" instead of False. Both factories must agree on
    which rows fail; the Error Message form must also surface exception text
    (exception_handling="Handled as Failure") and avoid clobbering existing
    "Error Message" columns by suffixing " (N)".
    """
    if not self.canRun:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticdat = tdf.TicDat()
    ticdat.foods["a"] = 12
    ticdat.foods["b"] = None
    ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
    ticdat.categories["2"] = [21,20]  # max < min: violates minmax
    for f, p in itertools.product(ticdat.foods, ticdat.categories):
        ticdat.nutritionQuantities[f,p] = 5
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_duplicates(pandat))
    self.assertFalse(pdf.find_data_row_failures(pandat))
    ticdat.nutritionQuantities['a', 2] = 12
    ticdat.categories["3"] = ['a', 100]  # string maxNutrition -> TypeError in minmax
    pandat_2 = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    def perform_predicate_checks(sch):
        # Run the full battery for a typed schema and again for a '*' schema.
        pdf = PanDatFactory(**sch)
        pdf.add_data_row_predicate("foods",
                                   lambda row: numericish(row["cost"]) and not isnan(row["cost"]),
                                   "cost")
        good_qty = lambda qty : 5 < qty <= 12
        pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
        pdf.add_data_row_predicate("categories",
                                   lambda row: row["maxNutrition"] >= row["minNutrition"],
                                   "minmax")
        pdf2 = PanDatFactory(**sch)
        def make_error_message_predicate(f, name):
            # Wrap a boolean predicate into the "Error Message" protocol.
            def error_message_predicate(row):
                rtn = f(row)
                if rtn:
                    return True
                return f"{name} failed!"
            return error_message_predicate
        for t, preds in pdf._data_row_predicates.items():
            for p_name, rpi in preds.items():
                pdf2.add_data_row_predicate(t, make_error_message_predicate(rpi.predicate, p_name),
                                            predicate_name=p_name,
                                            predicate_failure_response="Error Message")
        failed = pdf.find_data_row_failures(pandat)
        failed2 = pdf2.find_data_row_failures(pandat)
        self.assertTrue(set(failed) == set(failed2) ==
                        {('foods', 'cost'), ('nutritionQuantities', 'qty'),
                         ('categories', 'minmax')})
        self.assertTrue(set(failed['foods', 'cost']["name"]) ==
                        set(failed2['foods', 'cost']["name"]) == {'b'})
        for f in [failed, failed2]:
            self.assertTrue(set({(v["food"], v["category"])
                                 for v in f['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                            {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
            self.assertTrue(set(f['categories', 'minmax']["name"]) == {'2'})
        for t, n in failed2:
            self.assertTrue(set(failed2[t, n]["Error Message"]) == {f'{n} failed!'})
        for _pdf in [pdf, pdf2]:
            failed = _pdf.find_data_row_failures(pandat, as_table=False)
            self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
            # pandat_2's string maxNutrition raises unless handled as a failure.
            ex = []
            try:
                _pdf.find_data_row_failures(pandat_2)
            except Exception as e:
                ex[:] = [str(e.__class__)]
            self.assertTrue("TypeError" in ex[0])
            failed = _pdf.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
            self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})
        # The Error Message form embeds the exception text for the raising row.
        failed = pdf2.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
        df = failed['categories', 'minmax']
        err_str = list(df[df['name'] == '3']["Error Message"])[0]
        self.assertTrue(err_str=="Exception<'>=' not supported between instances of 'int' and 'str'>")
    perform_predicate_checks(dietSchema())
    perform_predicate_checks({t:'*' for t in dietSchema()})
    # Netflow variant, same as the basic predicate checks.
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    pdf = PanDatFactory(**netflowSchema())
    ticdat = tdf.copy_tic_dat(netflowData())
    for n in ticdat.nodes["Detroit"].arcs_source:
        ticdat.arcs["Detroit", n] = n
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_duplicates(pandat))
    self.assertFalse(pdf.find_data_row_failures(pandat))
    pdf = PanDatFactory(**netflowSchema())
    pdf.add_data_row_predicate("arcs", lambda row: True, "capacity")
    self.assertFalse(pdf.find_data_row_failures(pandat))
    pdf = PanDatFactory(**netflowSchema())
    good_capacity = lambda capacity: numericish(capacity) or capacity in ["Boston", "Seattle",
                                                                          "lumberjack"]
    pdf.add_data_row_predicate("arcs", lambda row: good_capacity(row["capacity"]), "capacity")
    failed = pdf.find_data_row_failures(pandat)
    self.assertTrue(set(failed) == {('arcs', 'capacity')})
    self.assertTrue(set({(v["source"], v["destination"])
                         for v in failed['arcs', 'capacity'].T.to_dict().values()}) ==
                    {("Detroit", "New York")})
    # An existing "Error Message" column forces the suffixed "Error Message (2)".
    pdf = PanDatFactory(table=[[],["Field", "Error Message", "Error Message (1)"]])
    pdf.add_data_row_predicate("table",
                               predicate=lambda row: f"Oops {row['Field']}"
                                                     if row["Field"] > 1 else True,
                               predicate_name="silly",
                               predicate_failure_response="Error Message")
    df = DataFrame({"Field":[2, 1], "Error Message":["what", "go"],
                    "Error Message (1)": ["now", "go"]})
    fails = pdf.find_data_row_failures(pdf.PanDat(table=df))
    df = fails["table", "silly"]
    self.assertTrue(list(df.columns) == ["Field", "Error Message", "Error Message (1)",
                                         "Error Message (2)"])
    self.assertTrue(set(df["Field"]) == {2} and set(df["Error Message (2)"]) == {'Oops 2'})
def testCsvSimple(self):
    """Round-trip PanDat objects through csv.write_directory / create_pan_dat.

    Covers: diet and netflow schemas, '*' (generic) schemas, reading a
    directory with missing tables (empty tables result), and the European
    decimal="," option on both write and read.
    """
    # NOTE(review): this method tests self.can_run while the other tests here
    # use self.canRun — verify which attribute the test class actually defines.
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(
        tdf.TicDat(**{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "diet_csv")
    pdf.csv.write_directory(panDat, dirPath)
    panDat2 = pdf.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    # A '*' schema reads the same csv files identically.
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    panDat2 = pdf2.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    # Write a directory missing nutritionQuantities: reading with the full
    # schema yields an empty (but present) table.
    pdf2 = PanDatFactory(**{k: v for k, v in dietSchema().items()
                            if k != "nutritionQuantities"})
    panDat2 = pdf2.copy_pan_dat(panDat)
    dirPath = os.path.join(_scratchDir, "diet_missing_csv")
    pdf2.csv.write_directory(panDat2, dirPath, makeCleanDir(dirPath))
    panDat3 = pdf.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf2._same_data(panDat2, panDat3))
    self.assertTrue(all(hasattr(panDat3, x) for x in pdf.all_tables))
    self.assertFalse(len(panDat3.nutritionQuantities))
    self.assertTrue(len(panDat3.categories) and len(panDat3.foods))
    # Only categories written: the other two tables come back empty.
    pdf2 = PanDatFactory(**{k: v for k, v in dietSchema().items() if k == "categories"})
    panDat2 = pdf2.copy_pan_dat(panDat)
    pdf2.csv.write_directory(panDat2, dirPath, makeCleanDir(dirPath))
    panDat3 = pdf.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf2._same_data(panDat2, panDat3))
    self.assertTrue(all(hasattr(panDat3, x) for x in pdf.all_tables))
    self.assertFalse(len(panDat3.nutritionQuantities) or len(panDat3.foods))
    self.assertTrue(len(panDat3.categories))
    # Netflow round trip, typed and '*' schema.
    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(
        tdf.TicDat(**{t: getattr(netflowData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "netflow_csv")
    pdf.csv.write_directory(panDat, dirPath)
    panDat2 = pdf.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    pdf2.csv.write_directory(panDat, dirPath)
    panDat2 = pdf2.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    # decimal="," must be symmetric: mismatched read fails, matched read works.
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(
        tdf.TicDat(**{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "diet_csv")
    pdf.csv.write_directory(panDat, dirPath, decimal=",")
    panDat2 = pdf.csv.create_pan_dat(dirPath)
    self.assertFalse(pdf._same_data(panDat, panDat2))
    panDat2 = pdf.csv.create_pan_dat(dirPath, decimal=",")
    self.assertTrue(pdf._same_data(panDat, panDat2))