def testSimple(self):
    """Smoke test: PanDat construction, copying, validity checks, and _same_data.

    Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    the row-duplication step now uses pandas.concat, which is equivalent.
    """
    if not self.canRun:  # NOTE(review): other tests use self.can_run -- confirm attribute name
        return
    from pandas import concat  # local import: replacement for removed DataFrame.append
    pdf = PanDatFactory(**netflowSchema())
    _dat = netflowPandasData()
    dat = pdf.PanDat(**{t: getattr(_dat, t) for t in pdf.all_tables})
    self.assertTrue(pdf.good_pan_dat_object(dat))
    dat2 = pdf.copy_pan_dat(dat)
    self.assertTrue(pdf._same_data(dat, dat2))
    self.assertTrue(pdf.good_pan_dat_object(dat2))
    # removing a table attribute invalidates the object with a clear message
    delattr(dat2, "nodes")
    msg = []
    self.assertFalse(pdf.good_pan_dat_object(dat2, msg.append))
    self.assertTrue(msg[-1] == "nodes not an attribute.")
    # dropping a column invalidates the object with a different message
    dat3 = pdf.copy_pan_dat(dat)
    dat3.cost.drop("commodity", axis=1, inplace=True)
    self.assertFalse(pdf.good_pan_dat_object(dat3, msg.append))
    self.assertTrue("The following are (table, field) pairs missing from the data" in msg[-1])
    dat4 = pdf.copy_pan_dat(dat)
    dat4.cost["cost"] += 1
    self.assertFalse(pdf._same_data(dat, dat4))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    dat5 = pdf2.copy_pan_dat(dat)
    self.assertTrue(pdf._same_data(dat, dat5))
    self.assertTrue(pdf2._same_data(dat, dat5))
    # duplicating rows must break equality under both factories
    # (was DataFrame.append, removed in pandas 2.0 -- concat is the drop-in equivalent)
    dat.commodities = concat(
        [dat.commodities, dat.commodities[dat.commodities["name"] == "Pencils"]])
    dat.arcs = concat([dat.arcs, dat.arcs[dat.arcs["destination"] == "Boston"]])
    self.assertFalse(pdf2._same_data(dat, dat5))
    self.assertFalse(pdf._same_data(dat, dat5))
def testDataTypes_two(self):
    """Exercise find/replace of data-type failures when primary-key fields are null."""
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**tdf.schema())

    def make_test_dat():
        # deliberately include a None food key and a None-valued food
        rtn = tdf.TicDat()
        rtn.foods["a"] = 12
        rtn.foods["b"] = None
        rtn.foods[None] = 101
        rtn.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
        rtn.categories["2"] = [10, 20]
        for f, p in itertools.product(rtn.foods, rtn.categories):
            rtn.nutritionQuantities[f, p] = 5
        rtn.nutritionQuantities['a', 2] = 12
        return tdf.copy_to_pandas(rtn, drop_pk_columns=False)

    dat = make_test_dat()
    errs = pdf.find_data_type_failures(dat)
    self.assertTrue(len(errs) == 2 and not pdf.find_duplicates(dat))
    dat_copied = pdf.copy_pan_dat(dat)
    pdf.replace_data_type_failures(dat)
    self.assertTrue(pdf._same_data(dat, dat_copied, epsilon=0.00001))
    # replacing nulls with the default key "a" creates duplicates
    pdf2 = pdf.clone()
    pdf2.set_default_value("foods", "name", "a")
    pdf2.set_default_value("nutritionQuantities", "food", "a")
    pdf2.replace_data_type_failures(dat_copied)
    self.assertFalse(pdf._same_data(dat, dat_copied, epsilon=0.00001))
    self.assertFalse(pdf.find_data_type_failures(dat_copied))
    dups = pdf.find_duplicates(dat_copied)
    self.assertTrue(len(dups) == 2 and len(dups["foods"]) == 1
                    and len(dups["nutritionQuantities"]) == 2)
    from pandas import isnull

    def noneify(iter_of_tuples):
        # normalize NaN back to None so tuples compare predictably
        return {tuple(None if isnull(_) else _ for _ in tuple_)
                for tuple_ in iter_of_tuples}

    self.assertTrue(noneify(errs['nutritionQuantities', 'food'].itertuples(index=False))
                    == {(None, "1", 5), (None, "2", 5)})
    self.assertTrue(noneify(errs['foods', 'name'].itertuples(index=False)) == {(None, 101)})
    # once nulls are allowed for the key fields, only the None-valued cost fails
    pdf = PanDatFactory(**tdf.schema())
    pdf.set_data_type("foods", "name", nullable=True, strings_allowed='*')
    pdf.set_data_type("nutritionQuantities", "food", nullable=True, strings_allowed='*')
    self.assertFalse(pdf.find_data_type_failures(dat))
    pdf.set_data_type("foods", "cost", nullable=False)
    errs = pdf.find_data_type_failures(dat)
    self.assertTrue(len(errs) == 1)
    self.assertTrue(noneify(errs['foods', 'cost'].itertuples(index=False)) == {('b', None)})
def testXlsSpacey(self):
    """Round trip xlsx files written with case/space munged sheet names."""
    if not self.can_run:
        return
    tdf = TicDatFactory(**spacesSchema())
    pdf = PanDatFactory(**spacesSchema())
    ticDat = tdf.TicDat(**spacesData())
    panDat = pan_dat_maker(spacesSchema(), ticDat)
    ext = ".xlsx"
    file_path = os.path.join(_scratchDir, "spaces_2%s" % ext)
    pdf.xls.write_file(panDat, file_path, case_space_sheet_names=True)
    panDat2 = pdf.xls.create_pan_dat(file_path)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    # repeat with the netflow schema
    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(netflowData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    file_path = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
    pdf.xls.write_file(panDat, file_path, case_space_sheet_names=True)
    panDat2 = pdf.xls.create_pan_dat(file_path)
    self.assertTrue(pdf._same_data(panDat, panDat2))
def testDietOpalytics(self):
    """Create PanDats from an opalytics-style inputset mock under all flag combos."""
    if not self.can_run:
        return
    for hack, raw_data, activeEnabled in list(
            itertools.product(*(([True, False],) * 3))):
        tdf = TicDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(tdf.copy_tic_dat(dietData()))
        inputset = create_inputset_mock(tdf, ticDat, hack, activeEnabled)
        pdf = PanDatFactory(**dietSchema())
        panDat = pdf.opalytics.create_pan_dat(inputset)
        self.assertFalse(pdf.find_duplicates(panDat))
        ticDat2 = pdf.copy_to_tic_dat(panDat)
        self.assertTrue(tdf._same_data(ticDat, ticDat2))
        # a schema carrying an extra "dmy" data field still round trips the shared fields
        tdf2 = TicDatFactory(**{k: [pks, list(dfs) + ["dmy"]]
                                for k, (pks, dfs) in tdf.schema().items()})
        _dat = tdf2.copy_tic_dat(ticDat)
        panDat = pdf.opalytics.create_pan_dat(create_inputset_mock(tdf2, _dat, hack))
        self.assertTrue(tdf._same_data(ticDat, pdf.copy_to_tic_dat(panDat)))
        # but reading the narrow inputset with the wide schema must complain about "dmy"
        pdf2 = PanDatFactory(**tdf2.schema())
        ex = self.firesException(
            lambda: pdf2.opalytics.create_pan_dat(inputset, raw_data=raw_data))
        self.assertTrue(all(_ in ex for _ in ["(table, field) pairs missing"] +
                            ["'%s', 'dmy'" % _ for _ in pdf2.all_tables]))
def testVariousCoverages(self):
    """Touch assorted APIs: defaults, fk clearing, predicate add/remove, clone, fk cycles."""
    pdf = PanDatFactory(**dietSchema())
    _d = dict(categories={"minNutrition": 0, "maxNutrition": float("inf")},
              foods={"cost": 0},
              nutritionQuantities={"qty": 0})
    pdf.set_default_values(**_d)
    self.assertTrue(pdf._default_values == _d)
    pdf = PanDatFactory(**netflowSchema())
    addNetflowForeignKeys(pdf)
    pdf.clear_foreign_keys("arcs")
    self.assertTrue({_[0] for _ in pdf._foreign_keys} == {"cost", "inflow"})
    pdf.add_data_row_predicate("arcs", lambda row: True)
    pdf.add_data_row_predicate("arcs", lambda row: True, "dummy")
    pdf.add_data_row_predicate("arcs", None, 0)  # passing None removes predicate 0
    pdf = pdf.clone()
    self.assertTrue(set(pdf._data_row_predicates["arcs"]) == {"dummy"})
    # a full cycle of foreign keys should be accepted
    pdf = PanDatFactory(pdf_table_one=[["A Field"], []],
                        pdf_table_two=[["B Field"], []],
                        pdf_table_three=[["C Field"], []])
    pdf.add_foreign_key("pdf_table_one", "pdf_table_two", ["A Field", "B Field"])
    pdf.add_foreign_key("pdf_table_two", "pdf_table_three", ["B Field", "C Field"])
    pdf.add_foreign_key("pdf_table_three", "pdf_table_one", ["C Field", "A Field"])
def testSqlSimple(self):
    """Round trip diet and netflow PanDats through SQLite database files."""
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    file_path = os.path.join(_scratchDir, "diet.db")
    pdf.sql.write_file(panDat, file_path)
    sqlPanDat = pdf.sql.create_pan_dat(file_path)
    self.assertTrue(pdf._same_data(panDat, sqlPanDat))
    # a generic '*' schema writes/reads the same data
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    pdf2.sql.write_file(panDat, file_path)
    sqlPanDat = pdf2.sql.create_pan_dat(file_path)
    self.assertTrue(pdf._same_data(panDat, sqlPanDat))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(netflowData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    file_path = os.path.join(_scratchDir, "netflow.db")
    pdf.sql.write_file(panDat, file_path)
    panDat2 = pdf.sql.create_pan_dat(file_path)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    sqlPanDat = pdf2.sql.create_pan_dat(file_path)
    self.assertTrue(pdf._same_data(panDat, sqlPanDat))
def testXToManyTwo(self):
    """Foreign keys mapping multiple child fields onto a compound parent key."""
    input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                 child_one=[["F1", "F2", "F3"], []],
                                 child_two=[["F1", "F2"], ["F3"]],
                                 child_three=[[], ["F1", "F2", "F3"]])
    for t in ["child_one", "child_two", "child_three"]:
        input_schema.add_foreign_key(t, "parent",
                                     [["F1"] * 2, ["F2"] * 2, ["F3"] * 2])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} ==
                    {"one-to-one", "many-to-one"})
    rows = [[1, 2, 3], [1, 2.1, 3], [4, 5, 6], [4, 5.1, 6], [7, 8, 9]]
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(parent=rows, child_one=rows, child_two=rows, child_three=rows)
    self.assertTrue(all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    # inject one orphan row per child table, then verify removal restores equality
    dat.child_one[1, 2, 4] = {}
    dat.child_two[1, 2.2] = 3
    dat.child_three.append([1, 2, 4])
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    self.assertTrue(len(fk_fails) == 3)
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))

    # same exercise with only (F1, F3) participating in the foreign key
    input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                 child_one=[["F1", "F2", "F3"], []],
                                 child_two=[["F1", "F2"], ["F3"]],
                                 child_three=[[], ["F1", "F2", "F3"]])
    for t in ["child_one", "child_two", "child_three"]:
        input_schema.add_foreign_key(t, "parent", [["F1"] * 2, ["F3"] * 2])
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(parent=rows, child_one=rows, child_two=rows, child_three=rows)
    self.assertTrue(all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    dat.child_one[1, 2, 4] = {}
    dat.child_two[1, 2.2] = 4
    dat.child_three.append([1, 2, 4])
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertTrue(len(input_schema.find_foreign_key_failures(new_pan_dat)) == 3)
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
def testDietWithInfFlagging(self):
    """Infinity flagged as 999999999 must survive sql/csv/json/xls round trips."""
    diet_pdf = PanDatFactory(**dietSchema())
    addDietDataTypes(diet_pdf)
    tdf = TicDatFactory(**dietSchema())
    dat = tdf.copy_to_pandas(tdf.copy_tic_dat(dietData()), drop_pk_columns=False)
    diet_pdf.set_infinity_io_flag(999999999)
    core_path = os.path.join(_scratchDir, "diet_with_inf_flagging")
    diet_pdf.sql.write_file(dat, core_path + ".db")
    diet_pdf.csv.write_directory(dat, core_path + "_csv")
    diet_pdf.json.write_file(dat, core_path + ".json")
    diet_pdf.xls.write_file(dat, core_path + ".xlsx")
    for attr, f in [["sql", core_path + ".db"], ["csv", core_path + "_csv"],
                    ["json", core_path + ".json"], ["xls", core_path + ".xlsx"]]:
        dat_1 = getattr(diet_pdf, attr).create_pan_dat(f)
        self.assertTrue(diet_pdf._same_data(dat, dat_1, epsilon=1e-5))
        # cloning preserves the infinity flag
        pdf = diet_pdf.clone()
        dat_1 = getattr(pdf, attr).create_pan_dat(f)
        self.assertTrue(pdf._same_data(dat, dat_1, epsilon=1e-5))
        # a plain factory has no flag, so 999999999 reads back as a literal number
        pdf = PanDatFactory(**diet_pdf.schema())
        dat_1 = getattr(pdf, attr).create_pan_dat(f)
        self.assertFalse(pdf._same_data(dat, dat_1, epsilon=1e-5))
        protein = dat_1.categories["name"] == "protein"
        self.assertTrue(list(dat_1.categories[protein]["maxNutrition"])[0] == 999999999)
        dat_1.categories.loc[protein, "maxNutrition"] = float("inf")
        self.assertTrue(pdf._same_data(dat, dat_1, epsilon=1e-5))
def testDictConstructions(self):
    """PanDat can be built from DataFrame.to_dict output in several orientations."""
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    panDat2 = pdf.PanDat(**{t: getattr(panDat, t).to_dict() for t in pdf.all_tables})
    panDat3 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="list")
                            for t in pdf.all_tables})
    panDat3_1 = pdf.PanDat(**{t: list(map(list, getattr(panDat, t).itertuples(index=False)))
                              for t in pdf.all_tables})
    self.assertTrue(all(pdf._same_data(panDat, _) for _ in [panDat2, panDat3, panDat3_1]))
    # extra columns are carried along untouched
    panDat.foods["extra"] = 12
    panDat4 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="list")
                            for t in pdf.all_tables})
    self.assertTrue(pdf._same_data(panDat, panDat4))
    self.assertTrue(set(panDat4.foods["extra"]) == {12})

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(netflowData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    panDat2 = pdf.PanDat(**{t: getattr(panDat, t).to_dict() for t in pdf.all_tables})
    panDat3 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="records")
                            for t in pdf.all_tables})
    self.assertTrue(all(pdf._same_data(panDat, _) for _ in [panDat2, panDat3]))
    panDat.cost["extra"] = "boger"
    panDat4 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="list")
                            for t in pdf.all_tables})
    self.assertTrue(pdf._same_data(panDat, panDat4))
    self.assertTrue(set(panDat4.cost["extra"]) == {"boger"})
def testDataTypes(self):
    """find/replace of data-type failures on the diet and netflow schemas."""
    if not self.canRun:  # NOTE(review): other tests use self.can_run -- confirm attribute name
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticdat = tdf.TicDat()
    ticdat.foods["a"] = 12
    ticdat.foods["b"] = None
    ticdat.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
    ticdat.categories["2"] = [10, 20]
    for f, p in itertools.product(ticdat.foods, ticdat.categories):
        ticdat.nutritionQuantities[f, p] = 5
    ticdat.nutritionQuantities['a', 2] = 12
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    # without type settings nothing fails, and replace is a no-op
    self.assertFalse(pdf.find_data_type_failures(pandat))
    pandat_copy = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat))
    self.assertTrue(pdf._same_data(pandat, pandat_copy, epsilon=0.00001))
    pdf = PanDatFactory(**dietSchema())
    pdf.set_data_type("foods", "cost", nullable=False)
    pdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False,
                      max=12, inclusive_max=True)
    failed = pdf.find_data_type_failures(pandat)
    self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
    self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
    self.assertTrue(set({(v["food"], v["category"])
                         for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()})
                    == {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
    failed = pdf.find_data_type_failures(pandat, as_table=False)
    self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
    fixed = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat),
                                           {("nutritionQuantities", "qty"): 5.15})
    self.assertTrue(set(fixed.foods["cost"]) == {0.0, 12.0})
    self.assertTrue(set(fixed.nutritionQuantities["qty"]) == {5.15, 12.0})

    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    pdf = PanDatFactory(**netflowSchema())
    ticdat = tdf.copy_tic_dat(netflowData())
    for n in ticdat.nodes["Detroit"].arcs_source:
        ticdat.arcs["Detroit", n] = n  # make some capacities strings
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_data_type_failures(pandat))
    pdf = PanDatFactory(**netflowSchema())
    pdf.set_data_type("arcs", "capacity", strings_allowed="*")
    self.assertFalse(pdf.find_data_type_failures(pandat))
    pdf = PanDatFactory(**netflowSchema())
    pdf.set_data_type("arcs", "capacity",
                      strings_allowed=["Boston", "Seattle", "lumberjack"])
    failed = pdf.find_data_type_failures(pandat)
    self.assertTrue(set(failed) == {('arcs', 'capacity')})
    self.assertTrue(set({(v["source"], v["destination"])
                         for v in failed['arcs', 'capacity'].T.to_dict().values()})
                    == {("Detroit", "New York")})
    pdf.replace_data_type_failures(pandat)
    self.assertTrue(set(pandat.arcs["capacity"]) == {120, 'Boston', 0, 'Seattle'})
def testIssue45(self):
    """Numeric-looking strings: xlsx/csv readers coerce unless a string dtype is set."""
    pdf = PanDatFactory(data=[["a"], ["b"]])
    tdf = TicDatFactory(**pdf.schema())
    dat_nums = tdf.copy_to_pandas(
        tdf.TicDat(data=[[1, 2], [3, 4], [22, 44]]), drop_pk_columns=False)
    dat_strs = tdf.copy_to_pandas(
        tdf.TicDat(data=[["1", "2"], ["3", "4"], ["022", "0044"]]), drop_pk_columns=False)
    files = [os.path.join(_scratchDir, _) for _ in ["dat_nums.xlsx", "dat_strs.xlsx"]]
    pdf.xls.write_file(dat_nums, files[0])
    pdf.xls.write_file(dat_strs, files[1])
    dat_nums_2, dat_strs_2 = [pdf.xls.create_pan_dat(_) for _ in files]
    self.assertTrue(pdf._same_data(dat_nums, dat_nums_2))
    # this is pandas pushing things to be numeric
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_nums, dat_strs_2))
    # with a string-only dtype on "a", the reads preserve strings for that column
    pdf = PanDatFactory(data=[["a"], ["b"]])
    pdf.set_data_type("data", "a", number_allowed=False, strings_allowed='*')
    dat_mixed = tdf.copy_to_pandas(
        tdf.TicDat(data=[["1", 2], ["3", 4], ["022", 44]]), drop_pk_columns=False)
    dat_nums_2, dat_strs_2 = [pdf.xls.create_pan_dat(_) for _ in files]
    self.assertFalse(pdf._same_data(dat_nums, dat_nums_2))
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertFalse(pdf._same_data(dat_nums_2, dat_mixed))
    self.assertTrue(pdf._same_data(dat_strs_2, dat_mixed))

    # same story for csv directories
    pdf = PanDatFactory(data=[["a"], ["b"]])
    csv_dirs = [os.path.join(_scratchDir, _) for _ in ["dat_nums_csv", "dat_strs_csv"]]
    pdf.csv.write_directory(dat_nums, csv_dirs[0])
    pdf.csv.write_directory(dat_strs, csv_dirs[1])
    dat_nums_2, dat_strs_2 = [pdf.csv.create_pan_dat(_) for _ in csv_dirs]
    self.assertTrue(pdf._same_data(dat_nums, dat_nums_2))
    # this is pandas pushing things to be numeric
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_nums, dat_strs_2))
    pdf = PanDatFactory(data=[["a"], ["b"]])
    pdf.set_data_type("data", "a", number_allowed=False, strings_allowed='*')
    dat_nums_2, dat_strs_2 = [pdf.csv.create_pan_dat(_) for _ in csv_dirs]
    self.assertFalse(pdf._same_data(dat_nums, dat_nums_2))
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertFalse(pdf._same_data(dat_nums_2, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_strs_2, dat_mixed))
def testSqlSpaceyTwo(self):
    """Write/read via externally-managed SQLite connections with spacey table names."""
    if not self.can_run:
        return
    self.assertTrue(pandatio.sql, "this unit test requires SQLite installed")
    tdf = TicDatFactory(**spacesSchema())
    pdf = PanDatFactory(**spacesSchema())
    ticDat = tdf.TicDat(**{
        "a_table": {1: [1, 2, "3"],
                    22.2: (12, 0.12, "something"),
                    0.23: (11, 12, "thirt")},
        "b_table": {(1, 2, "foo"): 1, (1012.22, 4, "0012"): 12},
        "c_table": (("this", 2, 3, 4),
                    ("that", 102.212, 3, 5.5),
                    ("another", 5, 12.5, 24))})
    panDat = pan_dat_maker(spacesSchema(), ticDat)
    ext = ".db"
    file_path = os.path.join(_scratchDir, "spaces_2%s" % ext)
    with pandatio.sql.connect(file_path) as con:
        pdf.sql.write_file(panDat, db_file_path=None, con=con,
                           case_space_table_names=True)
    with pandatio.sql.connect(file_path) as con:
        panDat2 = pdf.sql.create_pan_dat(db_file_path=None, con=con)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(netflowData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    file_path = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
    with pandatio.sql.connect(file_path) as con:
        pdf.sql.write_file(panDat, db_file_path="", con=con,
                           case_space_table_names=True)
    with pandatio.sql.connect(file_path) as con:
        panDat2 = pdf.sql.create_pan_dat(None, con)
    self.assertTrue(pdf._same_data(panDat, panDat2))
def testInfFlagging(self):
    """set_infinity_io_flag(None) reads nulls back as +/-inf per the field's bounds."""
    pdf = PanDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        pdf.set_data_type("table", f, nullable=True)

    def make_dat(rows):
        tdf = TicDatFactory(**pdf.schema())
        return tdf.copy_to_pandas(tdf.TicDat(table=rows), drop_pk_columns=False)

    dat = make_dat([[None, 100], [200, 109], [0, 300], [300, None], [400, 0]])
    core_path = os.path.join(_scratchDir, "non_inf_flagging")
    for attr, path in [["sql", core_path + ".db"], ["csv", core_path + "_csv"],
                       ["json", core_path + ".json"], ["xls", core_path + ".xlsx"]]:
        func = "write_directory" if attr == "csv" else "write_file"
        getattr(getattr(pdf, attr), func)(dat, path)
        dat_1 = getattr(pdf, attr).create_pan_dat(path)
        _ = PanDatFactory(table=[[], ["field one", "field two"]])
        self.assertTrue(_._same_data(dat, dat_1, nans_are_same_for_data_rows=True))
        # inclusive_max + infinity flag None: nulls come back as +inf
        pdf_ = PanDatFactory(table=[["field one"], ["field two"]])
        for f in ["field one", "field two"]:
            pdf_.set_data_type("table", f, max=float("inf"), inclusive_max=True)
        pdf_.set_infinity_io_flag(None)
        dat_inf = make_dat([[float("inf"), 100], [200, 109], [0, 300],
                            [300, float("inf")], [400, 0]])
        dat_1 = getattr(pdf_, attr).create_pan_dat(path)
        self.assertTrue(pdf._same_data(dat_inf, dat_1))
        getattr(getattr(pdf_, attr), func)(dat, path)
        dat_1 = getattr(pdf_, attr).create_pan_dat(path)
        # self.assertTrue(pdf._same_data(dat_inf, dat_1))
        # inclusive_min + infinity flag None: nulls come back as -inf
        pdf_ = PanDatFactory(table=[["field one"], ["field two"]])
        for f in ["field one", "field two"]:
            pdf_.set_data_type("table", f, min=-float("inf"), inclusive_min=True)
        pdf_.set_infinity_io_flag(None)
        dat_1 = getattr(pdf_, attr).create_pan_dat(path)
        # self.assertFalse(pdf._same_data(dat_inf, dat_1))
        dat_inf = make_dat([[float("-inf"), 100], [200, 109], [0, 300],
                            [300, -float("inf")], [400, 0]])
        self.assertTrue(pdf._same_data(dat_inf, dat_1))
def testXlsSimple(self):
    """Round trip diet and netflow PanDats through xlsx, including shrunk schemas."""
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    file_path = os.path.join(_scratchDir, "diet.xlsx")
    pdf.xls.write_file(panDat, file_path)
    xlsPanDat = pdf.xls.create_pan_dat(file_path)
    self.assertTrue(pdf._same_data(panDat, xlsPanDat))
    # a factory missing one table can still read the wider file
    pdf_shrunk = PanDatFactory(**{k: v for k, v in dietSchema().items()
                                  if k != "nutritionQuantities"})
    self.assertTrue(len(pdf_shrunk.all_tables) == len(pdf.all_tables) - 1)
    xlsPanDatShrunk = pdf_shrunk.xls.create_pan_dat(file_path)
    self.assertTrue(pdf_shrunk._same_data(panDat, xlsPanDatShrunk))
    filePathShrunk = os.path.join(_scratchDir, "diet_shrunk.xlsx")
    self.assertTrue(self.firesException(
        lambda: pdf.xls.create_pan_dat(filePathShrunk)))
    pdf_shrunk.xls.write_file(panDat, filePathShrunk)
    xlsPanDatShrunk = pdf.xls.create_pan_dat(filePathShrunk)
    self.assertTrue(pdf_shrunk._same_data(panDat, xlsPanDatShrunk))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    pdf2.xls.write_file(panDat, file_path)
    xlsPanDat = pdf2.xls.create_pan_dat(file_path)
    self.assertTrue(pdf._same_data(panDat, xlsPanDat))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(netflowData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    file_path = os.path.join(_scratchDir, "netflow.xlsx")
    pdf.xls.write_file(panDat, file_path)
    panDat2 = pdf.xls.create_pan_dat(file_path)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    xlsPanDat = pdf2.xls.create_pan_dat(file_path)
    self.assertTrue(pdf._same_data(panDat, xlsPanDat))
def testCsvSimple(self):
    """Round trip diet and netflow PanDats through csv directories, incl. decimal=','."""
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    dir_path = os.path.join(_scratchDir, "diet_csv")
    pdf.csv.write_directory(panDat, dir_path)
    panDat2 = pdf.csv.create_pan_dat(dir_path)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    panDat2 = pdf2.csv.create_pan_dat(dir_path)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(netflowData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    dir_path = os.path.join(_scratchDir, "netflow_csv")
    pdf.csv.write_directory(panDat, dir_path)
    panDat2 = pdf.csv.create_pan_dat(dir_path)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    pdf2.csv.write_directory(panDat, dir_path)
    panDat2 = pdf2.csv.create_pan_dat(dir_path)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    # the decimal separator must match between write and read
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    dir_path = os.path.join(_scratchDir, "diet_csv")
    pdf.csv.write_directory(panDat, dir_path, decimal=",")
    panDat2 = pdf.csv.create_pan_dat(dir_path)
    self.assertFalse(pdf._same_data(panDat, panDat2))
    panDat2 = pdf.csv.create_pan_dat(dir_path, decimal=",")
    self.assertTrue(pdf._same_data(panDat, panDat2))
def testFindDups(self):
    """find_duplicates with the as_table and keep variations."""
    pdf = PanDatFactory(**sillyMeSchema())
    tdf = TicDatFactory(**{k: [[], list(pkfs) + list(dfs)]
                           for k, (pkfs, dfs) in sillyMeSchema().items()})
    rows = [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)]
    ticDat = tdf.TicDat(**{t: rows for t in tdf.all_tables})
    panDat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticDat))
    dups = pdf.find_duplicates(panDat)
    self.assertTrue(set(dups) == {'a'} and set(dups['a']['aField']) == {1})
    # keep=False marks every member of a duplicate group
    dups = pdf.find_duplicates(panDat, as_table=False, keep=False)
    self.assertTrue(set(dups) == {'a'} and dups['a'].value_counts()[True] == 2)
    dups = pdf.find_duplicates(panDat, as_table=False)
    self.assertTrue(set(dups) == {'a'} and dups['a'].value_counts()[True] == 1)
    rows = [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 2, 3, 40)]
    ticDat = tdf.TicDat(**{t: rows for t in tdf.all_tables})
    panDat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticDat))
    dups = pdf.find_duplicates(panDat, keep=False)
    self.assertTrue(set(dups) == {'a', 'b'} and set(dups['a']['aField']) == {1})
    dups = pdf.find_duplicates(panDat, as_table=False, keep=False)
    self.assertTrue({k: v.value_counts()[True] for k, v in dups.items()}
                    == {'a': 3, 'b': 2})
def test_parameters(self):
    """Parameters tables round trip through all four I/O backends."""
    core_path = os.path.join(_scratchDir, "parameters")
    pdf = PanDatFactory(parameters=[["Key"], ["Value"]])
    pdf.add_parameter("Something", 100)
    pdf.add_parameter("Different", 'boo', strings_allowed='*', number_allowed=False)
    dat = TicDatFactory(**pdf.schema()).TicDat(
        parameters=[["Something", float("inf")], ["Different", "inf"]])
    dat = TicDatFactory(**pdf.schema()).copy_to_pandas(dat, drop_pk_columns=False)
    for attr, path in [["sql", core_path + ".db"], ["csv", core_path + "_csv"],
                       ["json", core_path + ".json"], ["xls", core_path + ".xlsx"]]:
        func = "write_directory" if attr == "csv" else "write_file"
        getattr(getattr(pdf, attr), func)(dat, path)
        dat_1 = getattr(pdf, attr).create_pan_dat(path)
        self.assertTrue(pdf._same_data(dat, dat_1))
    # "05701" must stay a string parameter and not get coerced to a number
    core_path = os.path.join(_scratchDir, "parameters_two")
    dat = TicDatFactory(**pdf.schema()).TicDat(
        parameters=[["Something", float("inf")], ["Different", "05701"]])
    dat = TicDatFactory(**pdf.schema()).copy_to_pandas(dat, drop_pk_columns=False)
    for attr, path in [["sql", core_path + ".db"], ["csv", core_path + "_csv"],
                       ["xls", core_path + ".xlsx"], ["json", core_path + ".json"]]:
        func = "write_directory" if attr == "csv" else "write_file"
        getattr(getattr(pdf, attr), func)(dat, path)
        dat_1 = getattr(pdf, attr).create_pan_dat(path)
        self.assertTrue(pdf._same_data(dat, dat_1))
def testDietCleaningOpalytics(self):
    """A False _active column filters its row, cascading through foreign keys."""
    sch = dietSchema()
    sch["categories"][-1].append("_active")
    tdf1 = TicDatFactory(**dietSchema())
    tdf2 = TicDatFactory(**sch)
    ticDat2 = tdf2.copy_tic_dat(dietData())
    for v in ticDat2.categories.values():
        v["_active"] = True
    ticDat2.categories["fat"]["_active"] = False
    ticDat1 = tdf1.copy_tic_dat(dietData())
    input_set = create_inputset_mock_with_active_hack(tdf2, ticDat2)
    pdf1 = PanDatFactory(**tdf1.schema())
    # raw_data=True ignores the _active flag entirely
    panDat = pdf1.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf1._same_data(pdf1.copy_to_tic_dat(panDat), ticDat1))
    panDatPurged = pdf1.opalytics.create_pan_dat(input_set)
    self.assertFalse(tdf1._same_data(pdf1.copy_to_tic_dat(panDatPurged), ticDat1))
    # dropping "fat" (and its fk-dependent rows) reproduces the purged data
    ticDat1.categories.pop("fat")
    tdf1.remove_foreign_key_failures(ticDat1)
    self.assertTrue(tdf1._same_data(pdf1.copy_to_tic_dat(panDatPurged), ticDat1))
def testDietCleaningOpalyticsTwo(self):
    """Data-type cleaning (maxNutrition min=66) plus fk cascade when not raw_data."""
    tdf = TicDatFactory(**dietSchema())
    addDietForeignKeys(tdf)
    tdf.set_data_type("categories", "maxNutrition", min=66, inclusive_max=True)
    ticDat = tdf.copy_tic_dat(dietData())
    input_set = create_inputset_mock(tdf, ticDat)
    pdf = PanDatFactory(**dietSchema())
    addDietForeignKeys(pdf)
    pdf.set_data_type("categories", "maxNutrition", min=66, inclusive_max=True)
    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))
    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    # removing "fat" alone is not enough -- dependent rows must also go
    ticDat.categories.pop("fat")
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    tdf.remove_foreign_key_failures(ticDat)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
def testDietCleaningOpalytisThree(self):
    """Row-predicate cleaning plus fk cascade when not raw_data.

    NOTE(review): "Opalytis" looks like a typo for "Opalytics"; the name is kept
    as-is so unittest discovery is unaffected.
    """
    tdf = TicDatFactory(**dietSchema())
    tdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= 66)
    addDietForeignKeys(tdf)
    ticDat = tdf.copy_tic_dat(dietData())
    pdf = PanDatFactory(**tdf.schema())
    pdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= 66)
    addDietForeignKeys(pdf)
    input_set = create_inputset_mock(tdf, ticDat)
    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))
    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    # removing "fat" alone is not enough -- dependent rows must also go
    ticDat.categories.pop("fat")
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    tdf.remove_foreign_key_failures(ticDat)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
def testDupsOpalytics(self):
    """Duplicate rows survive raw_data=True and are purged otherwise."""
    if not self.can_run:
        return
    for hack in [True, False]:
        tdf = TicDatFactory(one=[["a"], ["b", "c"]],
                            two=[["a", "b"], ["c"]],
                            three=[["a", "b", "c"], []])
        tdf2 = TicDatFactory(**{t: [[], ["a", "b", "c"]] for t in tdf.all_tables})
        td = tdf2.TicDat(**{t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3],
                                [1, 2, 2], ["new", 1, 2]]
                            for t in tdf.all_tables})
        inputset = create_inputset_mock(tdf2, td, hack)
        pdf = PanDatFactory(**tdf.schema())
        panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=True)
        self.assertTrue(all(len(getattr(panDat, t)) == 6 for t in tdf.all_tables))
        panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=False)
        self.assertTrue(all(len(getattr(panDat, t)) < 6 for t in tdf.all_tables))
        # the purged data keeps one representative per key
        td_1 = tdf.TicDat(**{t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3],
                                 [1, 2, 2], ["new", 1, 2]]
                             for t in tdf.all_tables})
        td_2 = pdf.copy_to_tic_dat(panDat)
        self.assertTrue(all(set(getattr(td_1, t)) == set(getattr(td_2, t))
                            for t in tdf.all_tables))
def testSillyCleaningOpalyticsOne(self):
    """Data-type cleaning on the silly schema purges offending rows of table c."""
    tdf = TicDatFactory(**sillyMeSchema())
    tdf.set_data_type("c", "cData4", number_allowed=False, strings_allowed=['d'])
    ticDat = tdf.TicDat(**sillyMeData())
    input_set = create_inputset_mock(tdf, ticDat)
    pdf = PanDatFactory(**sillyMeSchema())
    pdf.set_data_type("c", "cData4", number_allowed=False, strings_allowed=['d'])
    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))
    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    # drop the last and first rows of c -- the ones failing the data type check
    ticDat.c.pop()
    ticDat.c.pop(0)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
def test_data_row_max_failures(self):
    """find_data_row_failures honors max_failures across tables.

    Fix: the predicate lambdas were closing over the loop variable ``dt``
    (late binding); they now bind it as a default argument. With the current
    one-field-per-table schema both forms behave the same, but the bound form
    is robust if a second field is ever added.
    """
    pdf = PanDatFactory(table_one=[["Field"], []], table_two=[[], ["Field"]])
    for t in ["table_one", "table_two"]:
        pdf.set_data_type(t, "Field")
    for table, dts in pdf.data_types.items():
        for field, dt in dts.items():
            if table == "table_one":
                # straight pass/fail predicate
                pdf.add_data_row_predicate(
                    table, lambda row, dt=dt: dt.valid_data(row["Field"]))
            else:
                # deliberately inverted predicate returning an error string
                pdf.add_data_row_predicate(
                    table,
                    lambda row, dt=dt: True if not dt.valid_data(row["Field"]) else "Oops",
                    predicate_failure_response="Error Message")
    dat = pdf.PanDat(
        table_one=DataFrame({"Field": list(range(1, 11)) + [-_ for _ in range(1, 11)]}),
        table_two=DataFrame({"Field": [10.1] * 10 + [-2] * 10}))
    errs = pdf.find_data_row_failures(dat)
    self.assertTrue(len(errs) == 2 and all(len(_) == 10 for _ in errs.values()))
    # max_failures caps the total count of reported failures across tables
    errs = pdf.find_data_row_failures(dat, max_failures=11)
    self.assertTrue(len(errs) == 2)
    self.assertTrue(any(len(_) == 10 for _ in errs.values()) and
                    any(len(_) == 1 for _ in errs.values()))
    errs = pdf.find_data_row_failures(dat, max_failures=10)
    self.assertTrue(len(errs) == 1 and all(len(_) == 10 for _ in errs.values()))
    errs = pdf.find_data_row_failures(dat, max_failures=9)
    self.assertTrue(len(errs) == 1 and all(len(_) == 9 for _ in errs.values()))
def test_nullables(self):
    """Null values in a nullable string field survive every I/O format."""
    core_path = os.path.join(_scratchDir, "nullables")
    pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]])
    pdf.set_data_type("table_with_stuffs", "field one")
    pdf.set_data_type("table_with_stuffs", "field two", number_allowed=False,
                      strings_allowed='*', nullable=True)
    tdf = TicDatFactory(**pdf.schema())
    tic_dat = tdf.TicDat(
        table_with_stuffs=[[101, "022"], [202, None], [303, "111"]])
    dat = tdf.copy_to_pandas(tic_dat, drop_pk_columns=False)
    self.assertFalse(pdf.find_data_type_failures(dat))
    format_paths = [["csv", core_path + "_csv"], ["xls", core_path + ".xlsx"],
                    ["sql", core_path + ".db"], ["json", core_path + ".json"]]
    for attr, path in format_paths:
        f_or_d = "directory" if attr == "csv" else "file"
        write_func, write_kwargs = utils._get_write_function_and_kwargs(
            pdf, path, f_or_d)
        write_func(dat, path, **write_kwargs)
        reread = utils._get_dat_object(pdf, "create_pan_dat", path, f_or_d,
                                       False)
        # the None round-trips as a null, so compare with nan==nan semantics
        self.assertTrue(
            pdf._same_data(dat, reread, nans_are_same_for_data_rows=True))
def perform_predicate_checks(sch):
    # Exercise boolean-style predicates against "Error Message"-style
    # predicates on the same schema: both must flag exactly the same
    # (table, predicate) cells for the enclosing-scope `pandat`.
    pdf = PanDatFactory(**sch)
    # cost must be an actual, non-NaN number
    pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and
                               not isnan(row["cost"]), "cost")
    good_qty = lambda qty: 5 < qty <= 12
    pdf.add_data_row_predicate("nutritionQuantities",
                               lambda row: good_qty(row["qty"]), "qty")
    pdf.add_data_row_predicate(
        "categories",
        lambda row: row["maxNutrition"] >= row["minNutrition"], "minmax")
    # pdf2 mirrors pdf, but each predicate returns True on success and a
    # "<name> failed!" string on failure.
    pdf2 = PanDatFactory(**sch)
    def make_error_message_predicate(f, name):
        # Factory (rather than a bare closure over loop variables) so each
        # wrapper binds its own f/name pair.
        def error_message_predicate(row):
            rtn = f(row)
            if rtn:
                return True
            return f"{name} failed!"
        return error_message_predicate
    for t, preds in pdf._data_row_predicates.items():
        for p_name, rpi in preds.items():
            pdf2.add_data_row_predicate(
                t, make_error_message_predicate(rpi.predicate, p_name),
                predicate_name=p_name,
                predicate_failure_response="Error Message")
    failed = pdf.find_data_row_failures(pandat)
    failed2 = pdf2.find_data_row_failures(pandat)
    # both styles report the same failing (table, predicate) cells...
    self.assertTrue(set(failed) == set(failed2) ==
                    {('foods', 'cost'), ('nutritionQuantities', 'qty'),
                     ('categories', 'minmax')})
    self.assertTrue(set(failed['foods', 'cost']["name"]) ==
                    set(failed2['foods', 'cost']["name"]) == {'b'})
    # ...and the same failing rows within each cell
    for f in [failed, failed2]:
        self.assertTrue(set({(v["food"], v["category"])
                             for v in f['nutritionQuantities',
                                        'qty'].T.to_dict().values()}) ==
                        {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
        self.assertTrue(set(f['categories', 'minmax']["name"]) == {'2'})
    # the Error Message flavor carries the per-predicate message text
    for t, n in failed2:
        self.assertTrue(set(failed2[t, n]["Error Message"]) == {f'{n} failed!'})
    for _pdf in [pdf, pdf2]:
        # as_table=False: per-row truth values; exactly 4 qty rows fail
        failed = _pdf.find_data_row_failures(pandat, as_table=False)
        self.assertTrue(
            4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
        # pandat_2 makes the minmax predicate raise (int >= str comparison,
        # per the message asserted below); by default that propagates...
        ex = []
        try:
            _pdf.find_data_row_failures(pandat_2)
        except Exception as e:
            ex[:] = [str(e.__class__)]
        self.assertTrue("TypeError" in ex[0])
        # ...but "Handled as Failure" turns the raising row into a failure
        failed = _pdf.find_data_row_failures(
            pandat_2, exception_handling="Handled as Failure")
        self.assertTrue(set(failed['categories', 'minmax']["name"]) ==
                        {'2', '3'})
    # the Error Message flavor records the exception text itself
    failed = pdf2.find_data_row_failures(
        pandat_2, exception_handling="Handled as Failure")
    df = failed['categories', 'minmax']
    err_str = list(df[df['name'] == '3']["Error Message"])[0]
    self.assertTrue(err_str ==
        "Exception<'>=' not supported between instances of 'int' and 'str'>")
def test_missing_tables(self):
    """A reading schema with an extra, never-written table still loads."""
    core_path = os.path.join(_scratchDir, "missing_tables")
    pdf_1 = PanDatFactory(this=[["Something"], ["Another"]])
    # pdf_2 knows about one additional table that is absent on disk
    pdf_2 = PanDatFactory(**dict(pdf_1.schema(), that=[["What", "Ever"], []]))
    dat = pdf_1.PanDat(this={"Something": ["a", "b", "c"],
                             "Another": [2, 3, 5]})
    for attr, path in [["sql", core_path + ".db"],
                       ["csv", core_path + "_csv"],
                       ["json", core_path + ".json"],
                       ["xls", core_path + ".xlsx"]]:
        writer = "write_directory" if attr == "csv" else "write_file"
        getattr(getattr(pdf_1, attr), writer)(dat, path)
        round_trip = getattr(pdf_2, attr).create_pan_dat(path)
        self.assertTrue(pdf_1._same_data(dat, round_trip))
def make_pdf():
    # Build a factory with several parameter flavors, then rebuild it from
    # its own full schema so the parameters themselves round-trip.
    factory = PanDatFactory(data_table=[["a"], ["b", "c"]],
                            parameters=[["a"], ["b"]])
    factory.add_parameter("Something", 100, max=100, inclusive_max=True)
    factory.add_parameter("Another thing", 5, must_be_int=True)
    factory.add_parameter("Untyped thing", "whatever",
                          enforce_type_rules=False)
    factory.add_parameter("Last", 'boo', number_allowed=False,
                          strings_allowed='*')
    return PanDatFactory.create_from_full_schema(factory.schema(True))
def test_datetime(self):
    """Datetime-typed fields and parameters survive every I/O format."""
    core_path = os.path.join(_scratchDir, "parameters")
    pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                        parameters=[["a"], ["b"]])
    pdf.add_parameter("p1", "Dec 15 1970", datetime=True)
    pdf.add_parameter("p2", None, datetime=True, nullable=True)
    pdf.set_data_type("table_with_stuffs", "field one", datetime=True)
    pdf.set_data_type("table_with_stuffs", "field two", datetime=True,
                      nullable=True)
    tdf = TicDatFactory(**pdf.schema())
    tic_dat = tdf.TicDat(
        table_with_stuffs=[[dateutil.parser.parse("July 11 1972"), None],
                           [datetime.datetime.now(),
                            dateutil.parser.parse("Sept 11 2011")]],
        parameters=[["p1", "7/11/1911"], ["p2", None]])
    pan_dat = tdf.copy_to_pandas(tic_dat, drop_pk_columns=False)
    self.assertFalse(pdf.find_data_type_failures(pan_dat) or
                     pdf.find_data_row_failures(pan_dat))
    for attr, path in [["csv", core_path + "_csv"],
                       ["xls", core_path + ".xlsx"],
                       ["sql", core_path + ".db"],
                       ["json", core_path + ".json"]]:
        writer = "write_directory" if attr == "csv" else "write_file"
        getattr(getattr(pdf, attr), writer)(pan_dat, path)
        reloaded = getattr(pdf, attr).create_pan_dat(path)
        # the reloaded frames don't compare equal to the originals --
        # presumably datetime representation differences; verify
        self.assertFalse(pdf._same_data(pan_dat, reloaded))
        # ...but everything still passes the datetime type/row checks
        self.assertFalse(pdf.find_data_type_failures(reloaded) or
                         pdf.find_data_row_failures(reloaded))
        reloaded = pdf.copy_to_tic_dat(reloaded)
        self.assertTrue(set(reloaded.parameters) == {'p1', 'p2'})
        # p1 came back as a real, non-null datetime; p2 stayed null
        self.assertTrue(isinstance(reloaded.parameters["p1"]["b"],
                                   (datetime.datetime, numpy.datetime64))
                        and not pd.isnull(reloaded.parameters["p1"]["b"]))
        self.assertTrue(pd.isnull(reloaded.parameters["p2"]["b"]))
        # every primary key is a real, non-null datetime
        self.assertTrue(all(isinstance(_, (datetime.datetime,
                                           numpy.datetime64))
                            and not pd.isnull(_)
                            for _ in reloaded.table_with_stuffs))
        # data values are datetimes, None, or NaN...
        self.assertTrue(all(isinstance(_, (datetime.datetime,
                                           numpy.datetime64))
                            or _ is None
                            or utils.safe_apply(math.isnan)(_)
                            for v in reloaded.table_with_stuffs.values()
                            for _ in v.values()))
        # ...and both null and non-null data values are present
        self.assertTrue({pd.isnull(_)
                         for v in reloaded.table_with_stuffs.values()
                         for _ in v.values()} == {True, False})
def testNetflowOpalytics(self):
    """Netflow data round-trips through the opalytics reader for every
    combination of the inputset hack and the raw_data flag."""
    if not self.can_run:
        return
    for use_hack, raw in itertools.product([True, False], repeat=2):
        tdf = TicDatFactory(**netflowSchema())
        tic_dat = tdf.copy_tic_dat(netflowData())
        input_set = create_inputset_mock(tdf, tic_dat, use_hack)
        pdf = PanDatFactory(**tdf.schema())
        pan_dat = pdf.opalytics.create_pan_dat(input_set, raw_data=raw)
        self.assertTrue(tdf._same_data(tic_dat, pdf.copy_to_tic_dat(pan_dat)))
        # add a node with no data fields and round trip again
        tic_dat.nodes[12] = {}
        input_set = create_inputset_mock(tdf, tic_dat, use_hack)
        pdf = PanDatFactory(**tdf.schema())
        pan_dat = pdf.opalytics.create_pan_dat(input_set, raw_data=raw)
        self.assertTrue(tdf._same_data(tic_dat, pdf.copy_to_tic_dat(pan_dat)))
def testCsvSpacey(self):
    """CSV round trips with case/space table names and a custom separator."""
    if not self.can_run:
        return
    self.assertTrue(pandatio.sql, "this unit test requires SQLite installed")
    tdf = TicDatFactory(**spacesSchema())
    pdf = PanDatFactory(**spacesSchema())
    ticDat = tdf.TicDat(
        a_table={1: [1, 2, "3"],
                 22.2: (12, 0.12, "something"),
                 0.23: (11, 12, "thirt")},
        b_table={(1, 2, "foo"): 1, (1012.22, 4, "0012"): 12},
        c_table=(("this", 2, 3, 4), ("that", 102.212, 3, 5.5),
                 ("another", 5, 12.5, 24)))
    panDat = pan_dat_maker(spacesSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "spaces_2_csv")
    # write with "cased spaced" table names, then read them back
    pdf.csv.write_directory(panDat, dirPath, case_space_table_names=True)
    self.assertTrue(pdf._same_data(panDat, pdf.csv.create_pan_dat(dirPath)))
    # netflow data, written and read with a ":" separator
    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(
        **{t: getattr(netflowData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "spaces_2_2_csv")
    pdf.csv.write_directory(panDat, dirPath, case_space_table_names=True,
                            sep=":")
    self.assertTrue(
        pdf._same_data(panDat, pdf.csv.create_pan_dat(dirPath, sep=":")))