def test_parameters(self):
    """Round-trip a parameters table through every supported format.

    Covers float("inf") values plus numeric-looking strings ("inf",
    "05701") that must survive serialization without coercion.
    """
    pdf = PanDatFactory(parameters=[["Key"], ["Value"]])
    pdf.add_parameter("Something", 100)
    pdf.add_parameter("Different", 'boo', strings_allowed='*',
                      number_allowed=False)

    def make_dat(different_value):
        # build a TicDat and convert it to a PanDat, keeping PK columns
        tic = TicDatFactory(**pdf.schema()).TicDat(
            parameters=[["Something", float("inf")],
                        ["Different", different_value]])
        return TicDatFactory(**pdf.schema()).copy_to_pandas(
            tic, drop_pk_columns=False)

    def round_trip(pan_dat, io_pairs):
        # write with each backend, read back, and assert equality
        for backend, target in io_pairs:
            writer = "write_directory" if backend == "csv" else "write_file"
            getattr(getattr(pdf, backend), writer)(pan_dat, target)
            read_back = getattr(pdf, backend).create_pan_dat(target)
            self.assertTrue(pdf._same_data(pan_dat, read_back))

    base = os.path.join(_scratchDir, "parameters")
    round_trip(make_dat("inf"),
               [["sql", base + ".db"], ["csv", base + "_csv"],
                ["json", base + ".json"], ["xls", base + ".xlsx"]])
    base = os.path.join(_scratchDir, "parameters_two")
    round_trip(make_dat("05701"),
               [["sql", base + ".db"], ["csv", base + "_csv"],
                ["xls", base + ".xlsx"], ["json", base + ".json"]])
def test_nullables(self):
    """A nullable string field (holding None) survives every format."""
    base = os.path.join(_scratchDir, "nullables")
    pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]])
    pdf.set_data_type("table_with_stuffs", "field one")
    pdf.set_data_type("table_with_stuffs", "field two",
                      number_allowed=False, strings_allowed='*',
                      nullable=True)
    tic = TicDatFactory(**pdf.schema()).TicDat(
        table_with_stuffs=[[101, "022"], [202, None], [303, "111"]])
    dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
        tic, drop_pk_columns=False)
    self.assertFalse(pdf.find_data_type_failures(dat))
    targets = [["csv", base + "_csv"], ["xls", base + ".xlsx"],
               ["sql", base + ".db"], ["json", base + ".json"]]
    for backend, path in targets:
        # csv backends write a directory, everything else a single file
        f_or_d = "directory" if backend == "csv" else "file"
        write_func, write_kwargs = utils._get_write_function_and_kwargs(
            pdf, path, f_or_d)
        write_func(dat, path, **write_kwargs)
        round_tripped = utils._get_dat_object(pdf, "create_pan_dat", path,
                                              f_or_d, False)
        # None becomes NaN through pandas, hence the nan-tolerant compare
        self.assertTrue(pdf._same_data(dat, round_tripped,
                                       nans_are_same_for_data_rows=True))
def testXToManyTwo(self):
    # Three child tables all foreign-keyed to the same compound-PK parent.
    # child_one links via its full PK, child_two via PK + data field, and
    # child_three via data fields only, so the FK cardinalities span
    # one-to-one and many-to-one.
    input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                 child_one=[["F1", "F2", "F3"], []],
                                 child_two=[["F1", "F2"], ["F3"]],
                                 child_three=[[], ["F1", "F2", "F3"]])
    for t in ["child_one", "child_two", "child_three"]:
        input_schema.add_foreign_key(t, "parent",
                                     [["F1"] * 2, ["F2"] * 2, ["F3"] * 2])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} ==
                    {"one-to-one", "many-to-one"})
    rows = [[1, 2, 3], [1, 2.1, 3], [4, 5, 6], [4, 5.1, 6], [7, 8, 9]]
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(parent=rows, child_one=rows, child_two=rows,
                     child_three=rows)
    self.assertTrue(
        all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
    orig_pan_dat = input_schema.copy_pan_dat(
        copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    # inject one orphan row per child table -> three failing FK entries
    dat.child_one[1, 2, 4] = {}
    dat.child_two[1, 2.2] = 3
    dat.child_three.append([1, 2, 4])
    new_pan_dat = input_schema.copy_pan_dat(
        copy_to_pandas_with_reset(tdf, dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    self.assertTrue(len(fk_fails) == 3)
    # removing the failing rows restores the original data exactly
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
    # repeat the exercise with FKs that map only the F1/F3 field pair
    input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                 child_one=[["F1", "F2", "F3"], []],
                                 child_two=[["F1", "F2"], ["F3"]],
                                 child_three=[[], ["F1", "F2", "F3"]])
    for t in ["child_one", "child_two", "child_three"]:
        input_schema.add_foreign_key(t, "parent", [["F1"] * 2, ["F3"] * 2])
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(parent=rows, child_one=rows, child_two=rows,
                     child_three=rows)
    self.assertTrue(
        all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
    orig_pan_dat = input_schema.copy_pan_dat(
        copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    dat.child_one[1, 2, 4] = {}
    dat.child_two[1, 2.2] = 4
    dat.child_three.append([1, 2, 4])
    new_pan_dat = input_schema.copy_pan_dat(
        copy_to_pandas_with_reset(tdf, dat))
    self.assertTrue(
        len(input_schema.find_foreign_key_failures(new_pan_dat)) == 3)
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
def test_datetime(self):
    # Datetime-typed fields and parameters round-trip through every
    # format. Readers hand back pandas/numpy datetime flavors, so
    # _same_data is expected to FAIL while type/row validation passes.
    core_path = os.path.join(_scratchDir, "parameters")
    pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                        parameters=[["a"], ["b"]])
    pdf.add_parameter("p1", "Dec 15 1970", datetime=True)
    pdf.add_parameter("p2", None, datetime=True, nullable=True)
    pdf.set_data_type("table_with_stuffs", "field one", datetime=True)
    pdf.set_data_type("table_with_stuffs", "field two", datetime=True,
                      nullable=True)
    dat = TicDatFactory(**pdf.schema()).TicDat(
        table_with_stuffs=[[dateutil.parser.parse("July 11 1972"), None],
                           [datetime.datetime.now(),
                            dateutil.parser.parse("Sept 11 2011")]],
        parameters=[["p1", "7/11/1911"], ["p2", None]])
    dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
        dat, drop_pk_columns=False)
    self.assertFalse(
        pdf.find_data_type_failures(dat) or pdf.find_data_row_failures(dat))
    for attr, path in [["csv", core_path + "_csv"],
                       ["xls", core_path + ".xlsx"],
                       ["sql", core_path + ".db"],
                       ["json", core_path + ".json"]]:
        func = "write_directory" if attr == "csv" else "write_file"
        getattr(getattr(pdf, attr), func)(dat, path)
        dat_1 = getattr(pdf, attr).create_pan_dat(path)
        # datetimes come back as a different (but still valid) flavor
        self.assertFalse(pdf._same_data(dat, dat_1))
        self.assertFalse(
            pdf.find_data_type_failures(dat_1) or
            pdf.find_data_row_failures(dat_1))
        dat_1 = pdf.copy_to_tic_dat(dat_1)
        self.assertTrue(set(dat_1.parameters) == {'p1', 'p2'})
        self.assertTrue(
            isinstance(dat_1.parameters["p1"]["b"],
                       (datetime.datetime, numpy.datetime64)) and
            not pd.isnull(dat_1.parameters["p1"]["b"]))
        self.assertTrue(pd.isnull(dat_1.parameters["p2"]["b"]))
        # iterating a TicDat table yields its primary-key values, i.e.
        # the "field one" datetimes here
        self.assertTrue(
            all(
                isinstance(_, (datetime.datetime, numpy.datetime64)) and
                not pd.isnull(_) for _ in dat_1.table_with_stuffs))
        # "field two" values are datetimes, None, or NaN (nullable)
        self.assertTrue(
            all(
                isinstance(_, (datetime.datetime, numpy.datetime64)) or
                _ is None or utils.safe_apply(math.isnan)(_)
                for v in dat_1.table_with_stuffs.values()
                for _ in v.values()))
        # both null and non-null data values must be present
        self.assertTrue({
            pd.isnull(_)
            for v in dat_1.table_with_stuffs.values() for _ in v.values()
        } == {True, False})
def testDietWithInfFlagging(self):
    """set_infinity_io_flag(999999999) maps inf to the sentinel on write
    and back to inf on read -- but only for factories carrying the flag.
    """
    diet_pdf = PanDatFactory(**dietSchema())
    addDietDataTypes(diet_pdf)
    tdf = TicDatFactory(**dietSchema())
    dat = tdf.copy_to_pandas(tdf.copy_tic_dat(dietData()),
                             drop_pk_columns=False)
    diet_pdf.set_infinity_io_flag(999999999)
    base = os.path.join(_scratchDir, "diet_with_inf_flagging")
    targets = [["sql", base + ".db"], ["csv", base + "_csv"],
               ["json", base + ".json"], ["xls", base + ".xlsx"]]
    diet_pdf.sql.write_file(dat, targets[0][1])
    diet_pdf.csv.write_directory(dat, targets[1][1])
    diet_pdf.json.write_file(dat, targets[2][1])
    diet_pdf.xls.write_file(dat, targets[3][1])
    for backend, path in targets:
        # the flag-carrying factory restores inf on read ...
        self.assertTrue(diet_pdf._same_data(
            dat, getattr(diet_pdf, backend).create_pan_dat(path),
            epsilon=1e-5))
        # ... and so does its clone
        cloned = diet_pdf.clone()
        self.assertTrue(cloned._same_data(
            dat, getattr(cloned, backend).create_pan_dat(path),
            epsilon=1e-5))
        # a plain factory reads the sentinel value literally
        plain = PanDatFactory(**diet_pdf.schema())
        read_back = getattr(plain, backend).create_pan_dat(path)
        self.assertFalse(plain._same_data(dat, read_back, epsilon=1e-5))
        protein = read_back.categories["name"] == "protein"
        self.assertTrue(
            list(read_back.categories[protein]["maxNutrition"])[0] ==
            999999999)
        # patching the sentinel back to inf recovers equality
        read_back.categories.loc[protein, "maxNutrition"] = float("inf")
        self.assertTrue(plain._same_data(dat, read_back, epsilon=1e-5))
def make_pdf():
    """Build a PanDatFactory with a mix of parameter flavors, then rebuild
    it from its own full schema (exercising create_from_full_schema)."""
    factory = PanDatFactory(data_table=[["a"], ["b", "c"]],
                            parameters=[["a"], ["b"]])
    factory.add_parameter("Something", 100, max=100, inclusive_max=True)
    factory.add_parameter("Another thing", 5, must_be_int=True)
    factory.add_parameter("Untyped thing", "whatever",
                          enforce_type_rules=False)
    factory.add_parameter("Last", 'boo', number_allowed=False,
                          strings_allowed='*')
    return PanDatFactory.create_from_full_schema(factory.schema(True))
def testIssue45(self):
    # Issue 45: without an explicit string data type, pandas coerces
    # numeric-looking strings ("022", "0044") to numbers on read, so
    # string data written to xlsx/csv comes back numeric.
    pdf = PanDatFactory(data=[["a"], ["b"]])
    tdf = TicDatFactory(**pdf.schema())
    dat_nums = tdf.copy_to_pandas(
        tdf.TicDat(data=[[1, 2], [3, 4], [22, 44]]), drop_pk_columns=False)
    dat_strs = tdf.copy_to_pandas(
        tdf.TicDat(data=[["1", "2"], ["3", "4"], ["022", "0044"]]),
        drop_pk_columns=False)
    files = [
        os.path.join(_scratchDir, _)
        for _ in ["dat_nums.xlsx", "dat_strs.xlsx"]
    ]
    pdf.xls.write_file(dat_nums, files[0])
    pdf.xls.write_file(dat_strs, files[1])
    dat_nums_2, dat_strs_2 = [pdf.xls.create_pan_dat(_) for _ in files]
    self.assertTrue(pdf._same_data(dat_nums, dat_nums_2))
    # this is pandas pushing things to be numeric
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_nums, dat_strs_2))
    # with a strings-only data type on "a", that column stays a string
    pdf = PanDatFactory(data=[["a"], ["b"]])
    pdf.set_data_type("data", "a", number_allowed=False,
                      strings_allowed='*')
    dat_mixed = tdf.copy_to_pandas(
        tdf.TicDat(data=[["1", 2], ["3", 4], ["022", 44]]),
        drop_pk_columns=False)
    dat_nums_2, dat_strs_2 = [pdf.xls.create_pan_dat(_) for _ in files]
    self.assertFalse(pdf._same_data(dat_nums, dat_nums_2))
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertFalse(pdf._same_data(dat_nums_2, dat_mixed))
    self.assertTrue(pdf._same_data(dat_strs_2, dat_mixed))
    # same story for csv directories
    pdf = PanDatFactory(data=[["a"], ["b"]])
    csv_dirs = [
        os.path.join(_scratchDir, _)
        for _ in ["dat_nums_csv", "dat_strs_csv"]
    ]
    pdf.csv.write_directory(dat_nums, csv_dirs[0])
    pdf.csv.write_directory(dat_strs, csv_dirs[1])
    dat_nums_2, dat_strs_2 = [pdf.csv.create_pan_dat(_) for _ in csv_dirs]
    self.assertTrue(pdf._same_data(dat_nums, dat_nums_2))
    # this is pandas pushing things to be numeric
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_nums, dat_strs_2))
    pdf = PanDatFactory(data=[["a"], ["b"]])
    pdf.set_data_type("data", "a", number_allowed=False,
                      strings_allowed='*')
    dat_nums_2, dat_strs_2 = [pdf.csv.create_pan_dat(_) for _ in csv_dirs]
    self.assertFalse(pdf._same_data(dat_nums, dat_nums_2))
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertFalse(pdf._same_data(dat_nums_2, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_strs_2, dat_mixed))
def testAdditionalFKs(self):
    # pt3/pt4/pt5 each reference pt1 (via F1) and pt2 (via F2). pt5 has
    # no primary key at all, so its failing rows can only be dropped.
    pdf = PanDatFactory(pt1=[["F1"], []], pt2=[["F2"], []],
                        pt3=[["F1", "F2"], []], pt4=[["F1"], ["F2"]],
                        pt5=[[], ["F1", "F2"]])
    for c in ["pt3", "pt4", "pt5"]:
        pdf.add_foreign_key(c, "pt1", ["F1", "F1"])
        pdf.add_foreign_key(c, "pt2", ["F2", "F2"])
    tdf = TicDatFactory(**pdf.schema())

    def pan_dat_(_):
        # TicDat -> PanDat helper; also asserts the copy has no dups
        rtn = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, _))
        self.assertFalse(pdf.find_duplicates(rtn))
        return rtn

    ticDat = tdf.TicDat(pt1=[1, 2, 3, 4], pt2=[5, 6, 7, 8])
    for f1, f2 in itertools.product(range(1, 5), range(5, 9)):
        ticDat.pt3[f1, f2] = {}
        ticDat.pt4[f1] = f2
        ticDat.pt5.append((f1, f2))
    origDat = tdf.copy_tic_dat(ticDat, freeze_it=True)
    self.assertFalse(pdf.find_foreign_key_failures(pan_dat_(origDat)))
    # inject orphans into pt3/pt4 only
    ticDat.pt3["no", 6] = ticDat.pt3[1, "no"] = {}
    ticDat.pt4["no"] = 6
    ticDat.pt4["nono"] = 6.01
    panDat = pan_dat_(ticDat)
    fails1 = pdf.find_foreign_key_failures(panDat)
    self.assertTrue(fails1)
    # removal restores the original (good) data
    pdf.remove_foreign_key_failures(panDat)
    self.assertFalse(pdf.find_foreign_key_failures(panDat))
    self.assertTrue(pdf._same_data(panDat, pan_dat_(origDat)))
    orig_lens = {t: len(getattr(origDat, t)) for t in tdf.all_tables}
    # re-inject the same orphans, plus orphans in the keyless pt5
    ticDat.pt3["no", 6] = ticDat.pt3[1, "no"] = {}
    ticDat.pt4["no"] = 6
    ticDat.pt4["nono"] = 6.01
    ticDat.pt5.append(("no", 6))
    ticDat.pt5.append((1, "no"))
    panDat = pan_dat_(ticDat)
    fails2 = pdf.find_foreign_key_failures(panDat)
    # the second failure set strictly extends the first
    self.assertTrue(
        set(fails1) != set(fails2) and set(fails1).issubset(fails2))
    pdf.remove_foreign_key_failures(panDat)
    self.assertFalse(pdf.find_foreign_key_failures(panDat))
    self.assertTrue({t: len(getattr(panDat, t))
                     for t in tdf.all_tables} == orig_lens)
def test_missing_tables(self):
    """Reading with a schema containing an extra, absent table succeeds,
    and the data that IS present still compares equal."""
    base = os.path.join(_scratchDir, "missing_tables")
    pdf_1 = PanDatFactory(this=[["Something"], ["Another"]])
    pdf_2 = PanDatFactory(**dict(pdf_1.schema(),
                                 that=[["What", "Ever"], []]))
    dat = pdf_1.PanDat(this={"Something": ["a", "b", "c"],
                             "Another": [2, 3, 5]})
    targets = [["sql", base + ".db"], ["csv", base + "_csv"],
               ["json", base + ".json"], ["xls", base + ".xlsx"]]
    for backend, path in targets:
        writer = "write_directory" if backend == "csv" else "write_file"
        # write with the small schema, read with the larger one
        getattr(getattr(pdf_1, backend), writer)(dat, path)
        read_back = getattr(pdf_2, backend).create_pan_dat(path)
        self.assertTrue(pdf_1._same_data(dat, read_back))
def testBasicFKs(self):
    # Exercise find/remove_foreign_key_failures over a web of FKs, and
    # verify behavior is unchanged on cloned factories (plain clone via
    # True, clone restricted to all tables via "*", or no clone at all).
    for cloning in [True, False, "*"]:
        # NOTE: the lambda closes over tdf, which is (re)bound further
        # down in this loop body before clone_me_maybe is ever called.
        clone_me_maybe = lambda x : x.clone(tdf.all_tables if cloning == "*"
                                            else None) if cloning else x
        pdf = PanDatFactory(plants = [["name"], ["stuff", "otherstuff"]],
                            lines = [["name"], ["plant", "weird stuff"]],
                            line_descriptor = [["name"], ["booger"]],
                            products = [["name"],["gover"]],
                            production = [["line", "product"],
                                          ["min", "max"]],
                            pureTestingTable = [[], ["line", "plant",
                                                     "product",
                                                     "something"]],
                            extraProduction = [["line", "product"],
                                               ["extramin", "extramax"]],
                            weirdProduction = [["line1", "line2",
                                                "product"],
                                               ["weirdmin", "weirdmax"]])
        pdf.add_foreign_key("production", "lines", ("line", "name"))
        pdf.add_foreign_key("production", "products", ("product", "name"))
        pdf.add_foreign_key("lines", "plants", ("plant", "name"))
        pdf.add_foreign_key("line_descriptor", "lines", ("name", "name"))
        # pureTestingTable references lines/plants/products by naming
        # convention: field f points at table f+"s"
        for f in set(pdf.data_fields["pureTestingTable"]).difference({"something"}):
            pdf.add_foreign_key("pureTestingTable", "%ss"%f, (f,"name"))
        pdf.add_foreign_key("extraProduction", "production",
                            (("line", "line"), ("product","product")))
        pdf.add_foreign_key("weirdProduction", "production",
                            (("line1", "line"), ("product","product")))
        pdf.add_foreign_key("weirdProduction", "extraProduction",
                            (("line2","line"), ("product","product")))
        self._testPdfReproduction(pdf)
        pdf = clone_me_maybe(pdf)
        tdf = TicDatFactory(**pdf.schema())
        goodDat = tdf.TicDat()
        goodDat.plants["Cleveland"] = ["this", "that"]
        goodDat.plants["Newark"]["otherstuff"] =1
        goodDat.products["widgets"] = goodDat.products["gadgets"] = "shizzle"
        for i,p in enumerate(goodDat.plants):
            goodDat.lines[i]["plant"] = p
        # NOTE(review): pd here shadows the pandas alias within this loop
        for i,(pl, pd) in enumerate(itertools.product(goodDat.lines,
                                                      goodDat.products)):
            goodDat.production[pl, pd] = {"min":1, "max":10+i}
        badDat1 = tdf.copy_tic_dat(goodDat)
        # orphan production row: "notaline" is not in lines
        badDat1.production["notaline", "widgets"] = [0,1]
        badDat2 = tdf.copy_tic_dat(badDat1)

        def pan_dat_(_):
            # TicDat -> PanDat helper; also asserts the copy has no dups
            rtn = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, _))
            self.assertFalse(pdf.find_duplicates(rtn))
            return rtn

        fk, fkm = ForeignKey, ForeignKeyMapping
        fk_fails1 = pdf.find_foreign_key_failures(pan_dat_(badDat1))
        fk_fails2 = pdf.find_foreign_key_failures(pan_dat_(badDat2))
        self.assertTrue(set(fk_fails1) == set(fk_fails2) ==
                        {fk('production', 'lines', fkm('line', 'name'),
                            'many-to-one')})
        self.assertTrue(set(pdf.find_foreign_key_failures(pan_dat_(badDat1),
                                                          verbosity="Low"))
                        ==
                        set(pdf.find_foreign_key_failures(pan_dat_(badDat2),
                                                          verbosity="Low"))
                        == {('production', 'lines', ('line', 'name'))})
        for row_fails in [next(iter(_.values())) for _ in [fk_fails1,
                                                           fk_fails2]]:
            self.assertTrue(set(row_fails["line"]) == {"notaline"} and
                            set(row_fails["product"]) == {"widgets"})
        # adding the missing line shifts the failure one level up:
        # the new line now references a missing plant
        badDat1.lines["notaline"]["plant"] = \
            badDat2.lines["notaline"]["plant"] = "notnewark"
        fk_fails1 = pdf.find_foreign_key_failures(pan_dat_(badDat1))
        fk_fails2 = pdf.find_foreign_key_failures(pan_dat_(badDat2))
        self.assertTrue(set(fk_fails1) == set(fk_fails2) ==
                        {fk('lines', 'plants', fkm('plant', 'name'),
                            'many-to-one')})
        for row_fails in [next(iter(_.values())) for _ in [fk_fails1,
                                                           fk_fails2]]:
            self.assertTrue(set(row_fails["name"]) == {"notaline"} and
                            set(row_fails["plant"]) == {"notnewark"})
        # cascading removal restores the good data
        for bad in [badDat1, badDat2]:
            bad_pan = pdf.remove_foreign_key_failures(pan_dat_(bad))
            self.assertFalse(pdf.find_foreign_key_failures(bad_pan))
            self.assertTrue(pdf._same_data(bad_pan, pan_dat_(goodDat)))
        _ = len(goodDat.lines)
        for i,p in enumerate(list(goodDat.plants.keys()) +
                             list(goodDat.plants.keys())):
            goodDat.lines[i+_]["plant"] = p
        # NOTE(review): i is the leftover value from the loop above, so
        # i%2 is constant across this whole loop -- TODO confirm intent
        for l in goodDat.lines:
            if i%2:
                goodDat.line_descriptor[l] = i+10
        for i,(l,pl,pdct) in enumerate(sorted(itertools.product(
                goodDat.lines, goodDat.plants, goodDat.products))):
            goodDat.pureTestingTable.append((l,pl,pdct,i))
        self.assertFalse(pdf.find_foreign_key_failures(pan_dat_(goodDat)))
        badDat = tdf.copy_tic_dat(goodDat)
        # one junk row in the keyless table fails all three of its FKs
        badDat.pureTestingTable.append(("j", "u", "nk", "ay"))
        fk_fails = pdf.find_foreign_key_failures(pan_dat_(badDat))
        self.assertTrue(set(fk_fails) ==
                        {fk('pureTestingTable', 'plants',
                            fkm('plant', 'name'), 'many-to-one'),
                         fk('pureTestingTable', 'products',
                            fkm('product', 'name'), 'many-to-one'),
                         fk('pureTestingTable', 'lines',
                            fkm('line', 'name'), 'many-to-one')})
        for df in fk_fails.values():
            # each failure DataFrame holds exactly the junk row
            df = df.T
            c = df.columns[0]
            self.assertTrue({'ay', 'j', 'nk', 'u'} == set(df[c]))
def testXToMany(self):
    # many-to-many FKs: position_constraints references roster, positions
    # and innings through data (non-PK) fields on the parent side.
    input_schema = PanDatFactory (roster = [["Name"],
                                            ["Grade", "Arrival Inning",
                                             "Departure Inning",
                                             "Min Innings Played",
                                             "Max Innings Played"]],
                                  positions = [["Position"],
                                               ["Position Importance",
                                                "Position Group",
                                                "Consecutive Innings Only"]],
                                  innings = [["Inning"],["Inning Group"]],
                                  position_constraints = [["Position Group",
                                                           "Inning Group",
                                                           "Grade"],
                                                          ["Min Players",
                                                           "Max Players"]])
    input_schema.add_foreign_key("position_constraints", "roster",
                                 ["Grade", "Grade"])
    input_schema.add_foreign_key("position_constraints", "positions",
                                 ["Position Group", "Position Group"])
    input_schema.add_foreign_key("position_constraints", "innings",
                                 ["Inning Group", "Inning Group"])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} ==
                    {"many-to-many"})
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat()
    for i,p in enumerate(["bob", "joe", "fred", "alice", "lisa", "joean",
                          "ginny"]):
        dat.roster[p]["Grade"] = (i%3)+1
    dat.roster["dummy"]["Grade"] = "whatevers"
    for i,p in enumerate(["pitcher", "catcher", "1b", "2b", "ss", "3b",
                          "lf", "cf", "rf"]):
        dat.positions[p]["Position Group"] = "PG %s"%((i%4)+1)
    for i in range(1, 10):
        dat.innings[i]["Inning Group"] = "before stretch" if i < 7 else "after stretch"
    dat.innings[0] ={}
    for pg, ig, g in itertools.product(["PG %s"%i for i in range(1,5)],
                                       ["before stretch", "after stretch"],
                                       [1, 2, 3]):
        dat.position_constraints[pg, ig, g] = {}
    orig_pan_dat = input_schema.copy_pan_dat(
        copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    # one row orphaned on all three FKs ("no","no","no") and one orphaned
    # on just the roster FK (grade 3 exists; [1,2] isn't a valid pg/ig)
    dat.position_constraints["no", "no", "no"] = \
        dat.position_constraints[1, 2, 3] = {}
    new_pan_dat = input_schema.copy_pan_dat(
        copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema._same_data(orig_pan_dat, new_pan_dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    fk_fails_2 = input_schema.find_foreign_key_failures(new_pan_dat,
                                                        verbosity="Low")
    fk_fails_3 = input_schema.find_foreign_key_failures(new_pan_dat,
                                                        verbosity="Low",
                                                        as_table=False)
    # all three verbosity/as_table flavors agree on the failure counts
    self.assertTrue({tuple(k)[:2] + (tuple(k[2]),): len(v)
                     for k,v in fk_fails.items()} ==
                    {k:len(v) for k,v in fk_fails_2.items()} ==
                    {k:v.count(True) for k,v in fk_fails_3.items()} ==
                    {('position_constraints', 'innings',
                      ("Inning Group", "Inning Group")): 2,
                     ('position_constraints', 'positions',
                      ("Position Group", "Position Group")): 2,
                     ('position_constraints', 'roster',
                      ("Grade", "Grade")): 1})
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
    # one-to-many: table_one's compound PK covers table_two's single PK
    input_schema = PanDatFactory(table_one=[["One", "Two"], []],
                                 table_two=[["One"], ["Two"]])
    input_schema.add_foreign_key("table_two", "table_one", ["One", "One"])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} ==
                    {"one-to-many"})
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(table_one = [[1,2], [3,4], [5,6], [7,8]],
                     table_two = {1:2, 3:4, 5:6})
    orig_pan_dat = input_schema.copy_pan_dat(
        copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    # 9 has no matching table_one row -> exactly one failure
    dat.table_two[9]=10
    new_pan_dat = input_schema.copy_pan_dat(
        copy_to_pandas_with_reset(tdf, dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    self.assertTrue({tuple(k)[:2]:len(v) for k,v in fk_fails.items()} ==
                    {('table_two', 'table_one'): 1})
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
def testRoundTrips(self):
    # TicDat <-> PanDat conversion preserves data for the diet and
    # netflow fixtures, then probes duplicate-row handling and the
    # NaN-in-primary-key edge cases.
    if not self.canRun:
        return
    tdf = TicDatFactory(**dietSchema())
    tdf.enable_foreign_key_links()
    oldDat = tdf.freeze_me(
        tdf.TicDat(
            **{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    pdf = PanDatFactory.create_from_full_schema(
        tdf.schema(include_ancillary_info=True))
    pan_dat = tdf.copy_to_pandas(oldDat, drop_pk_columns=False)
    self.assertTrue(pdf.good_pan_dat_object(pan_dat))
    tic_dat = pdf.copy_to_tic_dat(pan_dat)
    self.assertTrue(tdf._same_data(oldDat, tic_dat))
    # same round trip for the netflow schema (with its FKs)
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    oldDat = tdf.freeze_me(
        tdf.TicDat(
            **
            {t: getattr(netflowData(), t)
             for t in tdf.primary_key_fields}))
    pdf = PanDatFactory.create_from_full_schema(
        tdf.schema(include_ancillary_info=True))
    pan_dat = tdf.copy_to_pandas(oldDat, drop_pk_columns=False)
    self.assertTrue(pdf.good_pan_dat_object(pan_dat))
    tic_dat = pdf.copy_to_tic_dat(pan_dat)
    self.assertTrue(tdf._same_data(oldDat, tic_dat))
    # duplicate PK rows in the DataFrame collapse on copy_to_tic_dat
    pdf = PanDatFactory(table=[["a", "b"], ["c"]])
    pan_dat = pdf.PanDat(table=utils.DataFrame({
        "a": [1, 2, 1, 1],
        "b": [10, 10, 10, 11],
        "c": [101, 102, 103, 104]
    }))
    self.assertTrue(
        len(pdf.find_duplicates(pan_dat, keep=False)["table"]) == 2)
    tic_dat = pdf.copy_to_tic_dat(pan_dat)
    self.assertTrue(len(tic_dat.table) == len(pan_dat.table) - 1)
    # None in a primary key becomes NaN through pandas, so a round trip
    # is not equality-preserving
    tdf = TicDatFactory(**pdf.schema())
    tic_dat = tdf.TicDat(table=[[1, 2, 3], [None, 2, 3], [2, 1, None]])
    self.assertTrue(len(tic_dat.table) == 3)
    tic_dat_two = pdf.copy_to_tic_dat(
        tdf.copy_to_pandas(tic_dat, drop_pk_columns=False))
    self.assertFalse(tdf._same_data(tic_dat, tic_dat_two))
    tic_dat3 = tdf.TicDat(
        table=[[1, 2, 3], [float("nan"), 2, 3], [2, 1, float("nan")]])
    # this fails because _same_data isn't smart enough to check against nan in the keys,
    # because float("nan") != float("nan")
    self.assertFalse(tdf._same_data(tic_dat3, tic_dat_two))
    # single-field PK variant: NaNs only appear in data fields now
    pdf = PanDatFactory(table=[["a"], ["b", "c"]])
    tdf = TicDatFactory(**pdf.schema())
    tic_dat = tdf.TicDat(table=[[1, 2, 3], [2, None, 3], [2, 1, None]])
    tic_dat_two = pdf.copy_to_tic_dat(
        tdf.copy_to_pandas(tic_dat, drop_pk_columns=False))
    self.assertFalse(tdf._same_data(tic_dat, tic_dat_two))
    tic_dat3 = tdf.TicDat(
        table=[[1, 2, 3], [2, float("nan"), 3], [2, 1, float("nan")]])
    # _same_data works fine in checking nan equivalence in data rows - which maybe
    self.assertTrue(
        tdf._same_data(tic_dat3, tic_dat_two,
                       nans_are_same_for_data_rows=True))
def testDefaultAdd(self):
    # Reading a file that lacks a schema field: raises ("missing"/"extra"
    # in the message) without fill_missing_fields, fills with 0 -- or a
    # configured default -- with it.
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(
        tdf.TicDat(
            **{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    xlsFilePath = os.path.join(_scratchDir, "diet_add.xlsx")
    pdf.xls.write_file(panDat, xlsFilePath)
    sqlFilePath = os.path.join(_scratchDir, "diet_add.sql")
    pdf.sql.write_file(panDat, sqlFilePath)
    csvDirPath = os.path.join(_scratchDir, "diet_add_csv")
    pdf.csv.write_directory(panDat, csvDirPath,
                            case_space_table_names=True)
    # pdf2 adds an "extra" data field to foods that the files don't have
    pdf2 = PanDatFactory(
        **{
            k: [p, d] if k != "foods" else [p, list(d) + ["extra"]]
            for k, (p, d) in dietSchema().items()
        })
    ex = self.firesException(lambda: pdf2.xls.create_pan_dat(xlsFilePath))
    self.assertTrue("missing" in ex and "extra" in ex)
    ex = self.firesException(lambda: pdf2.sql.create_pan_dat(sqlFilePath))
    self.assertTrue("missing" in ex and "extra" in ex)
    ex = self.firesException(lambda: pdf2.csv.create_pan_dat(csvDirPath))
    self.assertTrue("missing" in ex and "extra" in ex)
    ex = self.firesException(
        lambda: pdf2.json.create_pan_dat(pdf.json.write_file(panDat, "")))
    self.assertTrue("missing" in ex and "extra" in ex)
    # fill_missing_fields=True defaults the absent column to 0; dropping
    # it again recovers the original data for every backend
    panDat2 = pdf2.sql.create_pan_dat(sqlFilePath,
                                      fill_missing_fields=True)
    self.assertTrue(set(panDat2.foods["extra"]) == {0})
    panDat2.foods.drop("extra", axis=1, inplace=True)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    panDat2 = pdf2.xls.create_pan_dat(xlsFilePath,
                                      fill_missing_fields=True)
    self.assertTrue(set(panDat2.foods["extra"]) == {0})
    panDat2.foods.drop("extra", axis=1, inplace=True)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    panDat2 = pdf2.csv.create_pan_dat(csvDirPath, fill_missing_fields=True)
    self.assertTrue(set(panDat2.foods["extra"]) == {0})
    panDat2.foods.drop("extra", axis=1, inplace=True)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    panDat2 = pdf2.json.create_pan_dat(pdf.json.write_file(panDat, ""),
                                       fill_missing_fields=True)
    self.assertTrue(set(panDat2.foods["extra"]) == {0})
    panDat2.foods.drop("extra", axis=1, inplace=True)
    self.assertTrue(pdf._same_data(panDat, panDat2, epsilon=1e-5))
    # a configured default (13) wins over the generic 0 fill
    pdf3 = PanDatFactory(**pdf2.schema())
    pdf3.set_default_value("foods", "extra", 13)
    panDat3 = pdf3.sql.create_pan_dat(sqlFilePath,
                                      fill_missing_fields=True)
    self.assertTrue(set(panDat3.foods["extra"]) == {13})
    panDat3.foods.drop("extra", axis=1, inplace=True)
    self.assertTrue(pdf._same_data(panDat, panDat3))
    panDat3 = pdf3.xls.create_pan_dat(xlsFilePath,
                                      fill_missing_fields=True)
    self.assertTrue(set(panDat3.foods["extra"]) == {13})
    panDat3.foods.drop("extra", axis=1, inplace=True)
    self.assertTrue(pdf._same_data(panDat, panDat3))
    panDat3 = pdf3.csv.create_pan_dat(csvDirPath, fill_missing_fields=True)
    self.assertTrue(set(panDat3.foods["extra"]) == {13})
    panDat3.foods.drop("extra", axis=1, inplace=True)
    self.assertTrue(pdf._same_data(panDat, panDat3))
    panDat3 = pdf3.json.create_pan_dat(pdf.json.write_file(panDat, ""),
                                       fill_missing_fields=True)
    self.assertTrue(set(panDat3.foods["extra"]) == {13})
    panDat3.foods.drop("extra", axis=1, inplace=True)
    self.assertTrue(pdf._same_data(panDat, panDat3, epsilon=1e-5))