def testNetflow(self):
    """Round-trip the netflow example through copy_to_pandas.

    Checks partial-table copies, sloc-based foreign-key style lookups, and
    that the pandas copy can be rebuilt into an equivalent TicDat.
    """
    if not self.canRun:
        return
    factory = TicDatFactory(**netflowSchema())
    factory.enable_foreign_key_links()
    addNetflowForeignKeys(factory)
    frozen = factory.freeze_me(factory.TicDat(
        **{tbl: getattr(netflowData(), tbl) for tbl in factory.primary_key_fields}))
    self._test_generic_free_copy(frozen, factory)
    self._test_generic_free_copy(frozen, factory, ["arcs", "nodes"])
    # Copy only two tables - the others must be absent from the result.
    pan = factory.copy_to_pandas(frozen, ["arcs", "cost"])
    for tbl in factory.all_tables:
        self.assertTrue(hasattr(pan, tbl) == (tbl in ["arcs", "cost"]))
    boston = frozen.nodes["Boston"]
    self.assertTrue(len(pan.arcs.capacity.sloc["Boston", :]) == len(boston.arcs_source) == 0)
    self.assertTrue(len(pan.arcs.capacity.sloc[:, "Boston"]) == len(boston.arcs_destination) == 2)
    for src, row in boston.arcs_destination.items():
        self.assertTrue(pan.arcs.capacity.sloc[:, "Boston"][src] == row["capacity"])
    pan = factory.copy_to_pandas(frozen, drop_pk_columns=True)
    reborn = factory.TicDat(**{tbl: getattr(pan, tbl) for tbl in factory.all_tables})
    # Every table here keys on a single pk field, so dropping the pk columns
    # makes a faithful reconstruction impossible.
    self.assertFalse(factory._same_data(reborn, frozen))
    # With the default drop_pk_columns argument the round trip is lossless.
    pan = factory.copy_to_pandas(frozen)
    reborn = factory.TicDat(**{tbl: getattr(pan, tbl) for tbl in factory.all_tables})
    self.assertTrue(factory._same_data(reborn, frozen))
    self.assertTrue(set(pan.inflow.columns) == {"quantity"})
    self.assertTrue(set(pan.nodes.columns) == {"name"})
def testDenormalizedErrors(self):
    """Exercise find_denormalized_sub_table_failures on pandas copies.

    Mutates the source TicDat between probes so each call sees a fresh
    pandas copy, and verifies the cleaned failure reports.
    """
    if not self.canRun:
        return
    clean = clean_denormalization_errors
    find = utils.find_denormalized_sub_table_failures
    tdf = TicDatFactory(**spacesSchema())
    dat = tdf.TicDat(**spacesData())

    def b_table():
        # fresh pandas copy of b_table reflecting any mutations of dat
        return tdf.copy_to_pandas(dat, drop_pk_columns=False).b_table

    self.assertFalse(find(b_table(), "b Field 1", ("b Field 2", "b Field 3")))
    dat.b_table[2, 2, 3] = "boger"
    self.assertFalse(find(b_table(), "b Field 1", ("b Field 2", "b Field 3")))
    chk = find(b_table(), "b Field 2", ("b Field 1", "b Field 3"))
    self.assertTrue(clean(chk) == {2: {'b Field 1': {1, 2}}})
    dat.b_table[2, 2, 4] = "boger"
    dat.b_table[1, 'b', 'b'] = "boger"
    chk = find(b_table(), ["b Field 2"], ("b Field 1", "b Field 3", "b Data"))
    expected = {2: {'b Field 3': (3, 4), 'b Data': (1, 'boger'), 'b Field 1': (1, 2)},
                'b': {'b Data': ('boger', 12), 'b Field 1': ('a', 1)}}
    self.assertTrue(clean(chk) == clean(expected))
    ex = self.firesException(lambda: find(b_table(), ["b Data"], "wtf"))
    self.assertTrue("wtf isn't a column" in ex)

    def c_table():
        # fresh pandas copy of c_table
        return tdf.copy_to_pandas(dat, drop_pk_columns=False).c_table

    chk = find(c_table(), pk_fields=["c Data 1", "c Data 2"],
               data_fields=["c Data 3", "c Data 4"])
    self.assertTrue(clean(chk) == {('a', 'b'): {'c Data 3': {'c', 12}, 'c Data 4': {24, 'd'}}})
    for row in [(1, 2, 3, 4), (1, 2, 1, 4), (1, 2, 1, 5), (1, 2, 3, 6)]:
        dat.c_table.append(row)
    chk = find(c_table(), pk_fields=["c Data 1", "c Data 2"],
               data_fields=["c Data 3", "c Data 4"])
    self.assertTrue(clean(chk) == {('a', 'b'): {'c Data 3': {'c', 12}, 'c Data 4': {24, 'd'}},
                                   (1, 2): {'c Data 3': {3, 1}, 'c Data 4': {4, 5, 6}}})
def testIssue45(self):
    """Regression test for issue 45: string-vs-numeric cell drift on xlsx/csv I/O.

    Writes numeric and string-valued data through the xls and csv writers,
    reads them back with and without a strings-only data type, and checks
    which pairings compare equal. Without a declared data type, pandas
    coerces string digits (e.g. "022") to numbers on read.
    """
    pdf = PanDatFactory(data=[["a"], ["b"]])
    tdf = TicDatFactory(**pdf.schema())
    dat_nums = tdf.copy_to_pandas(
        tdf.TicDat(data=[[1, 2], [3, 4], [22, 44]]), drop_pk_columns=False)
    dat_strs = tdf.copy_to_pandas(
        tdf.TicDat(data=[["1", "2"], ["3", "4"], ["022", "0044"]]), drop_pk_columns=False)
    files = [os.path.join(_scratchDir, _) for _ in ["dat_nums.xlsx", "dat_strs.xlsx"]]
    pdf.xls.write_file(dat_nums, files[0])
    pdf.xls.write_file(dat_strs, files[1])
    dat_nums_2, dat_strs_2 = [pdf.xls.create_pan_dat(_) for _ in files]
    self.assertTrue(pdf._same_data(dat_nums, dat_nums_2))
    # this is pandas pushing things to be numeric
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_nums, dat_strs_2))
    # Declare field "a" as strings-only; now the xlsx read preserves strings
    # for that field, so neither original compares equal any more.
    pdf = PanDatFactory(data=[["a"], ["b"]])
    pdf.set_data_type("data", "a", number_allowed=False, strings_allowed='*')
    dat_mixed = tdf.copy_to_pandas(
        tdf.TicDat(data=[["1", 2], ["3", 4], ["022", 44]]), drop_pk_columns=False)
    dat_nums_2, dat_strs_2 = [pdf.xls.create_pan_dat(_) for _ in files]
    self.assertFalse(pdf._same_data(dat_nums, dat_nums_2))
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertFalse(pdf._same_data(dat_nums_2, dat_mixed))
    self.assertTrue(pdf._same_data(dat_strs_2, dat_mixed))
    # Repeat the same exercise through the csv writer/reader.
    pdf = PanDatFactory(data=[["a"], ["b"]])
    csv_dirs = [os.path.join(_scratchDir, _) for _ in ["dat_nums_csv", "dat_strs_csv"]]
    pdf.csv.write_directory(dat_nums, csv_dirs[0])
    pdf.csv.write_directory(dat_strs, csv_dirs[1])
    dat_nums_2, dat_strs_2 = [pdf.csv.create_pan_dat(_) for _ in csv_dirs]
    self.assertTrue(pdf._same_data(dat_nums, dat_nums_2))
    # this is pandas pushing things to be numeric
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_nums, dat_strs_2))
    pdf = PanDatFactory(data=[["a"], ["b"]])
    pdf.set_data_type("data", "a", number_allowed=False, strings_allowed='*')
    dat_nums_2, dat_strs_2 = [pdf.csv.create_pan_dat(_) for _ in csv_dirs]
    self.assertFalse(pdf._same_data(dat_nums, dat_nums_2))
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertFalse(pdf._same_data(dat_nums_2, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_strs_2, dat_mixed))
def _test_generic_copy(self, ticDat, tdf, skip_tables=None):
    """Round-trip ticDat through generic ('*') tables and SQLite/SQL files.

    Builds a schema where every table not in skip_tables is generic, pushes
    the pandas copies through the sql writers, reads each artifact back,
    converts the generic tables back to concrete rows, and asserts the
    rebuilt data equals the original.

    :param ticDat: source data object conforming to tdf
    :param tdf: TicDatFactory describing ticDat; every table must have pk fields
    :param skip_tables: tables to keep concrete (not genericized); defaults to none
    """
    assert all(tdf.primary_key_fields.get(t) for t in tdf.all_tables)
    path = makeCleanDir(os.path.join(_scratchDir, "generic_copy"))
    # "name" is renamed because it collides with the pandas Series.name attribute
    replace_name = lambda f : "name_" if f == "name" else f
    clean_tdf = TicDatFactory(**{t:[list(map(replace_name, pks)), dfs]
                                 for t,(pks, dfs) in tdf.schema().items()})
    # every table outside skip_tables becomes a generic ('*') table
    temp_tdf = TicDatFactory(**{t:v if t in (skip_tables or []) else '*'
                                for t,v in clean_tdf.schema().items()})
    temp_dat = temp_tdf.TicDat(**{t:getattr(ticDat, t) for t in (skip_tables or [])})
    for t in temp_tdf.generic_tables:
        setattr(temp_dat, t, getattr(clean_tdf.copy_to_pandas(ticDat, drop_pk_columns=False) ,t))
    temp_tdf.sql.write_db_data(temp_dat, os.path.join(path, "f.db"))
    temp_tdf.sql.write_sql_file(temp_dat, os.path.join(path, "f1.sql"), include_schema=False)
    temp_tdf.sql.write_sql_file(temp_dat, os.path.join(path, "f2.sql"), include_schema=True)
    for file_name, includes_schema in [("f.db", False), ("f1.sql", False), ("f2.sql", True)]:
        file_path = os.path.join(path, file_name)
        if file_path.endswith(".db"):
            self.assertFalse(temp_tdf.sql.find_duplicates(file_path))
            read_dat = temp_tdf.sql.create_tic_dat(file_path)
        else:
            read_dat = temp_tdf.sql.create_tic_dat_from_sql(file_path, includes_schema)
        generic_free_dat, _ = utils.create_generic_free(read_dat, temp_tdf)
        # rebuild concrete rows from the generic-free records
        check_dat = clean_tdf.TicDat()
        for t in temp_tdf.generic_tables:
            for r in getattr(generic_free_dat, t):
                pks = clean_tdf.primary_key_fields[t]
                # single pk -> scalar key, multiple pks -> tuple key
                getattr(check_dat, t)[r[pks[0]] if len(pks) == 1 else tuple(r[_] for _ in pks)] = \
                    {df:r[df] for df in clean_tdf.data_fields.get(t, [])}
        for t in (skip_tables or []):
            for k,v in getattr(generic_free_dat, t).items():
                getattr(check_dat, t)[k] = v
        self.assertTrue(clean_tdf._same_data(check_dat, clean_tdf.copy_tic_dat(ticDat)))
def _test_generic_free_copy(self, ticDat, tdf, skip_tables=None):
    """In-memory round trip through generic ('*') tables.

    Like _test_generic_copy but with no file I/O: genericize every table
    outside skip_tables, run create_generic_free, rebuild concrete rows and
    assert equality with the original data.
    """
    assert all(tdf.primary_key_fields.get(t) for t in tdf.all_tables)

    def rename_field(f):
        # "name" collides with the pandas Series.name attribute
        return "name_" if f == "name" else f

    renamed_schema = {t: [list(map(rename_field, pks)), dfs]
                      for t, (pks, dfs) in tdf.schema().items()}
    clean_tdf = TicDatFactory(**renamed_schema)
    keep = skip_tables or []
    # everything outside `keep` becomes a generic table
    temp_tdf = TicDatFactory(**{t: (v if t in keep else '*')
                                for t, v in clean_tdf.schema().items()})
    temp_dat = temp_tdf.TicDat(**{t: getattr(ticDat, t) for t in keep})
    pandas_copy = clean_tdf.copy_to_pandas(ticDat, drop_pk_columns=False)
    for t in temp_tdf.generic_tables:
        setattr(temp_dat, t, getattr(pandas_copy, t))
    generic_free_dat, _ = utils.create_generic_free(temp_dat, temp_tdf)
    check_dat = clean_tdf.TicDat()
    for t in temp_tdf.generic_tables:
        pks = clean_tdf.primary_key_fields[t]
        dfs = clean_tdf.data_fields.get(t, [])
        target = getattr(check_dat, t)
        for r in getattr(generic_free_dat, t):
            # single pk -> scalar key, multiple pks -> tuple key
            key = r[pks[0]] if len(pks) == 1 else tuple(r[f] for f in pks)
            target[key] = {df: r[df] for df in dfs}
    for t in keep:
        for k, v in getattr(generic_free_dat, t).items():
            getattr(check_dat, t)[k] = v
    self.assertTrue(clean_tdf._same_data(check_dat, clean_tdf.copy_tic_dat(ticDat)))
def testDietWithInfFlagging(self):
    """Verify the infinity IO flag round trips through every file format.

    Writes the diet data with infinities flagged as 999999999, then reads
    it back (a) with the flagging factory, (b) with a clone, and (c) with a
    plain factory that leaves the sentinel in place.
    """
    diet_pdf = PanDatFactory(**dietSchema())
    addDietDataTypes(diet_pdf)
    tdf = TicDatFactory(**dietSchema())
    pan_dat = tdf.copy_to_pandas(tdf.copy_tic_dat(dietData()), drop_pk_columns=False)
    diet_pdf.set_infinity_io_flag(999999999)
    base = os.path.join(_scratchDir, "diet_with_inf_flagging")
    diet_pdf.sql.write_file(pan_dat, base + ".db")
    diet_pdf.csv.write_directory(pan_dat, base + "_csv")
    diet_pdf.json.write_file(pan_dat, base + ".json")
    diet_pdf.xls.write_file(pan_dat, base + ".xlsx")
    targets = [["sql", base + ".db"], ["csv", base + "_csv"],
               ["json", base + ".json"], ["xls", base + ".xlsx"]]
    for attr, path in targets:
        # reading with the flag set restores the true infinities
        round_trip = getattr(diet_pdf, attr).create_pan_dat(path)
        self.assertTrue(diet_pdf._same_data(pan_dat, round_trip, epsilon=1e-5))
        # a clone keeps the infinity flag
        cloned = diet_pdf.clone()
        round_trip = getattr(cloned, attr).create_pan_dat(path)
        self.assertTrue(cloned._same_data(pan_dat, round_trip, epsilon=1e-5))
        # a plain factory leaves the 999999999 sentinel in the data
        plain = PanDatFactory(**diet_pdf.schema())
        round_trip = getattr(plain, attr).create_pan_dat(path)
        self.assertFalse(plain._same_data(pan_dat, round_trip, epsilon=1e-5))
        protein = round_trip.categories["name"] == "protein"
        self.assertTrue(list(round_trip.categories[protein]["maxNutrition"])[0] == 999999999)
        # restoring infinity by hand makes the data match again
        round_trip.categories.loc[protein, "maxNutrition"] = float("inf")
        self.assertTrue(plain._same_data(pan_dat, round_trip, epsilon=1e-5))
def testSilly(self):
    """Round-trip the silly-me schema (plus extra pk-only tables) through pandas.

    Checks multi-field pk index construction, sloc lookups, column naming,
    and reconstruction back into an equivalent TicDat.
    """
    if not self.canRun:
        return
    tdf = TicDatFactory(**dict({"d" : [("dData1", "dData2", "dData3", "dData4"),[]],
                                "e" : [["eData"],[]]}, **sillyMeSchema()))
    ticDat = tdf.copy_to_pandas(tdf.TicDat(**sillyMeData()))
    # sillyMeData provides no rows for the extra d/e tables
    self.assertFalse(len(ticDat.d) + len(ticDat.e))
    oldDat = tdf.freeze_me(tdf.TicDat(**dict(
        {"d" : {(1,2,3,4):{}, (1, "b","c","d"):{}, ("a", 2,"c","d"):{}},
         "e" : {11:{},"boger":{}}}, **sillyMeData())))
    ticDat = tdf.copy_to_pandas(oldDat, drop_pk_columns=True)
    def checkTicDat():
        # closure deliberately reads the *current* ticDat binding
        self.assertTrue(len(ticDat.d) ==3 and len(ticDat.e) == 2)
        self.assertTrue(set(ticDat.d.index.values) ==
                        {(1,2,3,4), (1, "b","c","d"), ("a", 2,"c","d")})
        self.assertTrue(set(ticDat.e.index.values) == {11,"boger"})
        self.assertTrue(len(ticDat.c) == len(oldDat.c) == 3)
        # BUG FIX: the original passed a generator expression to assertTrue
        # (always truthy), so the row comparison never actually ran.
        # Compare each field of each row explicitly instead.
        for i in range(3):
            for fld in ticDat.c.columns:
                self.assertTrue(ticDat.c.loc[i][fld] == oldDat.c[i][fld])
    checkTicDat()
    # pk columns were dropped and d/e have no data fields left
    self.assertFalse(hasattr(ticDat.d, "dData1") or hasattr(ticDat.e, "eData"))
    ticDat = tdf.copy_to_pandas(oldDat, drop_pk_columns=False)
    checkTicDat()
    self.assertTrue(ticDat.e.loc[11].values[0] == 11)
    if sys.version_info[0] == 2:
        self.assertTrue(len(ticDat.d.dData1.sloc[1,:,:,:]) == 2)
    else : # very strange infrequent bug issue that I will investigate later
        self.assertTrue(len(ticDat.d.dData1.sloc[1]) == 2)
    ticDat = tdf.copy_to_pandas(oldDat)
    checkTicDat()
    if sys.version_info[0] == 2:
        self.assertTrue(len(ticDat.d.dData1.sloc[1,:,:,:]) == 2)
    else:
        self.assertTrue(len(ticDat.d.dData1.sloc[1]) == 2)
    self.assertTrue(ticDat.e.loc[11].values[0] == 11)
    self.assertTrue(set(ticDat.d.columns) == {"dData%s" % s for s in range(1, 5)})
    rebornTicDat = tdf.TicDat(**{t:getattr(ticDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
    # a single-data-field table may also be passed back as a bare Series
    ticDat.b = ticDat.b.bData
    rebornTicDat = tdf.TicDat(**{t:getattr(ticDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
def test_fk_max_failures(self):
    """Exercise the max_failures argument of find_foreign_key_failures.

    Ten rows, each violating both diet foreign keys, so the failure counts
    truncate predictably as max_failures shrinks.
    """
    tdf = TicDatFactory(**dietSchema())
    addDietForeignKeys(tdf)
    # every row is orphaned on both the food and category foreign keys
    dat = tdf.TicDat(nutritionQuantities=[[f"food_{_}", f"cat_{_}", 10] for _ in range(10)])
    pan_dat = tdf.copy_to_pandas(dat, drop_pk_columns=False)
    pdf = PanDatFactory.create_from_full_schema(tdf.schema(include_ancillary_info=True))

    def failures(**kwargs):
        return pdf.find_foreign_key_failures(pan_dat, **kwargs)

    fails = failures()
    self.assertTrue(len(fails) == 2 and all(len(_) == 10 for _ in fails.values()))
    # cap of 11 exhausts one fk (10) and truncates the other to 1
    fails = failures(max_failures=11)
    self.assertTrue(len(fails) == 2 and set(map(len, fails.values())) == {10, 1})
    # cap of 10 is consumed entirely by the first fk
    fails = failures(max_failures=10)
    self.assertTrue(len(fails) == 1 and all(len(_) == 10 for _ in fails.values()))
    # cap of 9 truncates even the first fk
    fails = failures(max_failures=9)
    self.assertTrue(len(fails) == 1 and all(len(_) == 9 for _ in fails.values()))
def testDiet(self):
    """Round-trip the diet example through copy_to_pandas.

    Covers value-by-value agreement with the source TicDat, loc vs sloc
    lookup semantics, and reconstruction of equivalent TicDats through
    fully-concrete (tdf), fully-generic (tdf2) and mixed (tdf3) factories.
    """
    if not self.canRun: return
    tdf = TicDatFactory(**dietSchema())
    tdf.enable_foreign_key_links()
    oldDat = tdf.freeze_me(
        tdf.TicDat(
            **{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    self._test_generic_free_copy(oldDat, tdf)
    self._test_generic_free_copy(oldDat, tdf, ["nutritionQuantities"])
    ticDat = tdf.copy_to_pandas(oldDat)
    # the pandas copy must agree cell-by-cell with the source data
    for k in oldDat.foods:
        self.assertTrue(oldDat.foods[k]["cost"] == ticDat.foods.cost[k])
    for k in oldDat.categories:
        self.assertTrue(oldDat.categories[k]["minNutrition"] ==
                        ticDat.categories.minNutrition[k])
    for k1, k2 in oldDat.nutritionQuantities:
        self.assertTrue(oldDat.nutritionQuantities[k1, k2]["qty"] ==
                        ticDat.nutritionQuantities.qty[k1, k2])
    nut = ticDat.nutritionQuantities
    # plain .loc raises on missing labels; ticdat's .sloc returns empty slices
    self.assertTrue(firesException(lambda: nut.qty.loc[:, "fatty"]))
    self.assertTrue(firesException(lambda: nut.qty.loc["chickeny", :]))
    self.assertFalse(firesException(lambda: nut.qty.sloc[:, "fatty"]))
    self.assertFalse(firesException(lambda: nut.qty.sloc["chickeny", :]))
    self.assertTrue(0 == sum(nut.qty.sloc[:, "fatty"]) == sum(nut.qty.sloc[
        "chickeny", :]))
    # where the labels exist, sloc and loc agree with direct summation
    self.assertTrue(
        sum(nut.qty.sloc[:, "fat"]) == sum(nut.qty.loc[:, "fat"]) == sum(
            r["qty"] for (f, c), r in oldDat.nutritionQuantities.items()
            if c == "fat"))
    self.assertTrue(
        sum(nut.qty.sloc["chicken", :]) == sum(nut.qty.loc["chicken", :]) ==
        sum(r["qty"] for (f, c), r in oldDat.nutritionQuantities.items()
            if f == "chicken"))
    rebornTicDat = tdf.TicDat(
        **{t: getattr(ticDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
    # tdf2: all tables generic; generic tables reject set_data_type
    tdf2 = TicDatFactory(**{t: '*' for t in tdf.all_tables})
    self.assertTrue(
        firesException(
            lambda: tdf2.set_data_type("nutritionQuantities", "qty")))
    genTicDat = tdf2.TicDat(
        **{t: getattr(ticDat, t) for t in tdf.all_tables})
    for k in oldDat.categories:
        self.assertTrue(oldDat.categories[k]["minNutrition"] ==
                        genTicDat.categories.minNutrition[k])
    for k1, k2 in oldDat.nutritionQuantities:
        self.assertTrue(oldDat.nutritionQuantities[k1,
            k2]["qty"] == genTicDat.nutritionQuantities.qty[k1, k2])
    # the generic object satisfies tdf2 but not the concrete tdf, yet it can
    # still seed a concrete TicDat with the same data
    self.assertFalse(tdf.good_tic_dat_object(genTicDat))
    self.assertTrue(tdf2.good_tic_dat_object(genTicDat))
    rebornTicDat = tdf.TicDat(
        **{t: getattr(genTicDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
    rebornGenTicDat = tdf2.TicDat(**tdf2.as_dict(genTicDat))
    # restore the pk index names lost by the as_dict round trip
    for t, pks in tdf.primary_key_fields.items():
        getattr(rebornGenTicDat, t).index.names = pks
    rebornTicDat = tdf.TicDat(
        **{t: getattr(rebornGenTicDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
    # tdf3: only categories is generic; concrete tables still accept data types
    tdf3 = TicDatFactory(**dict(dietSchema(), **{"categories": '*'}))
    self.assertFalse(
        firesException(
            lambda: tdf3.set_data_type("nutritionQuantities", "qty")))
    mixTicDat = tdf3.TicDat(
        **{t: getattr(ticDat, t) for t in tdf.all_tables})
    for k in oldDat.categories:
        self.assertTrue(oldDat.categories[k]["minNutrition"] ==
                        mixTicDat.categories.minNutrition[k])
    for k1, k2 in oldDat.nutritionQuantities:
        self.assertTrue(oldDat.nutritionQuantities[k1, k2]["qty"] ==
                        mixTicDat.nutritionQuantities[k1, k2]["qty"])
    self.assertFalse(tdf2.good_tic_dat_object(mixTicDat))
    self.assertFalse(tdf3.good_tic_dat_object(genTicDat))
    self.assertTrue(tdf3.good_tic_dat_object(mixTicDat))
    rebornTicDat = tdf.TicDat(
        **{t: getattr(mixTicDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
def testRoundTrips(self):
    """TicDat -> pandas -> TicDat round trips, including duplicate and nan handling.

    Verifies lossless round trips for the diet and netflow examples, that
    copy_to_tic_dat collapses duplicate pk rows, and how _same_data treats
    nan in key fields vs data fields.
    """
    if not self.canRun: return
    tdf = TicDatFactory(**dietSchema())
    tdf.enable_foreign_key_links()
    oldDat = tdf.freeze_me(
        tdf.TicDat(
            **{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    pdf = PanDatFactory.create_from_full_schema(
        tdf.schema(include_ancillary_info=True))
    pan_dat = tdf.copy_to_pandas(oldDat, drop_pk_columns=False)
    self.assertTrue(pdf.good_pan_dat_object(pan_dat))
    tic_dat = pdf.copy_to_tic_dat(pan_dat)
    self.assertTrue(tdf._same_data(oldDat, tic_dat))
    # same round trip for the netflow example
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    oldDat = tdf.freeze_me(
        tdf.TicDat(
            **
            {t: getattr(netflowData(), t)
             for t in tdf.primary_key_fields}))
    pdf = PanDatFactory.create_from_full_schema(
        tdf.schema(include_ancillary_info=True))
    pan_dat = tdf.copy_to_pandas(oldDat, drop_pk_columns=False)
    self.assertTrue(pdf.good_pan_dat_object(pan_dat))
    tic_dat = pdf.copy_to_tic_dat(pan_dat)
    self.assertTrue(tdf._same_data(oldDat, tic_dat))
    # duplicate (a, b) pk rows collapse when copying to a TicDat
    pdf = PanDatFactory(table=[["a", "b"], ["c"]])
    pan_dat = pdf.PanDat(table=utils.DataFrame({
        "a": [1, 2, 1, 1],
        "b": [10, 10, 10, 11],
        "c": [101, 102, 103, 104]
    }))
    self.assertTrue(
        len(pdf.find_duplicates(pan_dat, keep=False)["table"]) == 2)
    tic_dat = pdf.copy_to_tic_dat(pan_dat)
    self.assertTrue(len(tic_dat.table) == len(pan_dat.table) - 1)
    tdf = TicDatFactory(**pdf.schema())
    tic_dat = tdf.TicDat(table=[[1, 2, 3], [None, 2, 3], [2, 1, None]])
    self.assertTrue(len(tic_dat.table) == 3)
    # None in a pk field becomes nan in pandas, breaking the round trip
    tic_dat_two = pdf.copy_to_tic_dat(
        tdf.copy_to_pandas(tic_dat, drop_pk_columns=False))
    self.assertFalse(tdf._same_data(tic_dat, tic_dat_two))
    tic_dat3 = tdf.TicDat(
        table=[[1, 2, 3], [float("nan"), 2, 3], [2, 1, float("nan")]])
    # this fails because _same_data isn't smart enough to check against nan in the keys,
    # because float("nan") != float("nan")
    self.assertFalse(tdf._same_data(tic_dat3, tic_dat_two))
    # with a single-field pk, the nans land in data fields instead
    pdf = PanDatFactory(table=[["a"], ["b", "c"]])
    tdf = TicDatFactory(**pdf.schema())
    tic_dat = tdf.TicDat(table=[[1,
        2, 3], [2, None, 3], [2, 1, None]])
    tic_dat_two = pdf.copy_to_tic_dat(
        tdf.copy_to_pandas(tic_dat, drop_pk_columns=False))
    self.assertFalse(tdf._same_data(tic_dat, tic_dat_two))
    tic_dat3 = tdf.TicDat(
        table=[[1, 2, 3], [2, float("nan"), 3], [2, 1, float("nan")]])
    # _same_data works fine in checking nan equivalence in data rows - which maybe
    # is why nans_are_same_for_data_rows exists as an opt-in flag
    self.assertTrue(
        tdf._same_data(tic_dat3, tic_dat_two,
                       nans_are_same_for_data_rows=True))
def make_dat(l):
    # Helper: build a pandas copy of a one-table TicDat from the row list `l`,
    # keeping the pk columns in the frame.
    # NOTE(review): `pdf` is a free variable — this presumably closes over a
    # PanDatFactory defined in an enclosing scope not visible here; confirm.
    tdf = TicDatFactory(**pdf.schema())
    return tdf.copy_to_pandas(tdf.TicDat(table=l), drop_pk_columns=False)