def _test_generic_copy(self, ticDat, tdf, skip_tables=None):
    """Round-trip ticDat through sql files with most tables declared generic ('*').

    tdf's schema (with any "name" field renamed to "name_") is converted so that
    every table not in skip_tables becomes a generic table; the data is written
    to a .db file and to .sql files, read back, and compared to the original.
    """
    # every table needs primary keys for the row-by-row rebuild below
    assert all(tdf.primary_key_fields.get(t) for t in tdf.all_tables)
    path = makeCleanDir(os.path.join(_scratchDir, "generic_copy"))
    # "name" is presumably troublesome as a generic column label -- rename it
    replace_name = lambda f: "name_" if f == "name" else f
    clean_tdf = TicDatFactory(**{t: [list(map(replace_name, pks)), dfs]
                                 for t, (pks, dfs) in tdf.schema().items()})
    # every table except skip_tables becomes a generic ('*') table
    temp_tdf = TicDatFactory(**{t: v if t in (skip_tables or []) else '*'
                                for t, v in clean_tdf.schema().items()})
    temp_dat = temp_tdf.TicDat(**{t: getattr(ticDat, t) for t in (skip_tables or [])})
    # generic tables are populated from pandas DataFrames (pk columns retained)
    for t in temp_tdf.generic_tables:
        setattr(temp_dat, t,
                getattr(clean_tdf.copy_to_pandas(ticDat, drop_pk_columns=False), t))
    temp_tdf.sql.write_db_data(temp_dat, os.path.join(path, "f.db"))
    temp_tdf.sql.write_sql_file(temp_dat, os.path.join(path, "f1.sql"), include_schema=False)
    temp_tdf.sql.write_sql_file(temp_dat, os.path.join(path, "f2.sql"), include_schema=True)
    for file_name, includes_schema in [("f.db", False), ("f1.sql", False), ("f2.sql", True)]:
        file_path = os.path.join(path, file_name)
        if file_path.endswith(".db"):
            self.assertFalse(temp_tdf.sql.find_duplicates(file_path))
            read_dat = temp_tdf.sql.create_tic_dat(file_path)
        else:
            read_dat = temp_tdf.sql.create_tic_dat_from_sql(file_path, includes_schema)
        # strip the generic-ness and rebuild a keyed TicDat row by row
        generic_free_dat, _ = utils.create_generic_free(read_dat, temp_tdf)
        check_dat = clean_tdf.TicDat()
        for t in temp_tdf.generic_tables:
            for r in getattr(generic_free_dat, t):
                pks = clean_tdf.primary_key_fields[t]
                getattr(check_dat, t)[r[pks[0]] if len(pks) == 1 else
                                      tuple(r[_] for _ in pks)] = \
                    {df: r[df] for df in clean_tdf.data_fields.get(t, [])}
        # non-generic tables copy over key by key
        for t in (skip_tables or []):
            for k, v in getattr(generic_free_dat, t).items():
                getattr(check_dat, t)[k] = v
        self.assertTrue(clean_tdf._same_data(check_dat, clean_tdf.copy_tic_dat(ticDat)))
def testDiet(self):
    """Round-trip the diet example through json files, including weird-case schemas."""
    if not self.can_run:
        return
    for verbose in [True, False]:
        tdf = TicDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                             for t in tdf.primary_key_fields}))
        writePath = os.path.join(makeCleanDir(os.path.join(_scratchDir, "diet")),
                                 "file.json")
        tdf.json.write_file(ticDat, writePath, verbose=verbose)
        self.assertFalse(tdf.json.find_duplicates(writePath))
        jsonTicDat = tdf.json.create_tic_dat(writePath)
        self.assertTrue(tdf._same_data(ticDat, jsonTicDat))

        # an unfrozen read-back is mutable; a frozen one rejects mutation
        # (change closes over the jsonTicDat *name*, so rebinding below retargets it)
        def change():
            jsonTicDat.categories["calories"]["minNutrition"] = 12
        self.assertFalse(firesException(change))
        self.assertFalse(tdf._same_data(ticDat, jsonTicDat))
        jsonTicDat = tdf.json.create_tic_dat(writePath, freeze_it=True)
        self.assertTrue(firesException(change))
        self.assertTrue(tdf._same_data(ticDat, jsonTicDat))

        # a schema with odd casing writes a file the clean schema still reads
        tdf2 = TicDatFactory(**dietSchemaWeirdCase())
        dat2 = copyDataDietWeirdCase(ticDat)
        tdf2.json.write_file(dat2, writePath, allow_overwrite=True, verbose=verbose)
        jsonTicDat2 = tdf.json.create_tic_dat(writePath, freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, jsonTicDat2))

        # a table renamed "nutrition quantities" still matches "nutrition_quantities"
        tdf3 = TicDatFactory(**dietSchemaWeirdCase2())
        dat3 = copyDataDietWeirdCase2(ticDat)
        tdf3.json.write_file(dat3, writePath, allow_overwrite=True, verbose=verbose)
        with open(writePath, "r") as f:
            jdict = json.load(f)
        jdict["nutrition quantities"] = jdict["nutrition_quantities"]
        del (jdict["nutrition_quantities"])
        with open(writePath, "w") as f:
            json.dump(jdict, f)
        jsonDat3 = tdf3.json.create_tic_dat(writePath)
        self.assertTrue(tdf3._same_data(dat3, jsonDat3))
        # with both spellings present the read is ambiguous and should throw
        jdict["nutrition_quantities"] = jdict["nutrition quantities"]
        with open(writePath, "w") as f:
            json.dump(jdict, f)
        self.assertTrue(self.firesException(lambda: tdf3.json.create_tic_dat(writePath)))
def testSilly(self):
    """Exercise sql read/write against schema permutations of the silly example."""
    if not self.can_run:
        return
    tdf = TicDatFactory(**sillyMeSchema())
    ticDat = tdf.TicDat(**sillyMeData())
    # schema2: same table b, but the primary keys are in a different order
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    # schema3: same table a, but the data fields are in a different order
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    # schema4: drops one of a's data fields
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    # schema5: moves every field into the data fields (keyless tables)
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x,)
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    # schema6: adds a table d that the db file won't contain
    schema6 = sillyMeSchema()
    schema6["d"] = [["dField"], ()]
    tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in
                                    (schema2, schema3, schema4, schema5, schema6))
    tdf5.set_generator_tables(("a", "c"))
    tdf5 = tdf5.clone()
    filePath = os.path.join(_scratchDir, "silly.db")
    tdf.sql.write_db_data(ticDat, filePath)
    self.assertFalse(tdf.sql.find_duplicates(filePath))
    # pk order matters, so schema2 reads back different data
    ticDat2 = tdf2.sql.create_tic_dat(filePath)
    self.assertFalse(tdf._same_data(ticDat, ticDat2))
    # data-field order does not matter
    ticDat3 = tdf3.sql.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat3))
    # the shrunken schema reads a subset of each row's fields
    ticDat4 = tdf4.sql.create_tic_dat(filePath)
    for t in ["a", "b"]:
        for k, v in getattr(ticDat4, t).items():
            for _k, _v in v.items():
                self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
            # only table a lost a field; table b rows keep their full field set
            if set(v) == set(getattr(ticDat, t)[k]):
                self.assertTrue(t == "b")
            else:
                self.assertTrue(t == "a")
    ticDat5 = tdf5.sql.create_tic_dat(filePath)
    self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    # generator tables read back as callables
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c)
                    and not callable(ticDat5.b))
    # the missing table d is called out in the exception text
    self.assertTrue("table d" in
                    self.firesException(lambda: tdf6.sql.create_tic_dat(filePath)))
    # None values round-trip through sqlite
    ticDat.a["theboger"] = (1, None, 12)
    if am_on_windows:
        filePath = filePath.replace("silly.db", "silly_2.db")  # working around issue opalytics/opalytics-ticdat#153
    tdf.sql.write_db_data(ticDat, makeCleanPath(filePath))
    ticDatNone = tdf.sql.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
def testMissingTable(self):
    """A schema table absent from the inputset reads back as an empty table."""
    if not self.can_run:
        return
    base_factory = TicDatFactory(**dietSchema())
    frozen_diet = base_factory.freeze_me(base_factory.copy_tic_dat(dietData()))
    mock_inputs = create_inputset_mock(base_factory, frozen_diet)
    # widen the schema with a table the mock inputset knows nothing about
    widened_schema = dict(dietSchema())
    widened_schema["missing_table"] = [["a"], ["b"]]
    wide_factory = TicDatFactory(**widened_schema)
    round_tripped = wide_factory.opalytics.create_tic_dat(mock_inputs)
    self.assertTrue(base_factory._same_data(frozen_diet, round_tripped))
    # the extra table came back with no rows
    self.assertFalse(round_tripped.missing_table)
def testDups(self):
    """opalytics find_duplicates reports the repeat-count of each duplicated key."""
    if not self.can_run:
        return
    for hack, raw_data in product([True, False], repeat=2):
        keyed_factory = TicDatFactory(one=[["a"], ["b", "c"]],
                                      two=[["a", "b"], ["c"]],
                                      three=[["a", "b", "c"], []])
        # a keyless twin schema lets us stage rows that collide under the keys
        keyless_factory = TicDatFactory(**{t: [[], ["a", "b", "c"]]
                                           for t in keyed_factory.all_tables})
        staged = keyless_factory.TicDat(
            **{t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3], [1, 2, 2], ["new", 1, 2]]
               for t in keyed_factory.all_tables})
        found = keyed_factory.opalytics.find_duplicates(
            create_inputset_mock(keyless_factory, staged, hack), raw_data=raw_data)
        self.assertTrue(found == {'three': {(1, 2, 2): 2}, 'two': {(1, 2): 3},
                                  'one': {1: 3, 2: 2}})
def test_missing_tables(self):
    """A schema with an extra table still reads files written without that table."""
    base = os.path.join(_scratchDir, "missing")
    small_factory = TicDatFactory(this=[["Something"], ["Another"]])
    bigger_schema = dict(small_factory.schema())
    bigger_schema["that"] = [["What", "Ever"], []]
    big_factory = TicDatFactory(**bigger_schema)
    written = small_factory.TicDat(this=[["a", 2], ["b", 3], ["c", 5]])
    # .sql text-file round trip
    small_factory.sql.write_sql_file(written, base + ".sql")
    recovered = big_factory.sql.create_tic_dat_from_sql(base + ".sql")
    self.assertTrue(small_factory._same_data(written, recovered))
    # .db sqlite round trip
    small_factory.sql.write_db_data(written, base + ".db")
    recovered = big_factory.sql.create_tic_dat(base + ".db")
    self.assertTrue(small_factory._same_data(written, recovered))
def testBooleansAndNulls(self):
    """json round trips of None/True/False, and set_infinity_io_flag(None) behavior."""
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    dat = tdf.TicDat(table=[[None, 100], [200, True], [False, 300],
                            [300, None], [400, False]])
    file_one = os.path.join(_scratchDir, "boolDefaults_1.json")
    file_two = os.path.join(_scratchDir, "boolDefaults_2.json")
    tdf.json.write_file(dat, file_one, verbose=True)
    tdf.json.write_file(dat, file_two, verbose=False)
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertTrue(tdf._same_data(dat, dat_1))
    self.assertTrue(tdf._same_data(dat, dat_2))
    # with flag None and fields that max out at +inf, the nulls written above
    # read back as +inf (matching dat_inf)
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        tdf.set_data_type("table", f, max=float("inf"), inclusive_max=True)
    tdf.set_infinity_io_flag(None)
    dat_inf = tdf.TicDat(table=[[float("inf"), 100], [200, True], [False, 300],
                                [300, float("inf")], [400, False]])
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    self.assertTrue(tdf._same_data(dat_inf, dat_2))
    # writing +inf under the None flag also round-trips
    tdf.json.write_file(dat_inf, file_one, verbose=True, allow_overwrite=True)
    tdf.json.write_file(dat_inf, file_two, verbose=False, allow_overwrite=True)
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    self.assertTrue(tdf._same_data(dat_inf, dat_2))
    # with fields that bottom out at -inf instead, the same files read back as -inf
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        tdf.set_data_type("table", f, min=-float("inf"), inclusive_min=True)
    tdf.set_infinity_io_flag(None)
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertFalse(tdf._same_data(dat_inf, dat_1))
    self.assertFalse(tdf._same_data(dat_inf, dat_2))
    dat_inf = tdf.TicDat(table=[[float("-inf"), 100], [200, True], [False, 300],
                                [300, -float("inf")], [400, False]])
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    self.assertTrue(tdf._same_data(dat_inf, dat_2))
def testDataTypes(self):
    """find_data_type_failures on PanDats built from diet and netflow data."""
    if not self.canRun:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticdat = tdf.TicDat()
    ticdat.foods["a"] = 12
    ticdat.foods["b"] = None  # the null that will later fail nullable=False
    ticdat.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
    ticdat.categories["2"] = [10, 20]
    for f, p in itertools.product(ticdat.foods, ticdat.categories):
        ticdat.nutritionQuantities[f, p] = 5
    # note the int key 2 here vs the string keys above
    ticdat.nutritionQuantities['a', 2] = 12
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    # with no data types set, nothing fails
    self.assertFalse(pdf.find_data_type_failures(pandat))
    pdf = PanDatFactory(**dietSchema())
    pdf.set_data_type("foods", "cost", nullable=False)
    pdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False,
                      max=12, inclusive_max=True)
    failed = pdf.find_data_type_failures(pandat)
    self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
    self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
    # the qty=5 rows fail the exclusive min; qty=12 passes the inclusive max
    self.assertTrue(set({(v["food"], v["category"])
                         for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()})
                    == {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
    # as_table=False returns boolean indicators instead of row tables
    failed = pdf.find_data_type_failures(pandat, as_table=False)
    self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    pdf = PanDatFactory(**netflowSchema())
    ticdat = tdf.copy_tic_dat(netflowData())
    # stuff string capacities into the arcs out of Detroit
    for n in ticdat.nodes["Detroit"].arcs_source:
        ticdat.arcs["Detroit", n] = n
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_data_type_failures(pandat))
    pdf = PanDatFactory(**netflowSchema())
    pdf.set_data_type("arcs", "capacity", strings_allowed="*")
    self.assertFalse(pdf.find_data_type_failures(pandat))
    # restrict the allowed strings so one destination falls outside the set
    pdf = PanDatFactory(**netflowSchema())
    pdf.set_data_type("arcs", "capacity",
                      strings_allowed=["Boston", "Seattle", "lumberjack"])
    failed = pdf.find_data_type_failures(pandat)
    self.assertTrue(set(failed) == {('arcs', 'capacity')})
    self.assertTrue(set({(v["source"], v["destination"])
                         for v in failed['arcs', 'capacity'].T.to_dict().values()})
                    == {("Detroit", "New York")})
def testDups(self):
    """sql.find_duplicates reports the repeat-count of each duplicated key."""
    if not self.can_run:
        return
    keyed_factory = TicDatFactory(one=[["a"], ["b", "c"]],
                                  two=[["a", "b"], ["c"]],
                                  three=[["a", "b", "c"], []])
    # a keyless twin schema lets us write rows that collide under the keys
    keyless_factory = TicDatFactory(**{t: [[], ["a", "b", "c"]]
                                       for t in keyed_factory.all_tables})
    staged = keyless_factory.TicDat(
        **{t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3], [1, 2, 2], ["new", 1, 2]]
           for t in keyed_factory.all_tables})
    db_path = makeCleanPath(os.path.join(_scratchDir, "testDups.db"))
    keyless_factory.sql.write_db_data(staged, db_path)
    found = keyed_factory.sql.find_duplicates(db_path)
    self.assertTrue(found == {'three': {(1, 2, 2): 2}, 'two': {(1, 2): 3},
                              'one': {1: 3, 2: 2}})
def testIntHandling(self):
    """must_be_int typing makes xls reads coerce whole-number floats to ints."""
    if not self.can_run:
        return
    tdf = TicDatFactory(boger=[["the"], ["big", "boger"]],
                        moger=[["the", "big"], ["boger"]],
                        woger=[[], ["the", "big", "boger"]])
    for t in ["boger", "moger", "woger"]:
        tdf.set_data_type(t, "big", must_be_int=True)
    dat = tdf.TicDat(boger={1: [1.0, "t"], "b": [12, 11.1], 12.1: [14.0, 15.0]},
                     moger={(1, 1.0): "t", ("b", 12): 11.1, (12.1, 14.0): 15.0},
                     woger=[(1, 1.0, "t"), ("b", 12, 11.1), (12.1, 14.0, 15.0)])
    filePath = os.path.join(_scratchDir, "intHandling.xls")
    tdf.xls.write_file(dat, filePath)
    dat2 = tdf.xls.create_tic_dat(filePath)
    # tdf3: identical schema but without the must_be_int typing
    tdf3 = TicDatFactory(boger=[["the"], ["big", "boger"]],
                         moger=[["the", "big"], ["boger"]],
                         woger=[[], ["the", "big", "boger"]])
    dat3 = tdf3.xls.create_tic_dat(filePath)
    self.assertFalse(any(map(tdf.find_data_type_failures, [dat, dat2, dat3])))
    # numerically the three copies agree ...
    self.assertTrue(all(tdf._same_data(dat, _) for _ in [dat2, dat3]))
    # ... but only the typed read-back (dat2) holds ints throughout:
    # dat keeps the literal floats it was built with, dat3 reads everything as float
    self.assertFalse(all(isinstance(r["big"], int)
                         for r in list(dat.boger.values()) + list(dat.woger)))
    self.assertTrue(all(isinstance(r["big"], int)
                        for r in list(dat2.boger.values()) + list(dat2.woger)))
    self.assertFalse(any(isinstance(r["big"], int)
                         for r in list(dat3.boger.values()) + list(dat3.woger)))
    self.assertTrue(all(isinstance(_.woger[1]["big"], int) for _ in [dat, dat2]))
    # same story for "big" when it is part of a primary key (moger)
    self.assertFalse(all(isinstance(k[-1], int) for k in dat.moger))
    self.assertTrue(any(isinstance(k[-1], int) for k in dat.moger))
    self.assertTrue(all(isinstance(k[-1], int) for k in dat2.moger))
    self.assertFalse(any(isinstance(k[-1], int) for k in dat3.moger))
def testSqlSpaceyTwo(self):
    """Write/read PanDats through a caller-managed sqlite connection."""
    if not self.can_run:
        return
    self.assertTrue(pandatio.sql, "this unit test requires SQLite installed")
    tdf = TicDatFactory(**spacesSchema())
    pdf = PanDatFactory(**spacesSchema())
    ticDat = tdf.TicDat(**{
        "a_table": {1: [1, 2, "3"],
                    22.2: (12, 0.12, "something"),
                    0.23: (11, 12, "thirt")},
        "b_table": {(1, 2, "foo"): 1,
                    (1012.22, 4, "0012"): 12},
        "c_table": (("this", 2, 3, 4),
                    ("that", 102.212, 3, 5.5),
                    ("another", 5, 12.5, 24))})
    panDat = pan_dat_maker(spacesSchema(), ticDat)
    ext = ".db"
    filePath = os.path.join(_scratchDir, "spaces_2%s" % ext)
    # db_file_path=None: the supplied connection is used instead of a path
    with pandatio.sql.connect(filePath) as con:
        pdf.sql.write_file(panDat, db_file_path=None, con=con,
                           case_space_table_names=True)
    with pandatio.sql.connect(filePath) as con:
        panDat2 = pdf.sql.create_pan_dat(db_file_path=None, con=con)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    # repeat with the netflow schema, passing an empty-string path this time
    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
    with pandatio.sql.connect(filePath) as con:
        pdf.sql.write_file(panDat, db_file_path="", con=con,
                           case_space_table_names=True)
    with pandatio.sql.connect(filePath) as con:
        panDat2 = pdf.sql.create_pan_dat(None, con)
    self.assertTrue(pdf._same_data(panDat, panDat2))
def testDietWithInfFlagging(self):
    """A cloned factory keeps the infinity io flag; a fresh factory does not."""
    factory = TicDatFactory(**dietSchema())
    diet_dat = factory.copy_tic_dat(dietData())
    factory.set_infinity_io_flag(999999999)
    dir_path = os.path.join(_scratchDir, "dietInfFlag")
    factory.csv.write_directory(diet_dat, dir_path)
    read_back = factory.csv.create_tic_dat(dir_path)
    self.assertTrue(factory._same_data(diet_dat, read_back))
    # clone() preserves the flag, so the round trip still matches
    factory = factory.clone()
    read_back = factory.csv.create_tic_dat(dir_path)
    self.assertTrue(factory._same_data(diet_dat, read_back))
    # a brand-new factory lacks the flag, so the flagged values don't translate back
    factory = TicDatFactory(**dietSchema())
    read_back = factory.csv.create_tic_dat(dir_path)
    self.assertFalse(factory._same_data(diet_dat, read_back))
def testNetflow(self):
    """csv round-trips of the netflow example, with and without headers."""
    if not self.can_run:
        return
    tdf = TicDatFactory(**netflowSchema())
    ticDat = tdf.TicDat(**{t: getattr(netflowData(), t)
                           for t in tdf.primary_key_fields})
    self._test_generic_copy(ticDat, tdf)
    self._test_generic_copy(ticDat, tdf, ["arcs", "nodes"])
    dirPath = os.path.join(_scratchDir, "netflow")
    tdf.csv.write_directory(ticDat, dirPath)
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
    self.assertFalse(tdf.csv.find_duplicates(dirPath))
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
    # reading header-full files as header-less scrambles the data
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True,
                                       headers_present=False)
    self.assertFalse(tdf._same_data(ticDat, csvTicDat))
    # reading header-less files as header-full throws
    tdf.csv.write_directory(ticDat, dirPath, write_header=False,
                            allow_overwrite=True)
    self.assertTrue(self.firesException(
        lambda: tdf.csv.create_tic_dat(dirPath, freeze_it=True)))
    csvTicDat = tdf.csv.create_tic_dat(dirPath, headers_present=False,
                                       freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
    # the casting to floats is controlled by data types and default values
    ticDat.nodes[12] = {}
    tdf.csv.write_directory(ticDat, dirPath, allow_overwrite=True)
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
    self.assertFalse(tdf._same_data(ticDat, csvTicDat))
    # widening the "name" data type lets the numeric key round-trip
    tdf2 = TicDatFactory(**netflowSchema())
    tdf2.set_data_type("nodes", "name", strings_allowed='*', number_allowed=True)
    csvTicDat = tdf2.csv.create_tic_dat(dirPath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
    del (ticDat.nodes[12])
    ticDat.nodes['12'] = {}
    # overwriting without allow_overwrite throws
    self.assertTrue(firesException(
        lambda: tdf.csv.write_directory(ticDat, dirPath)))
    tdf.csv.write_directory(ticDat, dirPath, allow_overwrite=True)
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
def testXlsSimple(self):
    """xlsx round-trips for PanDats, including shrunken and generic ('*') schemas."""
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "diet.xlsx")
    pdf.xls.write_file(panDat, filePath)
    xlsPanDat = pdf.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, xlsPanDat))
    # a schema missing one table can still read the bigger file
    pdf_shrunk = PanDatFactory(**{k: v for k, v in dietSchema().items()
                                  if k != "nutritionQuantities"})
    self.assertTrue(len(pdf_shrunk.all_tables) == len(pdf.all_tables) - 1)
    xlsPanDatShrunk = pdf_shrunk.xls.create_pan_dat(filePath)
    self.assertTrue(pdf_shrunk._same_data(panDat, xlsPanDatShrunk))
    filePathShrunk = os.path.join(_scratchDir, "diet_shrunk.xlsx")
    # presumably fails because this file hasn't been written yet -- TODO confirm
    self.assertTrue(self.firesException(
        lambda: pdf.xls.create_pan_dat(filePathShrunk)))
    pdf_shrunk.xls.write_file(panDat, filePathShrunk)
    xlsPanDatShrunk = pdf.xls.create_pan_dat(filePathShrunk)
    self.assertTrue(pdf_shrunk._same_data(panDat, xlsPanDatShrunk))
    # a generic ('*') schema writes/reads the same data
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    pdf2.xls.write_file(panDat, filePath)
    xlsPanDat = pdf2.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, xlsPanDat))
    # repeat the basic round trips against the netflow schema
    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "netflow.xlsx")
    pdf.xls.write_file(panDat, filePath)
    panDat2 = pdf.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    xlsPanDat = pdf2.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, xlsPanDat))
def testDups(self):
    """mdb/accdb find_duplicates reports the repeat-count of each duplicated key."""
    if not _can_accdb_unit_test:
        return
    # fixed: the data fields of "one" were declared as the single field "b, c";
    # now ["b", "c"], consistent with the sql/opalytics flavors of this test
    tdf = TicDatFactory(one=[["a"], ["b", "c"]],
                        two=[["a", "b"], ["c"]],
                        three=[["a", "b", "c"], []])
    # a keyless twin schema lets us write rows that collide under the keys
    tdf2 = TicDatFactory(**{t: [[], ["a", "b", "c"]] for t in tdf.all_tables})
    td = tdf2.TicDat(**{t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3],
                            [1, 2, 2], [11, 1, 2]]
                        for t in tdf.all_tables})
    f = makeCleanPath(os.path.join(_scratchDir, "testDups.accdb"))
    tdf2.mdb.write_file(td, f)
    #shutil.copy(f, "dups.accdb") #uncomment to make readonly test file as .accdb
    dups = tdf.mdb.find_duplicates(f)
    self.assertTrue(dups == {'three': {(1, 2, 2): 2}, 'two': {(1, 2): 3},
                             'one': {1: 3, 2: 2}})
def testCsvSimple(self):
    """csv directory round-trips for PanDats, generic schemas, and decimal=','."""
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "diet_csv")
    pdf.csv.write_directory(panDat, dirPath)
    panDat2 = pdf.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    # a generic ('*') schema reads the same files
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    panDat2 = pdf2.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    # same round trips against the netflow schema
    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "netflow_csv")
    pdf.csv.write_directory(panDat, dirPath)
    panDat2 = pdf.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    pdf2.csv.write_directory(panDat, dirPath)
    panDat2 = pdf2.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    # files written with decimal=',' only read back correctly with decimal=','
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "diet_csv")
    pdf.csv.write_directory(panDat, dirPath, decimal=",")
    panDat2 = pdf.csv.create_pan_dat(dirPath)
    self.assertFalse(pdf._same_data(panDat, panDat2))
    panDat2 = pdf.csv.create_pan_dat(dirPath, decimal=",")
    self.assertTrue(pdf._same_data(panDat, panDat2))
def testSortedTables(self):
    """A foreign-key child table sorts after its parent tables."""
    factory = TicDatFactory(table3=[["PK3", "FK1", "FK2"], ["Val D"]],
                            table2=[["PK2"], ["Val A", "Val B"]],
                            table1=[["PK1"], ["Val C"]])
    # table3 references both other tables, so it must come last
    for parent, field_pair in (("table1", ["FK1", "PK1"]),
                               ("table2", ["FK2", "PK2"])):
        factory.add_foreign_key("table3", parent, field_pair)
    self.assertTrue(tlingo._sorted_tables(factory)[-1] == 'table3')
def testSimplest(self):
    """Smoke-read of the checked-in simplest.accdb fixture."""
    if not _can_unit_test:
        return
    factory = TicDatFactory(simple_table=[["pk1"], ["df1", "df2"]])
    fixture_dat = factory.mdb.create_tic_dat("simplest.accdb")
    # the fixture holds three rows, and pk 3 carries df2 == 2
    self.assertTrue(len(fixture_dat.simple_table) == 3
                    and fixture_dat.simple_table[3]["df2"] == 2)
def testSpacey(self):
    """Read-back of the checked-in spaces.accdb fixture matches the in-memory data."""
    if not _can_unit_test:
        return
    factory = TicDatFactory(**spacesSchema())
    # the exact contents baked into the fixture file
    expected_rows = {
        "a_table": {1: {"a Data 3": 3, "a Data 2": 2, "a Data 1": 1},
                    22: (1.1, 12, 12),
                    0.23: (11, 12, 11)},
        "b_table": {("1", "2", "3"): 1, ("a", "b", "b"): 12},
        "c_table": (("1", "2", "3", 4),
                    {"c Data 4": 55, "c Data 2": "b",
                     "c Data 3": "c", "c Data 1": "a"},
                    ("a", "b", "12", 24)),
    }
    expected = factory.TicDat(**expected_rows)
    fixture = "spaces.accdb"
    self.assertFalse(factory.mdb.find_duplicates(fixture))
    actual = factory.mdb.create_tic_dat(fixture, freeze_it=True)
    self.assertTrue(factory._same_data(expected, actual))
def testDiet(self):
    """Round-trip diet data through an .accdb file; check overwrite and freeze behavior."""
    if not _can_accdb_unit_test:
        return
    tdf = TicDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    filePath = makeCleanPath(os.path.join(_scratchDir, "diet.accdb"))
    tdf.mdb.write_file(ticDat, filePath)
    #shutil.copy(filePath, "diet.accdb") #uncomment to make readonly test file as .accdb
    self.assertFalse(tdf.mdb.find_duplicates(filePath))
    accdbTicDat = tdf.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, accdbTicDat))
    # an unfrozen read-back is mutable
    def changeit():
        accdbTicDat.categories["calories"]["minNutrition"] = 12
    changeit()
    self.assertFalse(tdf._same_data(ticDat, accdbTicDat))
    # overwriting without allow_overwrite throws
    self.assertTrue(self.firesException(lambda: tdf.mdb.write_file(ticDat, filePath)))
    tdf.mdb.write_file(ticDat, filePath, allow_overwrite=True)
    # a frozen read-back rejects the same mutation
    accdbTicDat = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, accdbTicDat))
    self.assertTrue(self.firesException(changeit))
    self.assertTrue(tdf._same_data(ticDat, accdbTicDat))
def testSpacey(self):
    """accdb round-trip with spaced-out field names, then spaced-out table names."""
    if not _can_accdb_unit_test:
        return
    tdf = TicDatFactory(**spacesSchema())
    spacesData = {
        "a_table": {1: {"a Data 3": 3, "a Data 2": 2, "a Data 1": 1},
                    22: (1.1, 12, 12),
                    0.23: (11, 12, 11)},
        "b_table": {("1", "2", "3"): 1, ("a", "b", "b"): 12},
        "c_table": (("1", "2", "3", 4),
                    {"c Data 4": 55, "c Data 2": "b",
                     "c Data 3": "c", "c Data 1": "a"},
                    ("a", "b", "12", 24))}
    dat = tdf.TicDat(**spacesData)
    filePath = makeCleanPath(os.path.join(_scratchDir, "spacey.accdb"))
    # pre-declare Access column types for fields holding mixed/string data
    tdf.mdb.write_schema(filePath,
                         a_table={"a Field": "double"},
                         c_table={"c Data 1": "text", "c Data 2": "text",
                                  "c Data 3": "text", "c Data 4": "int"})
    tdf.mdb.write_file(dat, filePath)
    self.assertFalse(tdf.mdb.find_duplicates(filePath))
    dat2 = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(dat, dat2))
    # rename each table to its space-separated variant and verify it still reads
    with py.connect(_connection_str(filePath)) as con:
        for t in tdf.all_tables:
            con.cursor().execute("SELECT * INTO [%s] FROM %s"
                                 % (t.replace("_", " "), t)).commit()
            con.cursor().execute("DROP TABLE %s" % t).commit()
    #shutil.copy(filePath, "spaces.accdb") #uncomment to make readonly test file as .accdb
    dat3 = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(dat, dat3))
def testNetflow(self):
    """copy_to_pandas on the netflow example: sloc lookups and pk-column handling."""
    if not self.canRun:
        return
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    oldDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    self._test_generic_free_copy(oldDat, tdf)
    self._test_generic_free_copy(oldDat, tdf, ["arcs", "nodes"])
    # copying a subset of tables only creates those attributes
    ticDat = tdf.copy_to_pandas(oldDat, ["arcs", "cost"])
    self.assertTrue(all(hasattr(ticDat, t) == (t in ["arcs", "cost"])
                        for t in tdf.all_tables))
    # sloc slices agree with the foreign-key link collections
    self.assertTrue(len(ticDat.arcs.capacity.sloc["Boston", :])
                    == len(oldDat.nodes["Boston"].arcs_source) == 0)
    self.assertTrue(len(ticDat.arcs.capacity.sloc[:, "Boston"])
                    == len(oldDat.nodes["Boston"].arcs_destination) == 2)
    self.assertTrue(all(ticDat.arcs.capacity.sloc[:, "Boston"][src] == r["capacity"]
                        for src, r in oldDat.nodes["Boston"].arcs_destination.items()))
    ticDat = tdf.copy_to_pandas(oldDat, drop_pk_columns=True)
    rebornTicDat = tdf.TicDat(**{t: getattr(ticDat, t) for t in tdf.all_tables})
    # because we have single pk field tables, dropping the pk columns is problematic
    self.assertFalse(tdf._same_data(rebornTicDat, oldDat))
    # but with the default argument all is well
    ticDat = tdf.copy_to_pandas(oldDat)
    rebornTicDat = tdf.TicDat(**{t: getattr(ticDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
    self.assertTrue(set(ticDat.inflow.columns) == {"quantity"})
    self.assertTrue(set(ticDat.nodes.columns) == {"name"})
def testCaseSpaceTableNames(self):
    """case_space_table_names toggles Title-Case-With-Spaces csv file names."""
    tdf = TicDatFactory(table_one=[["a"], ["b", "c"]],
                        table_two=[["this", "that"], []])
    dir_path = os.path.join(_scratchDir, "case_space")
    dat = tdf.TicDat(table_one=[['a', 2, 3], ['b', 5, 6]],
                     table_two=[["a", "b"], ["c", "d"], ["x", "z"]])

    def csv_exists(stem):
        # does a csv with this exact (case-sensitive) stem exist in the directory?
        return os.path.exists(os.path.join(dir_path, stem + ".csv"))

    tdf.csv.write_directory(dat, makeCleanDir(dir_path), case_space_table_names=True)
    self.assertTrue(all(csv_exists(_) for _ in ["Table One", "Table Two"]))
    self.assertFalse(any(csv_exists(_) for _ in ["table_one", "table_two"]))
    self.assertTrue(tdf._same_data(dat, tdf.csv.create_tic_dat(dir_path)))
    tdf.csv.write_directory(dat, makeCleanDir(dir_path), case_space_table_names=False)
    self.assertFalse(any(csv_exists(_) for _ in ["Table One", "Table Two"]))
    self.assertTrue(all(csv_exists(_) for _ in ["table_one", "table_two"]))
    self.assertTrue(tdf._same_data(dat, tdf.csv.create_tic_dat(dir_path)))
def testDietCleaningOpalytisThree(self):
    """raw_data=False purges rows failing predicates/foreign keys on opalytics read."""
    tdf = TicDatFactory(**dietSchema())
    tdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= 66)
    addDietForeignKeys(tdf)
    ticDat = tdf.copy_tic_dat(dietData())
    pdf = PanDatFactory(**tdf.schema())
    pdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= 66)
    addDietForeignKeys(pdf)
    input_set = create_inputset_mock(tdf, ticDat)
    # raw_data=True returns everything untouched
    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))
    # raw_data=False drops the predicate-failing rows (and their fk dependents)
    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    # removing the offending category alone isn't enough -- dependent rows
    # still differ until the resulting fk failures are removed too
    ticDat.categories.pop("fat")
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    tdf.remove_foreign_key_failures(ticDat)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
def testDietWithInfFlagging(self):
    """Infinity io flagging for PanDats across sql, csv, json and xlsx."""
    diet_pdf = PanDatFactory(**dietSchema())
    addDietDataTypes(diet_pdf)
    tdf = TicDatFactory(**dietSchema())
    dat = tdf.copy_to_pandas(tdf.copy_tic_dat(dietData()), drop_pk_columns=False)
    # infinities are written to file as this sentinel value
    diet_pdf.set_infinity_io_flag(999999999)
    core_path = os.path.join(_scratchDir, "diet_with_inf_flagging")
    diet_pdf.sql.write_file(dat, core_path + ".db")
    diet_pdf.csv.write_directory(dat, core_path + "_csv")
    diet_pdf.json.write_file(dat, core_path + ".json")
    diet_pdf.xls.write_file(dat, core_path + ".xlsx")
    for attr, f in [["sql", core_path + ".db"], ["csv", core_path + "_csv"],
                    ["json", core_path + ".json"], ["xls", core_path + ".xlsx"]]:
        # the flagged factory translates the sentinel back to inf
        dat_1 = getattr(diet_pdf, attr).create_pan_dat(f)
        self.assertTrue(diet_pdf._same_data(dat, dat_1, epsilon=1e-5))
        # clone() keeps the flag
        pdf = diet_pdf.clone()
        dat_1 = getattr(pdf, attr).create_pan_dat(f)
        self.assertTrue(pdf._same_data(dat, dat_1, epsilon=1e-5))
        # a fresh factory reads the raw sentinel instead of inf
        pdf = PanDatFactory(**diet_pdf.schema())
        dat_1 = getattr(pdf, attr).create_pan_dat(f)
        self.assertFalse(pdf._same_data(dat, dat_1, epsilon=1e-5))
        protein = dat_1.categories["name"] == "protein"
        self.assertTrue(list(dat_1.categories[protein]["maxNutrition"])[0] == 999999999)
        # manually restoring inf recovers equality
        dat_1.categories.loc[protein, "maxNutrition"] = float("inf")
        self.assertTrue(pdf._same_data(dat, dat_1, epsilon=1e-5))
def testDenormalizedErrors(self):
    """find_denormalized_sub_table_failures spots inconsistent sub-table values."""
    if not self.canRun:
        return
    c = clean_denormalization_errors
    f = utils.find_denormalized_sub_table_failures
    tdf = TicDatFactory(**spacesSchema())
    dat = tdf.TicDat(**spacesData())
    # p() regenerates the b_table DataFrame after each mutation of dat
    p = lambda: tdf.copy_to_pandas(dat, drop_pk_columns=False).b_table
    self.assertFalse(f(p(), "b Field 1", ("b Field 2", "b Field 3")))
    dat.b_table[2, 2, 3] = "boger"
    self.assertFalse(f(p(), "b Field 1", ("b Field 2", "b Field 3")))
    # grouped by "b Field 2", "b Field 1" is no longer single-valued
    chk = f(p(), "b Field 2", ("b Field 1", "b Field 3"))
    self.assertTrue(c(chk) == {2: {'b Field 1': {1, 2}}})
    dat.b_table[2, 2, 4] = "boger"
    dat.b_table[1, 'b', 'b'] = "boger"
    chk = f(p(), ["b Field 2"], ("b Field 1", "b Field 3", "b Data"))
    self.assertTrue(c(chk) == c({2: {'b Field 3': (3, 4), 'b Data': (1, 'boger'),
                                     'b Field 1': (1, 2)},
                                 'b': {'b Data': ('boger', 12),
                                       'b Field 1': ('a', 1)}}))
    # an unknown column name is reported in the exception text
    ex = self.firesException(lambda: f(p(), ["b Data"], "wtf"))
    self.assertTrue("wtf isn't a column" in ex)
    # repeat against c_table, with keyword arguments
    p = lambda: tdf.copy_to_pandas(dat, drop_pk_columns=False).c_table
    chk = f(p(), pk_fields=["c Data 1", "c Data 2"],
            data_fields=["c Data 3", "c Data 4"])
    self.assertTrue(c(chk) == {('a', 'b'): {'c Data 3': {'c', 12},
                                            'c Data 4': {24, 'd'}}})
    dat.c_table.append((1, 2, 3, 4))
    dat.c_table.append((1, 2, 1, 4))
    dat.c_table.append((1, 2, 1, 5))
    dat.c_table.append((1, 2, 3, 6))
    chk = f(p(), pk_fields=["c Data 1", "c Data 2"],
            data_fields=["c Data 3", "c Data 4"])
    self.assertTrue(c(chk) == {('a', 'b'): {'c Data 3': {'c', 12},
                                            'c Data 4': {24, 'd'}},
                               (1, 2): {'c Data 3': {3, 1},
                                        'c Data 4': {4, 5, 6}}})
def testSillyCleaningOpalyticsOne(self):
    """raw_data=False drops the rows whose cData4 fails the restricted data type."""
    tic_factory = TicDatFactory(**sillyMeSchema())
    pan_factory = PanDatFactory(**sillyMeSchema())
    # only the string 'd' is an acceptable cData4 value under both factories
    for factory in (tic_factory, pan_factory):
        factory.set_data_type("c", "cData4", number_allowed=False,
                              strings_allowed=['d'])
    silly_dat = tic_factory.TicDat(**sillyMeData())
    input_set = create_inputset_mock(tic_factory, silly_dat)
    # raw reads keep every row
    raw_pan = pan_factory.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tic_factory._same_data(pan_factory.copy_to_tic_dat(raw_pan),
                                           silly_dat))
    # cleaned reads drop the failing rows
    cleaned_pan = pan_factory.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tic_factory._same_data(pan_factory.copy_to_tic_dat(cleaned_pan),
                                            silly_dat))
    # removing the last and first c rows by hand reproduces the cleaned result
    silly_dat.c.pop()
    silly_dat.c.pop(0)
    self.assertTrue(tic_factory._same_data(pan_factory.copy_to_tic_dat(cleaned_pan),
                                           silly_dat))
def testDateTime(self):
    """datetime-typed fields and parameters round-trip through json as datetimes."""
    tdf = TicDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                        parameters=[["a"], ["b"]])
    tdf.add_parameter("p1", "Dec 15 1970", datetime=True)
    tdf.add_parameter("p2", None, datetime=True, nullable=True)
    tdf.set_data_type("table_with_stuffs", "field one", datetime=True)
    tdf.set_data_type("table_with_stuffs", "field two", datetime=True, nullable=True)
    # a mix of date-like strings, real datetimes, and None
    dat = tdf.TicDat(table_with_stuffs=[["July 11 1972", None],
                                        [datetime.datetime.now(),
                                         dateutil.parser.parse("Sept 11 2011")]],
                     parameters=[["p1", "7/11/1911"], ["p2", None]])
    self.assertFalse(tdf.find_data_type_failures(dat) or
                     tdf.find_data_row_failures(dat))
    file_one = os.path.join(_scratchDir, "datetime.json")
    tdf.json.write_file(dat, file_one)
    dat_1 = tdf.json.create_tic_dat(file_one)
    # not identical: the date-like strings come back as real datetime objects
    self.assertFalse(tdf._same_data(dat, dat_1))
    self.assertTrue(isinstance(dat_1.parameters["p1"]["b"], datetime.datetime))
    # all pk values read back as datetimes
    self.assertTrue(all(isinstance(_, datetime.datetime)
                        for _ in dat_1.table_with_stuffs))
    # all data values read back as datetimes, except preserved Nones
    self.assertTrue(all(isinstance(_, datetime.datetime) or _ is None
                        for v in dat_1.table_with_stuffs.values()
                        for _ in v.values()))
def testColumnsWithoutData(self):
    """Writing an empty TicDat to .xls/.xlsx reads back as empty."""
    factory = TicDatFactory(data=[["a"], ["b"]])
    for suffix in ("", "x"):
        # covers both the legacy .xls and the modern .xlsx writer
        target = os.path.join(_scratchDir, "no_data.xls" + suffix)
        factory.xls.write_file(factory.TicDat(), target)
        round_tripped = factory.xls.create_tic_dat(target)
        self.assertFalse(round_tripped._len_dict())
def testNetflow(self):
    """csv round-trips of the netflow example (variant without typed node names)."""
    if not self.can_run:
        return
    tdf = TicDatFactory(**netflowSchema())
    ticDat = tdf.TicDat(**{t: getattr(netflowData(), t)
                           for t in tdf.primary_key_fields})
    self._test_generic_copy(ticDat, tdf)
    self._test_generic_copy(ticDat, tdf, ["arcs", "nodes"])
    dirPath = os.path.join(_scratchDir, "netflow")
    tdf.csv.write_directory(ticDat, dirPath)
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
    self.assertFalse(tdf.csv.find_duplicates(dirPath))
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
    # reading header-full files as header-less scrambles the data
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True,
                                       headers_present=False)
    self.assertFalse(tdf._same_data(ticDat, csvTicDat))
    # reading header-less files as header-full throws
    tdf.csv.write_directory(ticDat, dirPath, write_header=False,
                            allow_overwrite=True)
    self.assertTrue(self.firesException(
        lambda: tdf.csv.create_tic_dat(dirPath, freeze_it=True)))
    csvTicDat = tdf.csv.create_tic_dat(dirPath, headers_present=False,
                                       freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
    ticDat.nodes[12] = {}
    tdf.csv.write_directory(ticDat, dirPath, allow_overwrite=True)
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
    # minor flaw - strings that are floatable get turned into floats when reading csvs
    del (ticDat.nodes[12])
    ticDat.nodes['12'] = {}
    # overwriting without allow_overwrite throws
    self.assertTrue(firesException(
        lambda: tdf.csv.write_directory(ticDat, dirPath)))
    tdf.csv.write_directory(ticDat, dirPath, allow_overwrite=True)
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
    self.assertFalse(tdf._same_data(ticDat, csvTicDat))