def testDateTime(self):
    """Round-trip datetime-typed fields and parameters through json.

    String cells written for datetime-typed fields come back parsed into
    datetime.datetime objects on the read, so _same_data is expected to be
    False even though the round trip is semantically faithful.
    """
    tdf = TicDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                        parameters=[["a"], ["b"]])
    tdf.add_parameter("p1", "Dec 15 1970", datetime=True)
    tdf.add_parameter("p2", None, datetime=True, nullable=True)
    tdf.set_data_type("table_with_stuffs", "field one", datetime=True)
    tdf.set_data_type("table_with_stuffs", "field two", datetime=True, nullable=True)
    # mix of string, datetime, parsed-datetime and None cells - all legal
    dat = tdf.TicDat(table_with_stuffs=[["July 11 1972", None],
                                        [datetime.datetime.now(),
                                         dateutil.parser.parse("Sept 11 2011")]],
                     parameters=[["p1", "7/11/1911"], ["p2", None]])
    self.assertFalse(tdf.find_data_type_failures(dat) or tdf.find_data_row_failures(dat))
    file_one = os.path.join(_scratchDir, "datetime.json")
    tdf.json.write_file(dat, file_one)
    dat_1 = tdf.json.create_tic_dat(file_one)
    # not byte-identical: the string-valued cells come back as datetimes
    self.assertFalse(tdf._same_data(dat, dat_1))
    self.assertTrue(isinstance(dat_1.parameters["p1"]["b"], datetime.datetime))
    # every primary key reads back as a datetime
    self.assertTrue(all(isinstance(_, datetime.datetime) for _ in dat_1.table_with_stuffs))
    # every data cell is a datetime, or None for the nullable "field two"
    self.assertTrue(all(isinstance(_, datetime.datetime) or _ is None
                        for v in dat_1.table_with_stuffs.values() for _ in v.values()))
def test_numericish_text(self):
    """Verify csv handling of numeric-looking strings such as "010".

    With default typing the csv read coerces "100" to a number and drops
    the leading zero of "010", breaking the round trip; explicit string
    typing preserves the text exactly.
    """
    dir_path = os.path.join(_scratchDir, "numericish")
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    dat = tdf.TicDat(parameters=[["a", "100"], ["b", "010"], [3, "200"], ["d", "020"]])

    # closes over the enclosing locals, so rebinding tdf/dat below changes
    # what round_trip writes and reads
    def round_trip():
        tdf.csv.write_directory(dat, makeCleanDir(dir_path))
        return tdf.csv.create_tic_dat(dir_path)

    dat2 = round_trip()
    # default typing munges the numeric-looking text
    self.assertFalse(tdf._same_data(dat, dat2))
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.set_data_type("parameters", "Key", strings_allowed='*', number_allowed=True)
    tdf.set_default_value("parameters", "Value", "")
    dat2 = round_trip()
    self.assertTrue(tdf._same_data(dat, dat2))
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.set_data_type("parameters", "Value", strings_allowed='*', number_allowed=False)
    dat = tdf.TicDat(parameters=[["a", "100"], ["b", "010"], ["c", "200"], ["d", "020"]])
    dat2 = round_trip()
    self.assertTrue(tdf._same_data(dat, dat2))
def testNulls(self):
    """Exercise None round-trips and the infinity_io_flag(None) mapping via mdb.

    With set_infinity_io_flag(None), nulls in the file are read back as the
    relevant infinity when the field's data type admits it.
    """
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    dat = tdf.TicDat(table=[[None, 100], [200, "this"], ["that", 300], [300, None], [400, "that"]])
    file_path = os.path.join(_scratchDir, "nulls.accdb")
    tdf.mdb.write_file(dat, file_path)
    dat_1 = tdf.mdb.create_tic_dat(file_path)
    self.assertTrue(tdf._same_data(dat, dat_1))
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        tdf.set_data_type("table", f, max=float("inf"), inclusive_max=True)
    tdf.set_infinity_io_flag(None)
    dat_inf = tdf.TicDat(table=[[float("inf"), 100], [200, "this"], ["that", 300],
                                [300, float("inf")], [400, "that"]])
    # the nulls written above now read back as +inf
    dat_1 = tdf.mdb.create_tic_dat(file_path)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    tdf.mdb.write_file(dat_inf, makeCleanPath(file_path))
    dat_1 = tdf.mdb.create_tic_dat(file_path)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        tdf.set_data_type("table", f, min=-float("inf"), inclusive_min=True)
    tdf.set_infinity_io_flag(None)
    # with a -inf-admitting type the same file reads back as -inf instead
    dat_1 = tdf.mdb.create_tic_dat(file_path)
    self.assertFalse(tdf._same_data(dat_inf, dat_1))
    dat_inf = tdf.TicDat(table=[[float("-inf"), 100], [200, "this"], ["that", 300],
                                [300, -float("inf")], [400, "that"]])
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
def testSillyCleaningOpalyticsOne(self):
    """Check opalytics reading purges rows failing the cData4 data type.

    raw_data=True keeps everything; raw_data=False drops the offending rows,
    which is confirmed by popping the same rows from the reference TicDat.
    """
    tdf = TicDatFactory(**sillyMeSchema())
    tdf.set_data_type("c", "cData4", number_allowed=False, strings_allowed=['d'])
    ticDat = tdf.TicDat(**sillyMeData())
    input_set = create_inputset_mock(tdf, ticDat)
    pdf = PanDatFactory(**sillyMeSchema())
    pdf.set_data_type("c", "cData4", number_allowed=False, strings_allowed=['d'])
    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))
    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    # drop the last and first rows of c; the remainder should match the purge
    ticDat.c.pop()
    ticDat.c.pop(0)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
def test_empty_text_none(self):
    """Check how None vs empty-string cells survive an xls round trip.

    NOTE(review): a later test with this same name (csv variant) appears in
    this source; if both live in the same TestCase class, the later
    definition shadows this one so it never runs - verify and rename.
    """
    # this is a naive data scientist who isn't using the parameters functionality
    filePath = os.path.join(_scratchDir, "empty.xls")
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    dat_n = tdf.TicDat(parameters=[[None, 100], ["b", 10.01], ["three", 200], ["d", None]])
    dat_s = tdf.TicDat(parameters=[["", 100], ["b", 10.01], ["three", 200], ["d", ""]])

    # writes dat_n with whatever tdf is currently bound to (rebinding below
    # changes the behavior of subsequent calls)
    def round_trip():
        tdf.xls.write_file(dat_n, filePath, allow_overwrite=True)
        return tdf.xls.create_tic_dat(filePath)

    dat2 = round_trip()
    # default typing turns None into "" on the read
    self.assertTrue(tdf._same_data(dat_s, dat2) and not tdf._same_data(dat_n, dat2))
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.set_data_type("parameters", "Key", nullable=True)
    tdf.set_default_value("parameters", "Value", None)  # this default alone will mess with number reading
    dat2 = round_trip()
    # with nullable typing the Nones are recovered instead
    self.assertTrue(not tdf._same_data(dat_s, dat2) and tdf._same_data(dat_n, dat2))
    tdf = TicDatFactory(parameters='*')
    dat = tdf.xls.create_tic_dat(filePath)
    # generic (dataframe-backed) read sees 4 rows x 2 columns
    self.assertTrue(dat.parameters.shape == (4, 2))
def testDietCleaningOpalyticsTwo(self):
    """Check opalytics purging cascades through diet foreign keys.

    raw_data=False drops categories rows failing the maxNutrition type, and
    the purge removes the dependent rows too; this is mirrored by popping
    "fat" and then removing foreign key failures on the reference data.
    """
    tdf = TicDatFactory(**dietSchema())
    addDietForeignKeys(tdf)
    tdf.set_data_type("categories", "maxNutrition", min=66, inclusive_max=True)
    ticDat = tdf.copy_tic_dat(dietData())
    input_set = create_inputset_mock(tdf, ticDat)
    pdf = PanDatFactory(**dietSchema())
    addDietForeignKeys(pdf)
    pdf.set_data_type("categories", "maxNutrition", min=66, inclusive_max=True)
    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))
    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    ticDat.categories.pop("fat")
    # still differs: the dependent nutritionQuantities rows remain
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    tdf.remove_foreign_key_failures(ticDat)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
def testDietCleaningFive(self):
    """Check opalytics purging with a row predicate plus a data type plus FKs.

    raw_data=False drops categories rows failing either the predicate or the
    minNutrition data type, and cascades through the diet foreign keys; this
    is mirrored by popping the failing categories from the reference data and
    then removing its foreign key failures.
    """
    tdf = TicDatFactory(**dietSchema())
    tdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= 66)
    tdf.set_data_type("categories", "minNutrition", max=0, inclusive_max=True)
    addDietForeignKeys(tdf)
    ticDat = tdf.copy_tic_dat(dietData())
    input_set = create_inputset_mock(tdf, ticDat)
    self.assertTrue(tdf._same_data(
        tdf.opalytics.create_tic_dat(input_set, raw_data=True), ticDat))
    ticDatPurged = tdf.opalytics.create_tic_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(ticDatPurged, ticDat))
    ticDat.categories.pop("fat")
    ticDat.categories.pop("calories")
    ticDat.categories.pop("protein")
    # still differs: the dependent nutritionQuantities rows remain
    self.assertFalse(tdf._same_data(ticDatPurged, ticDat))
    # BUG FIX: was tdf.remove_foreign_keys_failures (AttributeError) - the
    # TicDatFactory method is remove_foreign_key_failures, as used by the
    # sibling diet-cleaning test.
    tdf.remove_foreign_key_failures(ticDat)
    self.assertTrue(tdf._same_data(ticDatPurged, ticDat))
def testBooleansAndNulls(self):
    """Round-trip booleans/None through json (verbose and terse formats),
    then exercise the infinity_io_flag(None) null<->inf mapping.
    """
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    dat = tdf.TicDat(table=[[None, 100], [200, True], [False, 300], [300, None], [400, False]])
    file_one = os.path.join(_scratchDir, "boolDefaults_1.json")
    file_two = os.path.join(_scratchDir, "boolDefaults_2.json")
    tdf.json.write_file(dat, file_one, verbose=True)
    tdf.json.write_file(dat, file_two, verbose=False)
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    # both json layouts faithfully round-trip booleans and None
    self.assertTrue(tdf._same_data(dat, dat_1))
    self.assertTrue(tdf._same_data(dat, dat_2))
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        tdf.set_data_type("table", f, max=float("inf"), inclusive_max=True)
    tdf.set_infinity_io_flag(None)
    dat_inf = tdf.TicDat(table=[[float("inf"), 100], [200, True], [False, 300],
                                [300, float("inf")], [400, False]])
    # the nulls written earlier now read back as +inf
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    self.assertTrue(tdf._same_data(dat_inf, dat_2))
    tdf.json.write_file(dat_inf, file_one, verbose=True, allow_overwrite=True)
    tdf.json.write_file(dat_inf, file_two, verbose=False, allow_overwrite=True)
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    self.assertTrue(tdf._same_data(dat_inf, dat_2))
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        tdf.set_data_type("table", f, min=-float("inf"), inclusive_min=True)
    tdf.set_infinity_io_flag(None)
    # with a -inf-admitting type the same files read back as -inf instead
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertFalse(tdf._same_data(dat_inf, dat_1))
    self.assertFalse(tdf._same_data(dat_inf, dat_2))
    dat_inf = tdf.TicDat(table=[[float("-inf"), 100], [200, True], [False, 300],
                                [300, -float("inf")], [400, False]])
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    self.assertTrue(tdf._same_data(dat_inf, dat_2))
def testIntHandling(self):
    """Check must_be_int coercion on xls round trips.

    In-memory data keeps its float/int mix; reading through a factory with
    must_be_int coerces whole floats to int, while an untyped factory reads
    everything back as floats.
    """
    if not self.can_run:
        return
    tdf = TicDatFactory(boger=[["the"], ["big", "boger"]],
                        moger=[["the", "big"], ["boger"]],
                        woger=[[], ["the", "big", "boger"]])
    for t in ["boger", "moger", "woger"]:
        tdf.set_data_type(t, "big", must_be_int=True)
    # "big" values are deliberately whole-number floats (1.0, 14.0) mixed with ints
    dat = tdf.TicDat(boger={1: [1.0, "t"], "b": [12, 11.1], 12.1: [14.0, 15.0]},
                     moger={(1, 1.0): "t", ("b", 12): 11.1, (12.1, 14.0): 15.0},
                     woger=[(1, 1.0, "t"), ("b", 12, 11.1), (12.1, 14.0, 15.0)])
    filePath = os.path.join(_scratchDir, "intHandling.xls")
    tdf.xls.write_file(dat, filePath)
    dat2 = tdf.xls.create_tic_dat(filePath)
    # tdf3 has no data types at all
    tdf3 = TicDatFactory(boger=[["the"], ["big", "boger"]],
                         moger=[["the", "big"], ["boger"]],
                         woger=[[], ["the", "big", "boger"]])
    dat3 = tdf3.xls.create_tic_dat(filePath)
    self.assertFalse(any(map(tdf.find_data_type_failures, [dat, dat2, dat3])))
    self.assertTrue(all(tdf._same_data(dat, _) for _ in [dat2, dat3]))
    # in-memory: not all "big" values are ints (1.0, 14.0 are floats)
    self.assertFalse(all(isinstance(r["big"], int)
                         for r in list(dat.boger.values()) + list(dat.woger)))
    # typed read: must_be_int coerced them all to int
    self.assertTrue(all(isinstance(r["big"], int)
                        for r in list(dat2.boger.values()) + list(dat2.woger)))
    # untyped read: everything numeric comes back as float
    self.assertFalse(any(isinstance(r["big"], int)
                         for r in list(dat3.boger.values()) + list(dat3.woger)))
    self.assertTrue(all(isinstance(_.woger[1]["big"], int) for _ in [dat, dat2]))
    # same story for the "big" component of moger's compound primary keys
    self.assertFalse(all(isinstance(k[-1], int) for k in dat.moger))
    self.assertTrue(any(isinstance(k[-1], int) for k in dat.moger))
    self.assertTrue(all(isinstance(k[-1], int) for k in dat2.moger))
    self.assertFalse(any(isinstance(k[-1], int) for k in dat3.moger))
def testNetflow(self):
    """Round-trip the netflow data through a csv directory, with and without
    headers, then probe numeric-vs-string primary key handling.
    """
    if not self.can_run:
        return
    tdf = TicDatFactory(**netflowSchema())
    ticDat = tdf.TicDat(**{t: getattr(netflowData(), t) for t in tdf.primary_key_fields})
    self._test_generic_copy(ticDat, tdf)
    self._test_generic_copy(ticDat, tdf, ["arcs", "nodes"])
    dirPath = os.path.join(_scratchDir, "netflow")
    tdf.csv.write_directory(ticDat, dirPath)
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
    self.assertFalse(tdf.csv.find_duplicates(dirPath))
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
    # reading headered files as headerless treats the header row as data
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True, headers_present=False)
    self.assertFalse(tdf._same_data(ticDat, csvTicDat))
    tdf.csv.write_directory(ticDat, dirPath, write_header=False, allow_overwrite=True)
    # and reading headerless files as headered fails outright
    self.assertTrue(self.firesException(lambda: tdf.csv.create_tic_dat(dirPath, freeze_it=True)))
    csvTicDat = tdf.csv.create_tic_dat(dirPath, headers_present=False, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
    # the casting to floats is controlled by data types and default values
    ticDat.nodes[12] = {}
    tdf.csv.write_directory(ticDat, dirPath, allow_overwrite=True)
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
    # the numeric key 12 does not survive a default-typed read
    self.assertFalse(tdf._same_data(ticDat, csvTicDat))
    tdf2 = TicDatFactory(**netflowSchema())
    tdf2.set_data_type("nodes", "name", strings_allowed='*', number_allowed=True)
    csvTicDat = tdf2.csv.create_tic_dat(dirPath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
    del (ticDat.nodes[12])
    ticDat.nodes['12'] = {}
    # '12' is not a legal value for the default nodes name type, so the write fails
    self.assertTrue(firesException(lambda: tdf.csv.write_directory(ticDat, dirPath)))
    tdf.csv.write_directory(ticDat, dirPath, allow_overwrite=True)
    csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, csvTicDat))
def testSillyTwoTables(self):
    """Round-trip the two-table silly data set through a csv directory."""
    if not self.can_run:
        return
    factory = TicDatFactory(**sillyMeSchema())
    factory.set_data_type("a", "aField", strings_allowed='*', number_allowed=True)
    original_dat = factory.TicDat(**sillyMeDataTwoTables())
    target_dir = os.path.join(_scratchDir, "sillyTwoTables")
    factory.csv.write_directory(original_dat, target_dir)
    # a clean write should produce no duplicate rows
    self.assertFalse(factory.csv.find_duplicates(target_dir))
    read_back = factory.csv.create_tic_dat(target_dir)
    self.assertTrue(factory._same_data(original_dat, read_back))
def testDateTimeTwo(self):
    """Read a pandas-written xls sheet through a datetime-typed factory.

    NOTE(review): another testDateTimeTwo appears later in this source; if
    both live in the same TestCase class, the later definition shadows this
    one so it never runs - verify and rename one of them.
    """
    file = os.path.join(_scratchDir, "datetime_pd.xls")
    df = utils.pd.DataFrame({"a": list(map(utils.pd.Timestamp,
                                           ["June 13 1960 4:30PM", "Dec 11 1970 1AM",
                                            "Sept 11 2001 9:30AM"]))})
    df.to_excel(file, "Cool Runnings")
    tdf = TicDatFactory(cool_runnings=[["a"], []])
    tdf.set_data_type("cool_runnings", "a", datetime=True)
    dat = tdf.xls.create_tic_dat(file)
    # the timestamps written by pandas read back as the same set of keys
    self.assertTrue(set(dat.cool_runnings) == set(df["a"]))
def testCreateModText(self):
    """Verify OPL .mod generation maps typed fields to key/float/string decls."""
    factory = TicDatFactory(
        table1=[["string_pk", "num_pk"], ["num_field1", "string_field2"]])
    factory.set_data_type("table1", "num_pk", min=0, max=float("inf"),
                          inclusive_min=True, inclusive_max=False)
    factory.set_data_type("table1", "string_field2", number_allowed=False,
                          strings_allowed='*')
    mod_text = create_opl_mod_text(factory)
    # each field should appear with the declaration its type implies
    for expected in ("key string string_pk;", "key float num_pk;",
                     "float num_field1;", "string string_field2;"):
        self.assertTrue(expected in mod_text)
def testDateTimeTwo(self):
    # this is good test for datetime stuff
    """Read pandas-written xls and xlsx files through a datetime-typed factory.

    The xls read matches exactly; the xlsx read is compared with a sub-second
    tolerance.
    """
    file = os.path.join(_scratchDir, "datetime_pd.xls")
    df = utils.pd.DataFrame({"a": list(map(utils.pd.Timestamp,
                                           ["June 13 1960 4:30PM", "Dec 11 1970 1AM",
                                            "Sept 11 2001 9:30AM"]))})
    tdf = TicDatFactory(cool_runnings=[["a"], []])
    tdf.set_data_type("cool_runnings", "a", datetime=True)
    df.to_excel(file, "Cool Runnings")
    dat = tdf.xls.create_tic_dat(file)
    self.assertTrue(set(dat.cool_runnings) == set(df["a"]))
    # repeat with the xlsx variant of the same file
    file = file + "x"
    df.to_excel(file, "Cool Runnings")
    dat = tdf.xls.create_tic_dat(file)
    # xlsx timestamps may lose sub-microsecond precision; compare with tolerance
    for x, y in zip(sorted(dat.cool_runnings), sorted(set(df["a"]))):
        delta = x - y
        self.assertTrue(abs(delta.total_seconds()) < 1e-4)
def test_empty_text_none(self):
    """Check how None vs empty-string cells survive a csv round trip under
    different typing regimes (default, nullable key + None default,
    nullable + must_be_int value).
    """
    dir_path = os.path.join(_scratchDir, "empty_text")
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    dat_n = tdf.TicDat(parameters=[[None, 100], ["b", 10.01], ["three", 200], ["d", None]])
    dat_s = tdf.TicDat(parameters=[["", 100], ["b", 10.01], ["three", 200], ["d", ""]])

    # always writes dat_n, but with whatever tdf is currently bound to
    def round_trip():
        tdf.csv.write_directory(dat_n, makeCleanDir(dir_path))
        return tdf.csv.create_tic_dat(dir_path)

    dat2 = round_trip()
    # default typing turns None into "" on the read
    self.assertTrue(tdf._same_data(dat_s, dat2) and not tdf._same_data(dat_n, dat2))
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.set_data_type("parameters", "Key", nullable=True)
    tdf.set_default_value("parameters", "Value", None)  # this default alone will mess with number reading
    dat2 = round_trip()
    # matches neither reference: the None key survives but the Values munge
    self.assertFalse(tdf._same_data(dat_s, dat2) or tdf._same_data(dat_n, dat2))
    self.assertTrue(any(r["Value"] is None for r in dat2.parameters.values()))
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.set_data_type("parameters", "Key", nullable=True)
    tdf.set_data_type("parameters", "Value", nullable=True, must_be_int=True)
    dat2 = round_trip()
    # fully-nullable typing recovers the None-bearing data exactly
    self.assertTrue(not tdf._same_data(dat_s, dat2) and tdf._same_data(dat_n, dat2))
def testIssue45(self):
    """Regression test: leading-zero strings ("022") must survive a csv round
    trip when the reading factory types the fields appropriately.
    """
    raw_tdf = TicDatFactory(data=[["a"], ["b"]])
    tdf_nums = TicDatFactory(data=[["a"], ["b"]])
    tdf_nums.set_data_type("data", "a")
    tdf_strs = TicDatFactory(data=[["a"], ["b"]])
    tdf_strs.set_data_type("data", "b", strings_allowed='*', number_allowed=False)
    dat_nums = tdf_nums.TicDat(data=[[1, 2], [3, 4], [22, 44]])
    # NOTE(review): built via tdf_nums rather than tdf_strs/raw_tdf -
    # harmless since TicDat construction doesn't enforce data types, but
    # presumably another factory was intended; confirm.
    dat_strs = tdf_nums.TicDat(data=[["1", "2"], ["3", "4"], ["022", "0044"]])
    dirs = [os.path.join(_scratchDir, _) for _ in ["dat_nums_csv", "dat_strs_csv"]]
    raw_tdf.csv.write_directory(dat_nums, dirs[0])
    dat_nums_2 = tdf_nums.csv.create_tic_dat(dirs[0])
    raw_tdf.csv.write_directory(dat_strs, dirs[1])
    dat_strs_2 = tdf_strs.csv.create_tic_dat(dirs[1])
    self.assertTrue(raw_tdf._same_data(dat_nums, dat_nums_2))
    self.assertTrue(raw_tdf._same_data(dat_strs, dat_strs_2))
def doTest(headersPresent):
    """Exercise csv reading of the silly schema against permuted/renamed/
    truncated/keyless schema variants, with and without header rows.

    Nested helper: closes over self from the enclosing test method.
    With headers present, reads are matched by field name; without, by
    position - which flips most of the same/not-same expectations below.
    """
    tdf = TicDatFactory(**sillyMeSchema())
    for t, flds in tdf.primary_key_fields.items():
        for f in flds:
            tdf.set_data_type(t, f, number_allowed=True, strings_allowed='*')
    ticDat = tdf.TicDat(**sillyMeData())
    # schema2: b's primary key fields reordered
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    # schema3: a's data fields reordered
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    # schema4: a drops one data field
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    # schema5/schema5b: keyless variants (pks folded into the data fields,
    # in different orders)
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x, )
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    schema5b = sillyMeSchema()
    for t in ("a", "b"):
        schema5b[t][1] = _tuple(schema5b[t][0]) + _tuple(schema5b[t][1])
    schema5b["a"][0], schema5b["b"][0] = (), []
    # schema6: adds a table "d" absent from the data
    schema6 = sillyMeSchema()
    schema6["d"] = [("dField", ), []]
    tdf2, tdf3, tdf4, tdf5, tdf5b, tdf6 = (TicDatFactory(**x) for x in
                                           (schema2, schema3, schema4, schema5,
                                            schema5b, schema6))
    for tdf_ in [tdf2, tdf3, tdf4, tdf5, tdf5b, tdf6]:
        for t, flds in tdf_.primary_key_fields.items():
            for f in flds:
                tdf_.set_data_type(t, f, number_allowed=True, strings_allowed='*')
    tdf5.set_generator_tables(["a", "c"])
    tdf5b.set_generator_tables(("a", "c"))
    dirPath = makeCleanDir(os.path.join(_scratchDir, "silly"))
    tdf.csv.write_directory(ticDat, dirPath, write_header=headersPresent)
    # reordered pks: differs when matched by name, matches positionally
    ticDat2 = tdf2.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    (self.assertFalse if headersPresent else self.assertTrue)(
        tdf._same_data(ticDat, ticDat2))
    # reordered data fields: matches by name, differs positionally
    ticDat3 = tdf3.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    (self.assertTrue if headersPresent else self.assertFalse)(
        tdf._same_data(ticDat, ticDat3))
    if headersPresent:
        # dropping a field works by name: every cell read matches the original
        ticDat4 = tdf4.csv.create_tic_dat(dirPath, headers_present=headersPresent)
        for t in ("a", "b"):
            for k, v in getattr(ticDat4, t).items():
                for _k, _v in v.items():
                    self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
                # only table "b" keeps its full field set
                if set(v) == set(getattr(ticDat, t)[k]):
                    self.assertTrue(t == "b")
                else:
                    self.assertTrue(t == "a")
    else:
        # positionally, a missing column is an outright failure
        self.assertTrue(self.firesException(lambda: tdf4.csv.create_tic_dat(
            dirPath, headers_present=headersPresent)))
    ticDat5 = tdf5.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    (self.assertTrue if headersPresent else self.assertFalse)(
        tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    # generator tables read back as callables
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c)
                    and not callable(ticDat5.b))
    # schema5b's field order matches the file's positional order, so it
    # round-trips either way
    ticDat5b = tdf5b.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    self.assertTrue(tdf5b._same_data(tdf._keyless(ticDat), ticDat5b))
    self.assertTrue(callable(ticDat5b.a) and callable(ticDat5b.c)
                    and not callable(ticDat5b.b))
    ticDat6 = tdf6.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    self.assertTrue(tdf._same_data(ticDat, ticDat6))
    # _same_data driven by tdf6 trips over the extra table "d"
    self.assertTrue(firesException(lambda: tdf6._same_data(ticDat, ticDat6)))
    self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))
    # duplicate-row detection: write all-data-field files and count repeats
    allDataTdf = TicDatFactory(**{t: [[], tdf.primary_key_fields.get(t, ()) +
                                          tdf.data_fields.get(t, ())]
                                  for t in tdf.all_tables})

    def writeData(data):
        td = allDataTdf.TicDat(a=data, b=data, c=data)
        allDataTdf.csv.write_directory(td, dirPath, allow_overwrite=True,
                                       write_header=headersPresent)

    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
    ticDatMan = tdf.csv.create_tic_dat(dirPath, headers_present=headersPresent,
                                       freeze_it=True)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[(1, 20, 30)]["bData"] == 40)
    rowCount = tdf.csv.find_duplicates(dirPath, headers_present=headersPresent)
    # table a (single-field pk) has key 1 appearing twice
    self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1}
                    and rowCount["a"][1] == 2)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 20, 30, 12)])
    rowCount = tdf.csv.find_duplicates(dirPath, headers_present=headersPresent)
    # now a's key 1 appears three times and b's compound key (1,20,30) twice
    self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1}
                    and rowCount["a"][1] == 3)
    self.assertTrue(set(rowCount["b"]) == {(1, 20, 30)}
                    and rowCount["b"][1, 20, 30] == 2)
def testSilly(self):
    """Exercise xls reading of the silly schema against permuted/renamed/
    truncated/keyless schema variants, plus duplicate detection, header
    casing, headerless reads, and None handling in .xls vs .xlsx.
    """
    if not self.can_run:
        return
    tdf = TicDatFactory(**sillyMeSchema())
    ticDat = tdf.TicDat(**sillyMeData())
    # schema2: b's primary key fields reordered
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    # schema3: a's data fields reordered
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    # schema4: a drops one data field
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    # schema5: keyless variant (pks folded into the data fields)
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x, )
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    # schema6: adds a table "d" absent from the data
    schema6 = sillyMeSchema()
    schema6["d"] = [["dField"], ()]
    tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in
                                    (schema2, schema3, schema4, schema5, schema6))
    tdf5.set_generator_tables(("a", "c"))
    filePath = os.path.join(_scratchDir, "silly.xls")
    tdf.xls.write_file(ticDat, filePath)
    # xls reads match fields by name: reordered pks differ, reordered data matches
    ticDat2 = tdf2.xls.create_tic_dat(filePath)
    self.assertFalse(tdf._same_data(ticDat, ticDat2))
    ticDat3 = tdf3.xls.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat3))
    ticDat4 = tdf4.xls.create_tic_dat(filePath)
    for t in ["a", "b"]:
        for k, v in getattr(ticDat4, t).items():
            for _k, _v in v.items():
                self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
            # only table "b" keeps its full field set under schema4
            if set(v) == set(getattr(ticDat, t)[k]):
                self.assertTrue(t == "b")
            else:
                self.assertTrue(t == "a")
    ticDat5 = tdf5.xls.create_tic_dat(filePath, treat_inf_as_infinity=False)
    self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    # generator tables read back as callables
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c)
                    and not callable(ticDat5.b))
    ticDat6 = tdf6.xls.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat6))
    # _same_data driven by tdf6 trips over the extra table "d"
    self.assertTrue(firesException(lambda: tdf6._same_data(ticDat, ticDat6)))
    self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))

    def writeData(data, write_header="same"):
        """Write raw rows to every sheet via xlwt (and pandas for the xlsx
        twin), with optional lower-cased, duplicated, or absent headers."""
        assert filePath.endswith(".xls")
        assert not write_header or write_header in ("lower", "same", "duped")
        import xlwt
        book = xlwt.Workbook()
        for t in tdf.all_tables:
            sheet = book.add_sheet(t)
            if write_header:
                all_fields = tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())
                for i, f in enumerate((2 if write_header == "duped" else 1) * all_fields):
                    sheet.write(0, i, f.lower() if write_header == "lower"
                                or i >= len(all_fields) else f)
            for rowInd, row in enumerate(data):
                for fieldInd, cellValue in enumerate(
                        (2 if write_header == "duped" else 1) * row):
                    sheet.write(rowInd + (1 if write_header else 0), fieldInd, cellValue)
        if os.path.exists(filePath):
            os.remove(filePath)
        book.save(filePath)
        if write_header in ["lower", "same"]:  # will use pandas to generate the xlsx file version
            file_path_x = filePath + "x"
            if os.path.exists(file_path_x):
                os.remove(file_path_x)
            writer = utils.pd.ExcelWriter(file_path_x)
            for t, (pks, dfs) in tdf.schema().items():
                fields = pks + dfs
                if write_header == "lower":
                    fields = [_.lower() for _ in fields]
                d = {f: [] for f in fields}
                for row in data:
                    for f, c in zip(fields, row):
                        d[f].append(c)
                utils.pd.DataFrame(d).to_excel(writer, t, index=False)
            writer.save()

    # duplicated headers make the read fail
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)], write_header="duped")
    self.assertTrue(self.firesException(
        lambda: tdf.xls.create_tic_dat(filePath, freeze_it=True)))
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
    ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
    for f in [filePath, filePath + "x"]:
        rowCount = tdf.xls.find_duplicates(f)
        # table a (single-field pk): key 1 appears twice
        self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1}
                        and rowCount["a"][1] == 2)
    # header matching is case-insensitive
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)], write_header="lower")
    ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
    for f in [filePath, filePath + "x"]:
        rowCount = tdf.xls.find_duplicates(f)
        self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1}
                        and rowCount["a"][1] == 2)
    # headerless files need headers_present=False
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)], write_header=False)
    self.assertTrue(self.firesException(
        lambda: tdf.xls.create_tic_dat(filePath, freeze_it=True)))
    ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True, headers_present=False)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
    rowCount = tdf.xls.find_duplicates(filePath, headers_present=False)
    self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1}
                    and rowCount["a"][1] == 2)
    ticDat.a["theboger"] = (1, None, 12)
    tdf.xls.write_file(ticDat, filePath, allow_overwrite=True)
    ticDatNone = tdf.xls.create_tic_dat(filePath, freeze_it=True)
    # THIS IS A FLAW - but a minor one. None's are hard to represent. It is turning into the empty string here.
    # not sure how to handle this, but documenting for now.
    self.assertFalse(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == "")
    # the workaround for this flaw is to set the data type to be nullabe but not allow the empty string
    tdfwa = TicDatFactory(**sillyMeSchema())
    tdfwa.set_data_type("a", "aData2", nullable=True)
    ticDatNone = tdfwa.xls.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
    # checking the same thing with .xlsx - using openpyxl, None is indeed recovered even without tdfwa munging!
    tdf.xls.write_file(ticDat, filePath + "x", allow_overwrite=True)
    ticDatNone = tdf.xls.create_tic_dat(filePath + "x", freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
    ticDatNone = tdfwa.xls.create_tic_dat(filePath + "x", freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 20, 30, 12)])
    for f in [filePath, filePath + "x"]:
        rowCount = tdf.xls.find_duplicates(f)
        # a's key 1 appears three times, b's compound key (1,20,30) twice
        self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1}
                        and rowCount["a"][1] == 3)
        self.assertTrue(set(rowCount["b"]) == {(1, 20, 30)}
                        and rowCount["b"][1, 20, 30] == 2)
def testEight(self):
    """Exercise find_data_type_failures / replace_data_type_failures on the
    diet schema, then foreign-key link behavior and type-failure replacement
    on the netflow schema.

    Note the two makeIt closures both read the enclosing tdf local, so each
    rebinding of tdf below changes what the next makeIt() builds/validates.
    """
    tdf = TicDatFactory(**dietSchema())

    def makeIt():
        # frozen diet data: food "b" has a None cost, most qtys are 5
        rtn = tdf.TicDat()
        rtn.foods["a"] = 12
        rtn.foods["b"] = None
        rtn.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
        rtn.categories["2"] = [10, 20]
        for f, p in itertools.product(rtn.foods, rtn.categories):
            rtn.nutritionQuantities[f, p] = 5
        rtn.nutritionQuantities['a', 2] = 12
        return tdf.freeze_me(rtn)

    dat = makeIt()
    # no data types configured yet, so nothing fails
    self.assertFalse(tdf.find_data_type_failures(dat))
    tdf = TicDatFactory(**dietSchema())
    tdf.set_data_type("foods", "cost", nullable=False)
    tdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False,
                      max=12, inclusive_max=True)
    tdf.set_default_value("foods", "cost", 2)
    dat = makeIt()
    failed = tdf.find_data_type_failures(dat)
    self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
    # qty == 5 fails the exclusive min on four rows
    self.assertTrue(set(failed['nutritionQuantities', 'qty'].pks) ==
                    {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
    self.assertTrue(failed['nutritionQuantities', 'qty'].bad_values == (5,))
    # no valid default for qty, so replacement without explicit values fails
    ex = self.firesException(lambda: tdf.replace_data_type_failures(tdf.copy_tic_dat(dat)))
    self.assertTrue(all(_ in ex for _ in ("replacement value", "nutritionQuantities", "qty")))
    fixedDat = tdf.replace_data_type_failures(
        tdf.copy_tic_dat(dat), replacement_values={("nutritionQuantities", "qty"): 5.001})
    self.assertFalse(tdf.find_data_type_failures(fixedDat) or tdf._same_data(fixedDat, dat))
    self.assertTrue(all(fixedDat.nutritionQuantities[pk]["qty"] == 5.001
                        for pk in failed['nutritionQuantities', 'qty'].pks))
    # the None cost was replaced by the field default of 2
    self.assertTrue(fixedDat.foods["a"]["cost"] == 12 and fixedDat.foods["b"]["cost"] == 2
                    and fixedDat.nutritionQuantities['a', 2]["qty"] == 12)
    tdf = TicDatFactory(**dietSchema())
    tdf.set_data_type("foods", "cost", nullable=False)
    tdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False,
                      max=12, inclusive_max=True)
    # explicit replacement values give the same fixed data
    fixedDat2 = tdf.replace_data_type_failures(
        tdf.copy_tic_dat(dat),
        replacement_values={("nutritionQuantities", "qty"): 5.001, ("foods", "cost"): 2})
    self.assertTrue(tdf._same_data(fixedDat, fixedDat2))
    tdf = TicDatFactory(**dietSchema())
    tdf.set_data_type("foods", "cost", nullable=True)
    tdf.set_data_type("nutritionQuantities", "qty", number_allowed=False)
    failed = tdf.find_data_type_failures(dat)
    # every qty is numeric, so every nutritionQuantities row fails
    self.assertTrue(set(failed) == {('nutritionQuantities', 'qty')})
    self.assertTrue(set(failed['nutritionQuantities', 'qty'].pks) ==
                    set(dat.nutritionQuantities))
    ex = self.firesException(lambda: tdf.replace_data_type_failures(tdf.copy_tic_dat(dat)))
    self.assertTrue(all(_ in ex for _ in ("replacement value", "nutritionQuantities", "qty")))
    tdf = TicDatFactory(**dietSchema())
    tdf.set_data_type("foods", "cost")
    # default numeric type: the None cost is replaced by 0
    fixedDat = tdf.replace_data_type_failures(tdf.copy_tic_dat(makeIt()))
    self.assertTrue(fixedDat.foods["a"]["cost"] == 12 and fixedDat.foods["b"]["cost"] == 0)
    # foreign key links only exist once explicitly enabled
    tdf = TicDatFactory(**netflowSchema())
    addNetflowForeignKeys(tdf)
    dat = tdf.copy_tic_dat(netflowData(), freeze_it=1)
    self.assertFalse(hasattr(dat.nodes["Detroit"], "arcs_source"))
    tdf = TicDatFactory(**netflowSchema())
    addNetflowForeignKeys(tdf)
    tdf.enable_foreign_key_links()
    dat = tdf.copy_tic_dat(netflowData(), freeze_it=1)
    self.assertTrue(hasattr(dat.nodes["Detroit"], "arcs_source"))
    tdf = TicDatFactory(**netflowSchema())

    def makeIt():
        # lazily configures FK links on the enclosing tdf, then overwrites
        # some arc capacities with destination-name strings
        if not tdf.foreign_keys:
            tdf.enable_foreign_key_links()
            addNetflowForeignKeys(tdf)
        orig = netflowData()
        rtn = tdf.copy_tic_dat(orig)
        for n in rtn.nodes["Detroit"].arcs_source:
            rtn.arcs["Detroit", n] = n
        self.assertTrue(all(len(getattr(rtn, t)) == len(getattr(orig, t))
                            for t in tdf.all_tables))
        return tdf.freeze_me(rtn)

    dat = makeIt()
    self.assertFalse(tdf.find_data_type_failures(dat))
    tdf = TicDatFactory(**netflowSchema())
    tdf.set_data_type("arcs", "capacity", strings_allowed="*")
    dat = makeIt()
    self.assertFalse(tdf.find_data_type_failures(dat))
    tdf = TicDatFactory(**netflowSchema())
    tdf.set_data_type("arcs", "capacity",
                      strings_allowed=["Boston", "Seattle", "lumberjack"])
    dat = makeIt()
    # only the "New York" capacity string is outside the allowed list
    failed = tdf.find_data_type_failures(dat)
    self.assertTrue(failed == {('arcs', 'capacity'):
                               (("New York",), (("Detroit", "New York"),))})
    fixedDat = tdf.replace_data_type_failures(tdf.copy_tic_dat(makeIt()))
    netflowData_ = tdf.copy_tic_dat(netflowData())
    self.assertFalse(tdf.find_data_type_failures(fixedDat) or
                     tdf._same_data(dat, netflowData_))
    fixedDat = tdf.copy_tic_dat(tdf.replace_data_type_failures(
        tdf.copy_tic_dat(makeIt()),
        {("arcs", "capacity"): 80, ("cost", "cost"): "imok"}))
    # restore the two remaining string capacities to the original numbers
    fixedDat.arcs["Detroit", "Boston"] = 100
    fixedDat.arcs["Detroit", "Seattle"] = 120
    self.assertTrue(tdf._same_data(fixedDat, netflowData_))
def testReadModText(self):
    """Exercise read_opl_text over a range of single-table schemas.

    Covers: string pk with mixed string/numeric data fields, pk-only tables
    (string and numeric), all-numeric rows, string-only data, and a numeric
    pk paired with a string data field (including 0 vs 0.0 key lookup).
    """
    # String pk, one string data field and two numeric data fields.
    tdf1 = TicDatFactory(test_1=[["sf1"], ["sf2", "nf1", "nf2"]])
    tdf1.set_data_type("test_1", "sf2", number_allowed=False, strings_allowed='*')
    test_str = 'test_1 = {<"s1" "s2" 1 2> <"s3" "s4" 0 0>}'
    test_dat = read_opl_text(tdf1, test_str, False)
    self.assertTrue(test_dat.test_1["s1"]["sf2"] == "s2")
    self.assertTrue(test_dat.test_1["s1"]["nf2"] == 2)
    # Fix: the parsed primary keys are "s1" and "s3". The original assertion
    # read test_1["s2"], which isn't a parsed row; since non-frozen TicDat
    # tables auto-create rows with default 0 on access, it passed vacuously
    # (and mutated test_dat). Check the real second row instead.
    self.assertTrue(test_dat.test_1["s3"]["nf1"] == 0)

    # Table consisting of a single string primary-key field.
    tdf2 = TicDatFactory(test_2=[["sf1"], []])
    test_str = 'test_2 = {<"s3">}'
    test_dat = read_opl_text(tdf2, test_str, False)
    self.assertTrue(list(test_dat.test_2.keys())[0] == "s3")

    # Table consisting of a single numeric primary-key field.
    tdf3 = TicDatFactory(test_3=[["nf1"], []])
    tdf3.set_data_type("test_3", "nf1", min=0, max=float("inf"),
                       inclusive_min=True, inclusive_max=False)
    test_str = 'test_3 = {<6> <5>}'
    test_dat = read_opl_text(tdf3, test_str, False)
    self.assertTrue(6 in test_dat.test_3.keys())
    self.assertTrue(5 in test_dat.test_3.keys())
    self.assertTrue(len(test_dat.test_3.keys()) == 2)

    # Numeric pk with three numeric data fields, all typed identically.
    tdf4 = TicDatFactory(test_4=[["nf1"], ["nf2", "nf3", "nf4"]])
    for fld in ("nf1", "nf2", "nf3", "nf4"):
        tdf4.set_data_type("test_4", fld, min=0, max=float("inf"),
                           inclusive_min=True, inclusive_max=False)
    test_str = 'test_4 = {<7 0 809 9>}'
    test_dat = read_opl_text(tdf4, test_str, False)
    self.assertTrue(7 in test_dat.test_4.keys())
    self.assertTrue(len(test_dat.test_4[7]) == 3)
    self.assertTrue(test_dat.test_4[7]["nf3"] == 809)

    # String pk with a single string data field.
    tdf5 = TicDatFactory(test_5=[["sf1"], ["sf2"]])
    tdf5.set_data_type("test_5", "sf2", number_allowed=False, strings_allowed='*')
    test_str = 'test_5 = {<"s4" "s5">}'
    test_dat = read_opl_text(tdf5, test_str, False)
    self.assertTrue("s4" in test_dat.test_5.keys())
    self.assertTrue(test_dat.test_5["s4"]["sf2"] == "s5")

    # Numeric pk with a string data field; the key 0 is also reachable as 0.0.
    tdf6 = TicDatFactory(test_6=[["nf1"], ["sf1"]])
    tdf6.set_data_type("test_6", "nf1", min=0, max=float("inf"),
                       inclusive_min=True, inclusive_max=False)
    tdf6.set_data_type("test_6", "sf1", number_allowed=False, strings_allowed='*')
    test_str = 'test_6 = {<0 "s6">}'
    test_dat = read_opl_text(tdf6, test_str, False)
    self.assertTrue(0 in test_dat.test_6.keys())
    self.assertTrue(test_dat.test_6[0.0]['sf1'] == "s6")
def testDiet(self):
    """Round-trip the diet schema through copy_to_pandas and back.

    Builds a frozen TicDat from dietData(), copies it to pandas, and checks
    that (a) per-cell values survive the copy, (b) the pandas tables can be
    turned back into an equivalent TicDat under the original schema, a fully
    generic ('*') schema, and a mixed schema.
    """
    if not self.canRun:
        return
    tdf = TicDatFactory(**dietSchema())
    tdf.enable_foreign_key_links()
    # Frozen baseline TicDat populated from the sample diet data.
    oldDat = tdf.freeze_me(
        tdf.TicDat(
            **{t: getattr(dietData(), t) for t in tdf.primary_key_fields}))
    self._test_generic_free_copy(oldDat, tdf)
    self._test_generic_free_copy(oldDat, tdf, ["nutritionQuantities"])
    # Pandas copy: each table becomes a DataFrame-like object with one
    # column per data field.
    ticDat = tdf.copy_to_pandas(oldDat)
    # Cell-by-cell equality between the TicDat rows and the pandas columns.
    for k in oldDat.foods:
        self.assertTrue(oldDat.foods[k]["cost"] == ticDat.foods.cost[k])
    for k in oldDat.categories:
        self.assertTrue(oldDat.categories[k]["minNutrition"] ==
                        ticDat.categories.minNutrition[k])
    for k1, k2 in oldDat.nutritionQuantities:
        self.assertTrue(oldDat.nutritionQuantities[k1, k2]["qty"] ==
                        ticDat.nutritionQuantities.qty[k1, k2])
    nut = ticDat.nutritionQuantities
    # Plain pandas .loc raises on labels absent from the index ("fatty",
    # "chickeny"); sloc — presumably ticdat's forgiving slicing accessor that
    # returns empty for missing labels (TODO confirm) — does not.
    self.assertTrue(firesException(lambda: nut.qty.loc[:, "fatty"]))
    self.assertTrue(firesException(lambda: nut.qty.loc["chickeny", :]))
    self.assertFalse(firesException(lambda: nut.qty.sloc[:, "fatty"]))
    self.assertFalse(firesException(lambda: nut.qty.sloc["chickeny", :]))
    # Missing labels yield empty (zero-sum) slices via sloc.
    self.assertTrue(0 == sum(nut.qty.sloc[:, "fatty"]) ==
                    sum(nut.qty.sloc["chickeny", :]))
    # For labels that do exist, sloc, loc and a manual sum over the original
    # rows all agree.
    self.assertTrue(
        sum(nut.qty.sloc[:, "fat"]) == sum(nut.qty.loc[:, "fat"]) ==
        sum(r["qty"] for (f, c), r in oldDat.nutritionQuantities.items()
            if c == "fat"))
    self.assertTrue(
        sum(nut.qty.sloc["chicken", :]) == sum(nut.qty.loc["chicken", :]) ==
        sum(r["qty"] for (f, c), r in oldDat.nutritionQuantities.items()
            if f == "chicken"))
    # The pandas tables can rebuild an equivalent TicDat under the schema.
    rebornTicDat = tdf.TicDat(
        **{t: getattr(ticDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
    # Fully generic factory: every table is '*' (schema-less), so typed
    # operations like set_data_type must fail.
    tdf2 = TicDatFactory(**{t: '*' for t in tdf.all_tables})
    self.assertTrue(
        firesException(
            lambda: tdf2.set_data_type("nutritionQuantities", "qty")))
    genTicDat = tdf2.TicDat(
        **{t: getattr(ticDat, t) for t in tdf.all_tables})
    # Generic tables still expose the same cell values (pandas-style access).
    for k in oldDat.categories:
        self.assertTrue(oldDat.categories[k]["minNutrition"] ==
                        genTicDat.categories.minNutrition[k])
    for k1, k2 in oldDat.nutritionQuantities:
        self.assertTrue(oldDat.nutritionQuantities[k1,
            k2]["qty"] == genTicDat.nutritionQuantities.qty[k1, k2])
    # A generic TicDat is valid only for the generic factory, not the typed one.
    self.assertFalse(tdf.good_tic_dat_object(genTicDat))
    self.assertTrue(tdf2.good_tic_dat_object(genTicDat))
    rebornTicDat = tdf.TicDat(
        **{t: getattr(genTicDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
    # as_dict round trip loses the index names; restore them from the schema
    # before rebuilding a typed TicDat.
    rebornGenTicDat = tdf2.TicDat(**tdf2.as_dict(genTicDat))
    for t, pks in tdf.primary_key_fields.items():
        getattr(rebornGenTicDat, t).index.names = pks
    rebornTicDat = tdf.TicDat(
        **{t: getattr(rebornGenTicDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
    # Mixed factory: diet schema except "categories" is generic; typed
    # operations on the still-typed tables must work.
    tdf3 = TicDatFactory(**dict(dietSchema(), **{"categories": '*'}))
    self.assertFalse(
        firesException(
            lambda: tdf3.set_data_type("nutritionQuantities", "qty")))
    mixTicDat = tdf3.TicDat(
        **{t: getattr(ticDat, t) for t in tdf.all_tables})
    # Generic table uses pandas-style access; typed table uses dict-style.
    for k in oldDat.categories:
        self.assertTrue(oldDat.categories[k]["minNutrition"] ==
                        mixTicDat.categories.minNutrition[k])
    for k1, k2 in oldDat.nutritionQuantities:
        self.assertTrue(oldDat.nutritionQuantities[k1, k2]["qty"] ==
                        mixTicDat.nutritionQuantities[k1, k2]["qty"])
    # Each TicDat validates only against its own factory's notion of schema.
    self.assertFalse(tdf2.good_tic_dat_object(mixTicDat))
    self.assertFalse(tdf3.good_tic_dat_object(genTicDat))
    self.assertTrue(tdf3.good_tic_dat_object(mixTicDat))
    rebornTicDat = tdf.TicDat(
        **{t: getattr(mixTicDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))