def test_time_stamp(self):
    """Round-trip a nullable timestamp data field through the pgsql readers/writers."""

    def frames_agree(left, right):
        # Two PanDat objects agree when "Blah" matches exactly and every
        # "Timed Info" pair is no more than a microsecond apart.
        left_df, right_df = left.table, right.table
        if list(left_df["Blah"]) != list(right_df["Blah"]):
            return False
        stamp_pairs = zip(left_df["Timed Info"], right_df["Timed Info"])
        return not any(
            abs((first - second).total_seconds()) > 1e-6
            for first, second in stamp_pairs
        )

    tdf = TicDatFactory(table=[["Blah"], ["Timed Info"]])
    tdf.set_data_type("table", "Timed Info", nullable=True)
    tdf.set_default_value("table", "Timed Info", None)

    dat = tdf.TicDat()
    dat.table[1] = dateutil.parser.parse("2014-05-01 18:47:05.069722")
    dat.table[2] = dateutil.parser.parse("2014-05-02 18:47:05.178768")

    # Write through the TicDat path, forcing explicit column types.
    pgtf = tdf.pgsql
    column_types = {
        ('table', 'Blah'): "integer",
        ('table', 'Timed Info'): "timestamp",
    }
    pgtf.write_schema(self.engine, test_schema,
                      forced_field_types=column_types)
    pgtf.write_data(dat, self.engine, test_schema,
                    dsn=self.postgresql.dsn())

    dat_2 = pgtf.create_tic_dat(self.engine, test_schema)
    self.assertTrue(tdf._same_data(dat, dat_2))
    # Data values come back as datetimes, primary keys do not.
    rows_are_datetimes = all(
        isinstance(record["Timed Info"], datetime.datetime)
        for record in dat_2.table.values())
    self.assertTrue(rows_are_datetimes)
    keys_are_datetimes = any(
        isinstance(key, datetime.datetime) for key in dat_2.table)
    self.assertFalse(keys_are_datetimes)

    # Same checks through the PanDat path.
    full_schema = tdf.schema(include_ancillary_info=True)
    pdf = PanDatFactory.create_from_full_schema(full_schema)
    pan_dat = pdf.pgsql.create_pan_dat(self.engine, test_schema)
    pan_dat_2 = pan_dat_maker(tdf.schema(), dat_2)
    self.assertTrue(frames_agree(pan_dat, pan_dat_2))

    for frame in (pan_dat.table, pan_dat_2.table):
        for label in range(len(frame)):
            blah_cell = frame.loc[label, "Blah"]
            self.assertFalse(isinstance(blah_cell, datetime.datetime))
            timed_cell = frame.loc[label, "Timed Info"]
            self.assertTrue(isinstance(timed_cell, datetime.datetime))

    # Shift one timestamp by a minute: the comparison must notice, and the
    # change must survive a write/read cycle.
    shifted = dateutil.parser.parse("2014-05-02 18:48:05.178768")
    pan_dat.table.loc[1, "Timed Info"] = shifted
    self.assertFalse(frames_agree(pan_dat, pan_dat_2))
    pdf.pgsql.write_data(pan_dat, self.engine, test_schema)
    pan_dat_2 = pdf.pgsql.create_pan_dat(self.engine, test_schema)
    self.assertTrue(frames_agree(pan_dat, pan_dat_2))

    dat.table[2] = dateutil.parser.parse("2014-05-02 18:48:05.178768")
    self.assertFalse(tdf._same_data(dat, dat_2))
def test_pgtd_active_dups(self):
    """Check find_duplicates with and without the active_fld argument."""
    if not self.can_run:
        return

    schema = test_schema + "_act_dups"

    # Key-less factory so that repeated rows can be written at all.
    tdf_1 = TicDatFactory(
        t_one=[[], ["Field One", "Field Two", "Da Active"]],
        t_two=[[], ["Field One", "Da Active"]])
    t_one_rows = [["a", "b", True], ["a", "c", True],
                  ["a", "b", False], ["a", "d", True]]
    t_two_rows = [["a", True], ["b", False], ["a", False],
                  ["b", False], ["a", False]]
    dat = tdf_1.TicDat(t_one=t_one_rows, t_two=t_two_rows)
    self.assertTrue(len(dat.t_one) == 4)
    self.assertTrue(len(dat.t_two) == 5)

    # Every field containing "Active" is stored as boolean, the rest as text.
    column_types = {}
    for table, (pks, dfs) in tdf_1.schema().items():
        for field in pks + dfs:
            column_types[table, field] = (
                "boolean" if "Active" in field else "text")

    tdf_1.pgsql.write_schema(self.engine, schema,
                             include_ancillary_info=False,
                             forced_field_types=column_types)
    tdf_1.pgsql.write_data(dat, self.engine, schema)
    read_back = tdf_1.pgsql.create_tic_dat(self.engine, schema)
    self.assertTrue(tdf_1._same_data(dat, read_back, epsilon=1e-8))

    # Re-read the same tables with primary keys declared.
    tdf = TicDatFactory(t_one=[["Field One", "Field Two"], []],
                        t_two=[["Field One"], []])
    all_rows_dups = tdf.pgsql.find_duplicates(self.engine, schema)
    self.assertTrue(all_rows_dups)
    active_rows_dups = tdf.pgsql.find_duplicates(
        self.engine, schema, active_fld="da_active")
    self.assertFalse(active_rows_dups)
def _same_data(self, obj1, obj2, epsilon=0):
    """
    Compare two data objects by converting both to TicDat form.

    :param obj1: first data object; must expose every table of this
                 factory as an attribute
    :param obj2: second data object, same requirement
    :param epsilon: forwarded unchanged to TicDatFactory._same_data
    :return: False as soon as a generic table has different column sets in
             obj1 and obj2; otherwise the result of TicDatFactory._same_data
             on the two converted copies
    """
    from ticdat import TicDatFactory

    table_schema = self.schema()
    for table_name in self.generic_tables:
        left_columns = getattr(obj1, table_name).columns
        right_columns = getattr(obj2, table_name).columns
        if set(left_columns) != set(right_columns):
            return False
        # Generic tables get a concrete, key-less schema taken from obj1.
        table_schema[table_name] = [[], list(left_columns)]

    comparer = TicDatFactory(**table_schema)
    left_copy = self._copy_to_tic_dat(obj1, keep_generics_as_df=False)
    right_copy = self._copy_to_tic_dat(obj2, keep_generics_as_df=False)
    return comparer._same_data(left_copy, right_copy, epsilon=epsilon)
def testDietWithInfFlagging(self):
    """Write the diet data with an infinity io flag and read it back three ways."""
    flag_value = 999999999
    schema = test_schema + "_diet_inf_flagging"

    flagged_tdf = diet_schema.clone()
    dat = flagged_tdf.copy_tic_dat(diet_dat)
    flagged_tdf.set_infinity_io_flag(flag_value)
    flagged_tdf.pgsql.write_schema(self.engine, schema)
    flagged_tdf.pgsql.write_data(dat, self.engine, schema)

    # The factory that wrote the data reads it back unchanged.
    from_flagged = flagged_tdf.pgsql.create_tic_dat(self.engine, schema)
    self.assertTrue(flagged_tdf._same_data(dat, from_flagged))

    # So does a clone of that factory.
    cloned_tdf = flagged_tdf.clone()
    from_clone = cloned_tdf.pgsql.create_tic_dat(self.engine, schema)
    self.assertTrue(cloned_tdf._same_data(dat, from_clone))

    # A factory built from the bare schema sees the raw flag value instead.
    bare_tdf = TicDatFactory(**diet_schema.schema())
    from_bare = bare_tdf.pgsql.create_tic_dat(self.engine, schema)
    self.assertFalse(bare_tdf._same_data(dat, from_bare))
    protein_row = from_bare.categories["protein"]
    self.assertTrue(protein_row["Max Nutrition"] == flag_value)

    # Patching the flag back to infinity restores equality.
    from_bare.categories["protein"]["Max Nutrition"] = float("inf")
    self.assertTrue(bare_tdf._same_data(dat, from_bare))
def testNullsAndInf(self):
    """Check how the same stored rows read back under nullable, +inf and -inf typing."""
    field_names = ["field one", "field two"]
    pos_inf = float("inf")

    def typed_factory(**type_settings):
        # Fresh single-table factory with the same data type on both fields.
        factory = TicDatFactory(table=[["field one"], ["field two"]])
        for field in field_names:
            factory.set_data_type("table", field, **type_settings)
        return factory

    # NOTE(review): the "_bool_defaults" suffix looks borrowed from another
    # test -- confirm no other test relies on the same schema name.
    schema = test_schema + "_bool_defaults"

    # Phase 1: nullable fields, written and read back as-is.
    tdf = typed_factory(nullable=True)
    dat = tdf.TicDat(table=[[None, 100], [200, 109], [0, 300],
                            [300, None], [400, 0]])
    tdf.pgsql.write_schema(self.engine, schema,
                           include_ancillary_info=False)
    tdf.pgsql.write_data(dat, self.engine, schema)
    dat_1 = tdf.pgsql.create_tic_dat(self.engine, schema)
    self.assertTrue(tdf._same_data(dat, dat_1))

    # Phase 2: +inf allowed and no io flag; the stored rows must equal the
    # +inf data, before and after rewriting it.
    tdf = typed_factory(max=float("inf"), inclusive_max=True)
    tdf.set_infinity_io_flag(None)
    dat_inf = tdf.TicDat(table=[[pos_inf, 100], [200, 109], [0, 300],
                                [300, pos_inf], [400, 0]])
    dat_1 = tdf.pgsql.create_tic_dat(self.engine, schema)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    tdf.pgsql.write_data(dat_inf, self.engine, schema)
    dat_1 = tdf.pgsql.create_tic_dat(self.engine, schema)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))

    # Phase 3: -inf allowed instead; the stored rows now differ from the
    # +inf data and equal the -inf data.
    tdf = typed_factory(min=-float("inf"), inclusive_min=True)
    tdf.set_infinity_io_flag(None)
    dat_1 = tdf.pgsql.create_tic_dat(self.engine, schema)
    self.assertFalse(tdf._same_data(dat_inf, dat_1))
    dat_inf = tdf.TicDat(table=[[float("-inf"), 100], [200, 109],
                                [0, 300], [300, -float("inf")],
                                [400, 0]])
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
def test_parameters(self):
    """Round-trip a parameters table holding a float inf and the string "inf"."""
    schema = test_schema + "_parameters"

    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.add_parameter("Something", 100)
    tdf.add_parameter("Different", 'boo', strings_allowed='*',
                      number_allowed=False)

    parameter_rows = [["Something", float("inf")], ["Different", "inf"]]
    dat = tdf.TicDat(parameters=parameter_rows)

    pg_io = tdf.pgsql
    pg_io.write_schema(self.engine, schema)
    pg_io.write_data(dat, self.engine, schema)
    read_back = pg_io.create_tic_dat(self.engine, schema)
    self.assertTrue(tdf._same_data(dat, read_back))
def test_missing_tables(self):
    """Read with factories that declare a table which was never written."""
    schema = test_schema + "_missing_tables"

    # Narrow factories know only "this"; wide ones add "that" on top.
    tdf_1 = TicDatFactory(this=[["Something"], ["Another"]])
    pdf_1 = PanDatFactory(**tdf_1.schema())
    wide_schema = dict(tdf_1.schema())
    wide_schema["that"] = [["What", "Ever"], []]
    tdf_2 = TicDatFactory(**wide_schema)
    pdf_2 = PanDatFactory(**tdf_2.schema())

    dat = tdf_1.TicDat(this=[["a", 2], ["b", 3], ["c", 5]])
    pan_dat = tdf_1.copy_to_pandas(dat, drop_pk_columns=False)

    # Only the narrow schema is written.
    tdf_1.pgsql.write_schema(self.engine, schema)
    tdf_1.pgsql.write_data(dat, self.engine, schema)

    # Reading with the wide factories must still reproduce the narrow data.
    pg_dat = tdf_2.pgsql.create_tic_dat(self.engine, schema)
    self.assertTrue(tdf_1._same_data(dat, pg_dat))
    pg_pan_dat = pdf_2.pgsql.create_pan_dat(self.engine, schema)
    self.assertTrue(pdf_1._same_data(pan_dat, pg_pan_dat))
def test_wtf(self):
    """Round-trip tables whose field names contain spaces, dots, parentheses and hyphens."""
    schema = "wtf"
    awkward_keys = ["Cost per Distance", "Cost per Hr. (in-transit)"]
    tdf = TicDatFactory(table_one=[awkward_keys, ["Stuff"]],
                        table_two=[["This", "That"], ["Tho"]])
    tdf.pgsql.write_schema(self.engine, schema)

    # The same row list feeds both tables.
    data = [["a", "b", 1], ["dd", "ee", 10], ["023", "210", 102.1]]
    tic_dat = tdf.TicDat(table_one=data, table_two=data)
    tdf.pgsql.write_data(tic_dat, self.engine, schema,
                         dsn=self.postgresql.dsn())

    pg_tic_dat = tdf.pgsql.create_tic_dat(self.engine, schema)
    self.assertTrue(tdf._same_data(tic_dat, pg_tic_dat))
def test_pgtd_active(self):
    """Exercise the active_fld argument of the pgsql readers and writers."""
    if not self.can_run:
        return

    schema = test_schema + "_active"

    def stored_active_values():
        # Distinct active_fld values currently stored in the categories table.
        # NOTE(review): presumably self.engine is a SQLAlchemy Engine;
        # Engine.execute() no longer exists in SQLAlchemy 2.0 -- confirm the
        # pinned version.
        result = self.engine.execute(
            f"Select active_fld from {schema}.categories")
        return set(record[0] for record in result)

    # Create the schema from a factory whose categories table carries an
    # extra leading "active_fld" data field.
    with_active = {}
    for table, (pks, dfs) in diet_schema.schema().items():
        leading = ["active_fld"] if table == "categories" else []
        with_active[table] = [pks, leading + dfs]
    tdf = TicDatFactory(**with_active)
    tdf.pgsql.write_schema(
        self.engine, schema, include_ancillary_info=False,
        forced_field_types={('categories', 'active_fld'): 'boolean'})

    # Write the plain diet data plus one extra "junk" category.
    tdf = diet_schema.clone()
    dat = tdf.copy_tic_dat(diet_dat)
    dat.categories["junk"] = {}
    tdf.pgsql.write_data(dat, self.engine, schema, active_fld="active_fld")
    self.assertTrue(stored_active_values() == {True})

    # Deactivate the junk row directly in the database.
    self.engine.execute(
        f"Update {schema}.categories set active_fld = False where name = 'junk'"
    )

    # Reads that pass active_fld must match the original diet data again.
    dat_2 = tdf.pgsql.create_tic_dat(self.engine, schema,
                                     active_fld="active_fld")
    self.assertTrue(tdf._same_data(dat_2, diet_dat, epsilon=1e-10))

    full_schema = diet_schema.schema(include_ancillary_info=True)
    pdf = PanDatFactory.create_from_full_schema(full_schema)
    pan_dat = tdf.copy_to_pandas(diet_dat, drop_pk_columns=False)
    pan_dat_2 = pdf.pgsql.create_pan_dat(self.engine, schema,
                                         active_fld="active_fld")
    self.assertTrue(pdf._same_data(pan_dat, pan_dat_2, epsilon=1e-10))
    self.assertTrue(stored_active_values() == {True, False})

    # After rewriting through the PanDat path only True flags remain.
    pdf.pgsql.write_data(pan_dat, self.engine, schema,
                         active_fld="active_fld")
    self.assertTrue(stored_active_values() == {True})
def test_true_false(self):
    """Round-trip boolean and -inf data, and check write_data rejects None."""
    if not self.can_run:
        return

    tdf = TicDatFactory(table=[["pkf"], ["df1", "df2"]])
    tdf.set_data_type("table", "df2", min=-float("inf"))
    rows = [["d1", True, 100],
            ["d2", False, 200],
            ["d3", False, -float("inf")]]
    dat = tdf.TicDat(table=rows)
    self.assertTrue(len(dat.table) == 3)
    self.assertFalse(tdf.find_data_type_failures(dat))

    pgtf = tdf.pgsql

    # Writing None must raise a TicDatError with a recognisable message.
    error_text = None
    try:
        pgtf.write_data(None, self.engine, test_schema)
    except utils.TicDatError as raised:
        error_text = str(raised)
    self.assertTrue(
        error_text is not None
        and "Not a valid TicDat object" in error_text)

    column_types = {("table", "df1"): "bool"}
    pgtf.write_schema(self.engine, test_schema,
                      forced_field_types=column_types)
    pgtf.write_data(dat, self.engine, test_schema)
    self.assertFalse(pgtf.find_duplicates(self.engine, test_schema))

    pg_tic_dat = pgtf.create_tic_dat(self.engine, test_schema)
    self.assertTrue(tdf._same_data(dat, pg_tic_dat))
def test_ints_and_strings_and_lists(self):
    """Round-trip restricted-string and integer fields, as data fields and as keys."""
    if not self.can_run:
        return

    field_pair = ["str_field", "int_field"]
    # t_one holds the pair as data fields, t_two as primary key fields.
    tdf = TicDatFactory(t_one=[[], list(field_pair)],
                        t_two=[list(field_pair), []])
    for table in tdf.all_tables:
        tdf.set_data_type(table, "str_field",
                          strings_allowed=['This', 'That'],
                          number_allowed=False)
        tdf.set_data_type(table, "int_field", must_be_int=True)

    t_one_rows = [["This", 1], ["That", 2], ["This", 111], ["That", 211]]
    t_two_rows = [["This", 10], ["That", 9]]
    dat = tdf.TicDat(t_one=t_one_rows, t_two=t_two_rows)
    self.assertFalse(tdf.find_data_type_failures(dat))
    self.assertTrue(len(dat.t_one) == 4)
    self.assertTrue(len(dat.t_two) == 2)

    pgtf = tdf.pgsql
    pgtf.write_schema(self.engine, test_schema)
    pgtf.write_data(dat, self.engine, test_schema)
    duplicates = pgtf.find_duplicates(self.engine, test_schema)
    self.assertFalse(duplicates)

    pg_tic_dat = pgtf.create_tic_dat(self.engine, test_schema)
    self.assertTrue(tdf._same_data(dat, pg_tic_dat))
def testDateTime(self):
    """Round-trip datetime keys, data fields and parameters via TicDat and PanDat."""

    def rekey_with_timestamps(tic_dat):
        # Work around pandas downcasting of the keys: re-insert every row
        # under pd.Timestamp(key). Iterates a snapshot of the keys because
        # the table is mutated inside the loop.
        table = tic_dat.table_with_stuffs
        for key in list(table):
            row = table.pop(key)
            table[pd.Timestamp(key)] = row

    schema = test_schema + "_datetime"
    tdf = TicDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                        parameters=[["a"], ["b"]])
    tdf.add_parameter("p1", "Dec 15 1970", datetime=True)
    tdf.add_parameter("p2", None, datetime=True, nullable=True)
    tdf.set_data_type("table_with_stuffs", "field one", datetime=True)
    tdf.set_data_type("table_with_stuffs", "field two", datetime=True,
                      nullable=True)

    stuff_rows = [
        [dateutil.parser.parse("July 11 1972"), None],
        [datetime.datetime.now(), dateutil.parser.parse("Sept 11 2011")],
    ]
    dat = tdf.TicDat(table_with_stuffs=stuff_rows,
                     parameters=[["p1", "7/11/1911"], ["p2", None]])
    self.assertFalse(
        tdf.find_data_type_failures(dat)
        or tdf.find_data_row_failures(dat))

    tdf.pgsql.write_schema(self.engine, schema)
    tdf.pgsql.write_data(dat, self.engine, schema)
    dat_1 = tdf.pgsql.create_tic_dat(self.engine, schema)

    # The read-back object is expected to differ from the original as a
    # whole, while keeping the same row count in every table.
    self.assertFalse(
        tdf._same_data(dat, dat_1, nans_are_same_for_data_rows=True))
    row_counts_match = all(
        len(getattr(dat, table)) == len(getattr(dat_1, table))
        for table in tdf.all_tables)
    self.assertTrue(row_counts_match)
    self.assertFalse(
        tdf.find_data_type_failures(dat_1)
        or tdf.find_data_row_failures(dat_1))

    # The p1 parameter value comes back as a datetime.
    p1_value = dat_1.parameters["p1"]["b"]
    self.assertTrue(isinstance(p1_value, datetime.datetime))

    # Keys: all datetimes, none null.
    self.assertTrue(
        all(isinstance(key, datetime.datetime)
            for key in dat_1.table_with_stuffs))
    null_key_count = sum(
        1 for key in dat_1.table_with_stuffs if pd.isnull(key))
    self.assertTrue(null_key_count == 0)

    # Data values: datetime or null, with exactly one null.
    self.assertTrue(
        all(isinstance(cell, datetime.datetime) or pd.isnull(cell)
            for row in dat_1.table_with_stuffs.values()
            for cell in row.values()))
    null_cell_count = sum(
        1
        for row in dat_1.table_with_stuffs.values()
        for cell in row.values()
        if pd.isnull(cell))
    self.assertTrue(null_cell_count == 1)

    # PanDat read path, converted back to TicDat for comparison.
    pdf = PanDatFactory.create_from_full_schema(
        tdf.schema(include_ancillary_info=True))
    pan_dat = pdf.pgsql.create_pan_dat(self.engine, schema)
    dat_2 = pdf.copy_to_tic_dat(pan_dat)
    rekey_with_timestamps(dat_2)
    self.assertTrue(
        tdf._same_data(dat_1, dat_2, nans_are_same_for_data_rows=True))

    # PanDat write path followed by another read.
    pdf.pgsql.write_data(pan_dat, self.engine, schema)
    reread_pan_dat = pdf.pgsql.create_pan_dat(self.engine, schema)
    dat_3 = pdf.copy_to_tic_dat(reread_pan_dat)
    rekey_with_timestamps(dat_3)
    self.assertTrue(
        tdf._same_data(dat_1, dat_3, nans_are_same_for_data_rows=True))