def find_duplicates(self, json_file_path, from_pandas=False):
    """
    Find the row counts for duplicated rows.

    :param json_file_path: A json file path. It should encode a dictionary with table
                           names as keys.

    :param from_pandas: boolean. If truthy, then use pandas json readers. See
                        PanDatFactory json readers for more details.

    :return: A dictionary keyed by the names of the primary-keyed tables. Each value is
             itself a dictionary, mapping the primary key values found in that table to
             the number of json records sharing that primary key. Counts smaller than 2
             are pruned off, since a single occurrence is not a duplicate.
    """
    _standard_verify(self.tic_dat_factory)
    if not from_pandas:
        table_dict = self._create_jdict(json_file_path)
    else:
        # round-trip through a PanDatFactory built from the same schema, then flatten
        # each DataFrame into the list-of-tuples shape the duplicate finder expects
        from ticdat import PanDatFactory
        full_schema = self.tic_dat_factory.schema(include_ancillary_info=True)
        pan_factory = PanDatFactory.create_from_full_schema(full_schema)
        pan_dat = pan_factory.json.create_pan_dat(json_file_path)
        table_dict = {}
        for table in pan_factory.all_tables:
            rows = getattr(pan_dat, table).itertuples(index=False)
            table_dict[table] = [tuple(row) for row in rows]
    dups = find_duplicates_from_dict_ticdat(self.tic_dat_factory, table_dict)
    return dups or {}
def _add_inflow_table(full_schema_dict):
    # As per the clone docstring, this function takes a full_schema_dict argument and
    # returns the PanDatFactory we want to make. Here, the only adjustment needed is
    # registering the inflow table (PKs: Commodity, Node; data field: Quantity).
    full_schema_dict["tables_fields"]["inflow"] = [["Commodity", "Node"], ["Quantity"]]
    return PanDatFactory.create_from_full_schema(full_schema_dict)
def test_issue_68_pd(self):
    # kind of a dumb test since the numpy types tend to be the ones pandas creates
    # naturally, but no harm in being rigorous
    if not self.can_run:
        return
    tdf = diet_schema.clone()
    pdf = PanDatFactory.create_from_full_schema(
        tdf.schema(include_ancillary_info=True))
    pgtf = pdf.pgsql
    pgtf.write_schema(self.engine, test_schema, include_ancillary_info=False)
    dat = tdf.copy_tic_dat(diet_dat)
    import numpy
    # seed the TicDat with explicit numpy scalar types before copying to pandas
    dat.categories["protein"]["Max Nutrition"] = numpy.int64(200)
    dat.categories["fat"]["Max Nutrition"] = numpy.float64(65)
    pan_dat = pdf.copy_pan_dat(
        tdf.copy_to_pandas(dat, drop_pk_columns=False))
    pgtf.write_data(pan_dat, self.engine, test_schema)
    pg_pan_dat = pgtf.create_pan_dat(self.engine, test_schema)
    self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat))
    # also verify the round trip when DataFrame columns are forced to hold numpy
    # scalars row-by-row
    from ticdat.pandatfactory import _faster_df_apply
    pan_dat.categories["Max Nutrition"] = _faster_df_apply(
        pan_dat.categories, lambda row: numpy.int64(row["Max Nutrition"]))
    pan_dat.foods["Cost"] = _faster_df_apply(
        pan_dat.foods, lambda row: numpy.float64(row["Cost"]))
    # NOTE(review): removed a stray debugging snippet here that imported
    # framework_utils.helper_utils.memo and called memo(pan_dat) — framework_utils is
    # not a dependency of this project and the import would fail at test runtime.
    pgtf.write_data(pan_dat, self.engine, test_schema)
    pg_pan_dat = pgtf.create_pan_dat(self.engine, test_schema)
    self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat))
def test_time_stamp(self):
    """Round-trip datetime values through a postgres "timestamp" column, for both TicDat
    and PanDat, verifying datetimes appear in data fields but never in primary keys."""
    tdf = TicDatFactory(table=[["Blah"], ["Timed Info"]])
    tdf.set_data_type("table", "Timed Info", nullable=True)
    tdf.set_default_value("table", "Timed Info", None)
    dat = tdf.TicDat()
    # microsecond-resolution values exercise timestamp precision on the round trip
    dat.table[1] = dateutil.parser.parse("2014-05-01 18:47:05.069722")
    dat.table[2] = dateutil.parser.parse("2014-05-02 18:47:05.178768")
    pgtf = tdf.pgsql
    # force the key column to integer and the data column to timestamp in postgres
    pgtf.write_schema(self.engine, test_schema,
                      forced_field_types={('table', 'Blah'): "integer",
                                          ('table', 'Timed Info'): "timestamp"})
    pgtf.write_data(dat, self.engine, test_schema, dsn=self.postgresql.dsn())
    dat_2 = pgtf.create_tic_dat(self.engine, test_schema)
    self.assertTrue(tdf._same_data(dat, dat_2))
    # data values come back as datetime.datetime ...
    self.assertTrue(all(isinstance(row["Timed Info"], datetime.datetime)
                        for row in dat_2.table.values()))
    # ... but the integer primary keys must not
    self.assertFalse(any(isinstance(k, datetime.datetime) for k in dat_2.table))
    pdf = PanDatFactory.create_from_full_schema(
        tdf.schema(include_ancillary_info=True))

    def same_data(pan_dat, pan_dat_2):
        # local comparison helper: tolerates sub-microsecond drift in the timestamps
        df1, df2 = pan_dat.table, pan_dat_2.table
        if list(df1["Blah"]) != list(df2["Blah"]):
            return False
        for dt1, dt2 in zip(df1["Timed Info"], df2["Timed Info"]):
            delta = dt1 - dt2
            if abs(delta.total_seconds()) > 1e-6:
                return False
        return True

    pan_dat = pdf.pgsql.create_pan_dat(self.engine, test_schema)
    pan_dat_2 = pan_dat_maker(tdf.schema(), dat_2)
    self.assertTrue(same_data(pan_dat, pan_dat_2))
    # same key/data datetime expectations hold for the pandas representation
    for df in [_.table for _ in [pan_dat, pan_dat_2]]:
        for i in range(len(df)):
            self.assertFalse(isinstance(df.loc[i, "Blah"], datetime.datetime))
            self.assertTrue(isinstance(df.loc[i, "Timed Info"], datetime.datetime))
    # perturb one timestamp by a minute: the tolerant comparison should now fail
    pan_dat.table.loc[1, "Timed Info"] = dateutil.parser.parse(
        "2014-05-02 18:48:05.178768")
    self.assertFalse(same_data(pan_dat, pan_dat_2))
    # writing the perturbed PanDat and re-reading restores agreement
    pdf.pgsql.write_data(pan_dat, self.engine, test_schema)
    pan_dat_2 = pdf.pgsql.create_pan_dat(self.engine, test_schema)
    self.assertTrue(same_data(pan_dat, pan_dat_2))
    # the original TicDat, similarly perturbed, no longer matches the first read-back
    dat.table[2] = dateutil.parser.parse("2014-05-02 18:48:05.178768")
    self.assertFalse(tdf._same_data(dat, dat_2))
def test_pdf_2(self):
    # Same timing exercise as test_pdf, but with an infinity io flag set, which cuts
    # the munging work and therefore tightens the per-step time budgets.
    schema_name = test_schemas[2]
    factory = PanDatFactory.create_from_full_schema(
        kehaar.input_schema.schema(include_ancillary_info=True))
    factory.set_infinity_io_flag("N/A")  # this speeds things up, since less munging
    csv_dir = os.path.join(_codeDir(), "bernardo_slowby")
    dat = _timeit(factory.csv.create_pan_dat, 5)(csv_dir)
    factory.pgsql.write_schema(self.engine, schema_name, include_ancillary_info=False,
                               forced_field_types=_forced_field_types())
    _timeit(factory.pgsql.write_data, 90)(dat, self.engine, schema_name)
    _timeit(factory.pgsql.create_pan_dat, 5)(self.engine, schema_name)
def test_pdf(self):
    # Timing exercise: load the big bernardo_slowby csv fixture, then push it through
    # postgres and back, with each step under a wall-clock budget enforced by _timeit.
    schema_name = test_schemas[1]
    factory = PanDatFactory.create_from_full_schema(
        kehaar.input_schema.schema(include_ancillary_info=True))
    csv_dir = os.path.join(_codeDir(), "bernardo_slowby")
    dat = _timeit(factory.csv.create_pan_dat, 90)(csv_dir)
    factory.pgsql.write_schema(self.engine, schema_name, include_ancillary_info=False,
                               forced_field_types=_forced_field_types())
    # the write takes a bit longer because there might be infinities to manage into PG
    _timeit(factory.pgsql.write_data, 180)(dat, self.engine, schema_name)
    _timeit(factory.pgsql.create_pan_dat, 50)(self.engine, schema_name)
def write_file(self, tic_dat, json_file_path, allow_overwrite=False, verbose=False,
               to_pandas=False):
    """
    write the ticDat data to a json file (or json string)

    :param tic_dat: the data object to write (typically a TicDat)

    :param json_file_path: The file path of the json file to create. If empty string,
                           then return a JSON string.

    :param allow_overwrite: boolean - are we allowed to overwrite an existing file?

    :param verbose: boolean. Verbose mode writes the data rows as dicts keyed by field
                    name. Otherwise, they are lists.

    :param to_pandas: boolean. if truthy, then use the PanDatFactory method of writing
                      to json.

    :return:
    """
    _standard_verify(self.tic_dat_factory)
    verify(not (to_pandas and verbose),
           "verbose argument is inconsistent with to_pandas")
    would_clobber = json_file_path and os.path.exists(json_file_path)
    verify(not (would_clobber and not allow_overwrite),
           "%s exists and allow_overwrite is not enabled" % json_file_path)
    if to_pandas:
        # delegate to the pandas-based json writer over an equivalent PanDatFactory
        from ticdat import PanDatFactory
        pan_factory = PanDatFactory.create_from_full_schema(
            self.tic_dat_factory.schema(include_ancillary_info=True))
        pan_dat = self.tic_dat_factory.copy_to_pandas(tic_dat, drop_pk_columns=False)
        return pan_factory.json.write_file(pan_dat, json_file_path)
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " +
                          " : ".join(msg))
    jdict = make_json_dict(self.tic_dat_factory, tic_dat, verbose,
                           use_infinity_io_flag_if_provided=True)
    if not json_file_path:
        # empty path means the caller wants the JSON text itself
        return json.dumps(jdict, sort_keys=True, indent=2)
    with open(json_file_path, "w") as fp:
        json.dump(jdict, fp, sort_keys=True, indent=2)
def test_pgtd_active(self):
    """Exercise the active_fld read/write option: rows flagged inactive in postgres are
    filtered out on read, and writes reset every row's active flag to True."""
    if not self.can_run:
        return
    schema = test_schema + "_active"
    # build a schema identical to diet_schema except that categories carries an extra
    # active_fld data column, and create the pg tables with that column forced to boolean
    tdf = TicDatFactory(**{k: [pks, (["active_fld"] if k == "categories" else []) + dfs]
                           for k, (pks, dfs) in diet_schema.schema().items()})
    tdf.pgsql.write_schema(self.engine, schema, include_ancillary_info=False,
                           forced_field_types={('categories', 'active_fld'): 'boolean'})
    # from here on, work with the plain diet schema; active_fld is handled implicitly
    tdf = diet_schema.clone()
    dat = tdf.copy_tic_dat(diet_dat)
    dat.categories["junk"] = {}  # extra row we will later deactivate in the database
    tdf.pgsql.write_data(dat, self.engine, schema, active_fld="active_fld")
    # a write marks every row active
    self.assertTrue(set(_[0] for _ in self.engine.execute(
        f"Select active_fld from {schema}.categories")) == {True})
    # deactivate the junk row directly in postgres; reads should then skip it
    self.engine.execute(
        f"Update {schema}.categories set active_fld = False where name = 'junk'")
    dat_2 = tdf.pgsql.create_tic_dat(self.engine, schema, active_fld="active_fld")
    self.assertTrue(tdf._same_data(dat_2, diet_dat, epsilon=1e-10))
    # the PanDat reader honors active_fld the same way
    pdf = PanDatFactory.create_from_full_schema(
        diet_schema.schema(include_ancillary_info=True))
    pan_dat = tdf.copy_to_pandas(diet_dat, drop_pk_columns=False)
    pan_dat_2 = pdf.pgsql.create_pan_dat(self.engine, schema, active_fld="active_fld")
    self.assertTrue(pdf._same_data(pan_dat, pan_dat_2, epsilon=1e-10))
    # the deactivated row is still physically present in the table ...
    self.assertTrue(set(_[0] for _ in self.engine.execute(
        f"Select active_fld from {schema}.categories")) == {True, False})
    # ... until a PanDat write resets all rows to active
    pdf.pgsql.write_data(pan_dat, self.engine, schema, active_fld="active_fld")
    self.assertTrue(set(_[0] for _ in self.engine.execute(
        f"Select active_fld from {schema}.categories")) == {True})
def test_diet_no_inf_pd_flagging(self):
    # With no infinity io flag configured, float("inf") should land in postgres
    # verbatim and survive the PanDat round trip unchanged.
    pdf = PanDatFactory.create_from_full_schema(
        diet_schema.schema(include_ancillary_info=True))
    pan_dat = diet_schema.copy_to_pandas(diet_dat, drop_pk_columns=False)
    pgpf = pdf.pgsql
    pgpf.write_schema(self.engine, test_schema, include_ancillary_info=False)
    pgpf.write_data(pan_dat, self.engine, test_schema)
    expected = [('calories', 1800.0, 2200.0), ('fat', 0.0, 65.0),
                ('protein', 91.0, float("inf")), ('sodium', 0.0, 1779.0)]
    rows = self.engine.execute(f"Select * from {test_schema}.categories")
    self.assertTrue(sorted([_ for _ in rows]) == expected)
    pg_pan_dat = pgpf.create_pan_dat(self.engine, test_schema)
    self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat))
def create_tic_dat(self, json_file_path, freeze_it=False, from_pandas=False):
    """
    Create a TicDat object from a json file

    :param json_file_path: A json file path. It should encode a dictionary with table
                           names as keys. Could also be an actual JSON string

    :param freeze_it: boolean. should the returned object be frozen?

    :param from_pandas: boolean. If truthy, then use pandas json readers. See
                        PanDatFactory json readers for more details.

    :return: a TicDat object populated by the matching tables.

    caveats: Table names matches are case insensitive and also underscore-space
             insensitive. Tables that don't find a match are interpreted as an empty
             table. Dictionary keys that don't match any table are ignored.
    """
    _standard_verify(self.tic_dat_factory)
    if from_pandas:
        # read via an equivalent PanDatFactory, then convert back to TicDat
        from ticdat import PanDatFactory
        pan_factory = PanDatFactory.create_from_full_schema(
            self.tic_dat_factory.schema(include_ancillary_info=True))
        pan_dat = pan_factory.json.create_pan_dat(json_file_path)
        return pan_factory.copy_to_tic_dat(pan_dat)
    jdict = self._create_jdict(json_file_path)
    tic_dat_dict = self._create_tic_dat_dict(jdict)
    missing_tables = set(self.tic_dat_factory.all_tables).difference(tic_dat_dict)
    if missing_tables:
        # absent tables are tolerated (treated as empty), but worth flagging
        print("The following table names could not be found in the json file/string\n%s\n"
              % "\n".join(missing_tables))
    rtn = self.tic_dat_factory.TicDat(**tic_dat_dict)
    rtn = self.tic_dat_factory._parameter_table_post_read_adjustment(rtn)
    return self.tic_dat_factory.freeze_me(rtn) if freeze_it else rtn
def testDietWithInfFlaggingPd(self):
    # Infinity io flag round trip for PanDat: inf is stored as the sentinel 999999999,
    # read back as inf by any factory that knows the flag, and read literally otherwise.
    pdf = PanDatFactory.create_from_full_schema(
        diet_schema.schema(include_ancillary_info=True))
    dat = diet_schema.copy_to_pandas(diet_dat, drop_pk_columns=False)
    pdf.set_infinity_io_flag(999999999)
    schema = test_schema + "_diet_inf_flagging_pd"
    pdf.pgsql.write_schema(self.engine, schema)
    pdf.pgsql.write_data(dat, self.engine, schema)
    dat_1 = pdf.pgsql.create_pan_dat(self.engine, schema)
    self.assertTrue(pdf._same_data(dat, dat_1))
    # the flag survives cloning the factory
    pdf = pdf.clone()
    dat_1 = pdf.pgsql.create_pan_dat(self.engine, schema)
    self.assertTrue(pdf._same_data(dat, dat_1))
    # a factory without the flag sees the sentinel value itself
    flagless = PanDatFactory(**diet_schema.schema())
    dat_1 = flagless.pgsql.create_pan_dat(self.engine, schema)
    self.assertFalse(flagless._same_data(dat, dat_1))
    protein = dat_1.categories["Name"] == "protein"
    self.assertTrue(list(dat_1.categories[protein]["Max Nutrition"])[0] == 999999999)
    # manually restoring inf makes the data match again
    dat_1.categories.loc[protein, "Max Nutrition"] = float("inf")
    self.assertTrue(flagless._same_data(dat, dat_1))
def testDateTime(self):
    """Round-trip datetime-typed fields and parameters through postgres, for TicDat and
    PanDat alike, including nullable datetime fields and string-to-datetime coercion."""
    schema = test_schema + "_datetime"
    tdf = TicDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                        parameters=[["a"], ["b"]])
    # datetime-typed parameters: p1 defaults via string parsing, p2 is nullable
    tdf.add_parameter("p1", "Dec 15 1970", datetime=True)
    tdf.add_parameter("p2", None, datetime=True, nullable=True)
    tdf.set_data_type("table_with_stuffs", "field one", datetime=True)
    tdf.set_data_type("table_with_stuffs", "field two", datetime=True, nullable=True)
    # mix of parsed datetimes, now(), a None data value, and a string parameter value
    dat = tdf.TicDat(table_with_stuffs=[[dateutil.parser.parse("July 11 1972"), None],
                                        [datetime.datetime.now(),
                                         dateutil.parser.parse("Sept 11 2011")]],
                     parameters=[["p1", "7/11/1911"], ["p2", None]])
    self.assertFalse(tdf.find_data_type_failures(dat) or
                     tdf.find_data_row_failures(dat))
    tdf.pgsql.write_schema(self.engine, schema)
    tdf.pgsql.write_data(dat, self.engine, schema)
    dat_1 = tdf.pgsql.create_tic_dat(self.engine, schema)
    # not identical to dat (e.g. the "7/11/1911" string comes back as a datetime),
    # but same row counts and still clean against the type/row checks
    self.assertFalse(tdf._same_data(dat, dat_1, nans_are_same_for_data_rows=True))
    self.assertTrue(all(len(getattr(dat, t)) == len(getattr(dat_1, t))
                        for t in tdf.all_tables))
    self.assertFalse(tdf.find_data_type_failures(dat_1) or
                     tdf.find_data_row_failures(dat_1))
    self.assertTrue(isinstance(dat_1.parameters["p1"]["b"], datetime.datetime))
    # primary keys came back as non-null datetimes ...
    self.assertTrue(all(isinstance(_, datetime.datetime)
                        for _ in dat_1.table_with_stuffs))
    self.assertTrue(len([_ for _ in dat_1.table_with_stuffs if pd.isnull(_)]) == 0)
    # ... and data fields are datetimes or null, with exactly one null present
    self.assertTrue(all(isinstance(_, datetime.datetime) or pd.isnull(_)
                        for v in dat_1.table_with_stuffs.values()
                        for _ in v.values()))
    self.assertTrue(len([_ for v in dat_1.table_with_stuffs.values()
                         for _ in v.values() if pd.isnull(_)]) == 1)
    pdf = PanDatFactory.create_from_full_schema(
        tdf.schema(include_ancillary_info=True))
    pan_dat = pdf.pgsql.create_pan_dat(self.engine, schema)
    dat_2 = pdf.copy_to_tic_dat(pan_dat)
    # pandas can be a real PIA sometimes, hacking around some weird downcasting:
    # re-key the rows as pd.Timestamp so they compare equal to the pg-read keys
    for k in list(dat_2.table_with_stuffs):
        dat_2.table_with_stuffs[pd.Timestamp(k)] = dat_2.table_with_stuffs.pop(k)
    self.assertTrue(
        tdf._same_data(dat_1, dat_2, nans_are_same_for_data_rows=True))
    # writing the PanDat back and re-reading should preserve the same data again
    pdf.pgsql.write_data(pan_dat, self.engine, schema)
    dat_3 = pdf.copy_to_tic_dat(pdf.pgsql.create_pan_dat(self.engine, schema))
    for k in list(dat_3.table_with_stuffs):
        dat_3.table_with_stuffs[pd.Timestamp(k)] = dat_3.table_with_stuffs.pop(k)
    self.assertTrue(
        tdf._same_data(dat_1, dat_3, nans_are_same_for_data_rows=True))
def test_diet_pd(self):
    """PanDat/postgres round trip over the diet schema, covering infinity io flags
    (numeric sentinel, None, and "N/A"), append-mode writes, and duplicate detection."""
    if not self.can_run:
        return
    schema = "test_pg_diet"
    tdf = diet_schema
    pdf = PanDatFactory.create_from_full_schema(
        tdf.schema(include_ancillary_info=True))
    pdf.set_infinity_io_flag(1e12)  # inf stored in pg as this sentinel
    pgpf = pdf.pgsql
    pan_dat = pan_dat_maker(tdf.schema(), diet_dat)
    pgpf.write_schema(self.engine, schema, include_ancillary_info=False)
    pgpf.write_data(pan_dat, self.engine, schema)
    pg_pan_dat = pgpf.create_pan_dat(self.engine, schema)
    self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat))
    # with the flag cleared, the sentinel reads back literally instead of as inf
    pdf.set_infinity_io_flag(None)
    pg_pan_dat_none_inf = pdf.pgsql.create_pan_dat(self.engine, schema)
    self.assertFalse(pdf._same_data(pan_dat, pg_pan_dat_none_inf))
    pg_pan_dat_none_inf.categories.loc[pg_pan_dat_none_inf.categories["Name"] == "protein", "Max Nutrition"] = \
        float("inf")
    self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat_none_inf))
    pdf.set_infinity_io_flag("N/A")
    # append a renamed food row; the pre-existing "pizza" row stays in the table,
    # so the foods table ends up with duplicate-keyed rows
    dat2 = diet_schema.copy_tic_dat(diet_dat)
    dat2.foods["za"] = dat2.foods.pop("pizza")
    dat2 = pan_dat_maker(tdf.schema(), dat2)
    pgpf.write_data(dat2, self.engine, schema,
                    pre_existing_rows={"foods": "append"})
    dat3 = pgpf.create_pan_dat(self.engine, schema)
    self.assertTrue(set(pdf.find_duplicates(dat3)) == {'foods'})
    self.assertTrue(set(dat3.foods["Name"]).issuperset(dat2.foods["Name"]))
    self.assertTrue(set(dat3.foods["Name"]).issuperset(pan_dat.foods["Name"]))
    self.assertTrue(set(dat3.foods["Name"]).difference(pan_dat.foods["Name"]) == {'za'})
    self.assertTrue(set(dat3.foods["Name"]).difference(dat2.foods["Name"]) == {'pizza'})
    # same append exercise on nutrition_quantities; trimming the appended rows
    # restores a duplicate-free table that matches dat2
    pgpf.write_data(dat2, self.engine, schema,
                    pre_existing_rows={"nutrition_quantities": "append"})
    dat4 = pgpf.create_pan_dat(self.engine, schema)
    self.assertTrue(set(pdf.find_duplicates(dat4)) == {'nutrition_quantities'}
                    and not pdf.find_duplicates(dat2))
    dat4.nutrition_quantities = dat4.nutrition_quantities[:36]
    self.assertFalse(pdf.find_duplicates(dat4))
    self.assertTrue(pdf._same_data(dat2, dat4))
    # second schema written with no infinity flag: inf goes to pg as-is
    test_schema_2 = schema + "_none_inf"
    pdf.set_infinity_io_flag(None)
    pgpf.write_schema(self.engine, test_schema_2)
    pgpf.write_data(pan_dat, self.engine, test_schema_2)
    # reading with the "N/A" flag no longer recovers inf ...
    pdf.set_infinity_io_flag("N/A")
    pg_pan_dat = pgpf.create_pan_dat(self.engine, test_schema_2)
    self.assertFalse(pdf._same_data(pan_dat, pg_pan_dat))
    pg_pan_dat.categories.loc[pg_pan_dat.categories["Name"] == "protein", "Max Nutrition"] = float("inf")
    self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat))
    # ... but reading with no flag does
    pdf.set_infinity_io_flag(None)
    pg_pan_dat_none_inf = pgpf.create_pan_dat(self.engine, test_schema_2)
    self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat_none_inf))
    # a factory built without data types reads the inf cell back as NaN instead
    pdf_ = PanDatFactory(**diet_schema.schema())  # doesnt have data types
    pdf_.set_infinity_io_flag(None)
    pgpf_null_inf = pdf_.pgsql
    pg_pan_dat_none_inf = pgpf_null_inf.create_pan_dat(self.engine, test_schema_2)
    self.assertFalse(pdf._same_data(pan_dat, pg_pan_dat_none_inf))
    self.assertTrue(math.isnan(pg_pan_dat_none_inf.categories[
        pg_pan_dat_none_inf.categories["Name"] == "protein"]["Max Nutrition"][0]))