Exemplo n.º 1
0
    def find_duplicates(self, json_file_path, from_pandas=False):
        """
        Report how often each duplicated primary key occurs in the json data.

        :param json_file_path: A json file path. It should encode a dictionary
                               with table names as keys.

        :param from_pandas: boolean.  If truthy, the data is read with the
                            PanDatFactory json reader and each table is turned
                            into a list of row tuples before duplicates are
                            searched. Otherwise the json dictionary is read directly.

        :return: A dictionary whose keys are table names for the primary-ed key tables.
                 Each value is itself a dictionary, keyed by the primary key values
                 encountered in the table and holding the count of records in the
                 json entry with that primary key. Row counts smaller than 2 are
                 pruned off, as they aren't duplicates. A falsy result from the
                 duplicate search is normalized to an empty dictionary.
        """
        factory = self.tic_dat_factory
        _standard_verify(factory)
        if not from_pandas:
            rows_by_table = self._create_jdict(json_file_path)
        else:
            from ticdat import PanDatFactory
            pan_factory = PanDatFactory.create_from_full_schema(
                factory.schema(include_ancillary_info=True))
            pan_dat = pan_factory.json.create_pan_dat(json_file_path)
            rows_by_table = {}
            for table in pan_factory.all_tables:
                frame = getattr(pan_dat, table)
                # index=False: only the column values take part in each row tuple
                rows_by_table[table] = [
                    tuple(row) for row in frame.itertuples(index=False)
                ]
        duplicates = find_duplicates_from_dict_ticdat(factory, rows_by_table)
        if duplicates:
            return duplicates
        return {}
Exemplo n.º 2
0
def _add_inflow_table(full_schema_dict):
    # Factory hook described by the clone docstring: it is handed a full_schema_dict and must
    # return the PanDatFactory to build. Here the only customization is an extra "inflow" table.
    # Note that the entry is added to the dictionary that was passed in.
    inflow_pk_fields = ["Commodity", "Node"]
    inflow_data_fields = ["Quantity"]
    full_schema_dict["tables_fields"]["inflow"] = [
        inflow_pk_fields,
        inflow_data_fields,
    ]
    return PanDatFactory.create_from_full_schema(full_schema_dict)
Exemplo n.º 3
0
 def test_issue_68_pd(self):
     # Round-trips numpy scalar values (numpy.int64 / numpy.float64) through PostgreSQL
     # with a PanDatFactory and checks the data read back matches what was written.
     # kind of a dumb test since the numpy types tend to be the ones pandas creates naturally, but no harm
     # in being rigorous
     if not self.can_run:
         return
     tdf = diet_schema.clone()
     pdf = PanDatFactory.create_from_full_schema(
         tdf.schema(include_ancillary_info=True))
     pgtf = pdf.pgsql
     pgtf.write_schema(self.engine,
                       test_schema,
                       include_ancillary_info=False)
     dat = tdf.copy_tic_dat(diet_dat)
     import numpy
     # Plant explicit numpy scalars in the TicDat copy before converting it to pandas.
     dat.categories["protein"]["Max Nutrition"] = numpy.int64(200)
     dat.categories["fat"]["Max Nutrition"] = numpy.float64(65)
     pan_dat = pdf.copy_pan_dat(
         tdf.copy_to_pandas(dat, drop_pk_columns=False))
     pgtf.write_data(pan_dat, self.engine, test_schema)
     pg_pan_dat = pgtf.create_pan_dat(self.engine, test_schema)
     self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat))
     from ticdat.pandatfactory import _faster_df_apply
     # Second pass: force whole columns to numpy scalar types and repeat the round trip.
     pan_dat.categories["Max Nutrition"] = _faster_df_apply(
         pan_dat.categories, lambda row: numpy.int64(row["Max Nutrition"]))
     pan_dat.foods["Cost"] = _faster_df_apply(
         pan_dat.foods, lambda row: numpy.float64(row["Cost"]))
     # framework_utils is a separate package from ticdat and none of the assertions below use
     # the result of memo(); the import is guarded so the test no longer errors out with
     # ImportError on machines where that package is missing.
     # NOTE(review): memo looks like a leftover debugging aid - confirm and delete outright.
     try:
         from framework_utils.helper_utils import memo
     except ImportError:
         pass
     else:
         memo(pan_dat)
     pgtf.write_data(pan_dat, self.engine, test_schema)
     pg_pan_dat = pgtf.create_pan_dat(self.engine, test_schema)
     self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat))
Exemplo n.º 4
0
    def test_time_stamp(self):
        # Writes datetime values to a PostgreSQL "timestamp" column and reads them back through
        # both the TicDatFactory and the PanDatFactory pgsql interfaces, checking that the values
        # come back as datetime objects and that the keys do not.
        tdf = TicDatFactory(table=[["Blah"], ["Timed Info"]])
        tdf.set_data_type("table", "Timed Info", nullable=True)
        tdf.set_default_value("table", "Timed Info", None)
        dat = tdf.TicDat()
        # Two rows, keyed 1 and 2, each holding a parsed datetime.
        dat.table[1] = dateutil.parser.parse("2014-05-01 18:47:05.069722")
        dat.table[2] = dateutil.parser.parse("2014-05-02 18:47:05.178768")
        pgtf = tdf.pgsql
        # Column types are forced explicitly: integer for "Blah", timestamp for "Timed Info".
        pgtf.write_schema(self.engine,
                          test_schema,
                          forced_field_types={
                              ('table', 'Blah'): "integer",
                              ('table', 'Timed Info'): "timestamp"
                          })
        pgtf.write_data(dat,
                        self.engine,
                        test_schema,
                        dsn=self.postgresql.dsn())
        dat_2 = pgtf.create_tic_dat(self.engine, test_schema)
        self.assertTrue(tdf._same_data(dat, dat_2))
        # Every "Timed Info" value read back must be a datetime ...
        self.assertTrue(
            all(
                isinstance(row["Timed Info"], datetime.datetime)
                for row in dat_2.table.values()))
        # ... while none of the table keys may be.
        self.assertFalse(
            any(isinstance(k, datetime.datetime) for k in dat_2.table))

        pdf = PanDatFactory.create_from_full_schema(
            tdf.schema(include_ancillary_info=True))

        def same_data(pan_dat, pan_dat_2):
            # Local comparison: "Blah" columns must match exactly (values and order),
            # "Timed Info" entries must agree pairwise to within 1e-6 seconds.
            df1, df2 = pan_dat.table, pan_dat_2.table
            if list(df1["Blah"]) != list(df2["Blah"]):
                return False
            for dt1, dt2 in zip(df1["Timed Info"], df2["Timed Info"]):
                delta = dt1 - dt2
                if abs(delta.total_seconds()) > 1e-6:
                    return False
            return True

        # Compare the PanDat read from the database with one built from the TicDat read above.
        pan_dat = pdf.pgsql.create_pan_dat(self.engine, test_schema)
        pan_dat_2 = pan_dat_maker(tdf.schema(), dat_2)
        self.assertTrue(same_data(pan_dat, pan_dat_2))
        for df in [_.table for _ in [pan_dat, pan_dat_2]]:
            # NOTE(review): df.loc[i, ...] assumes the frames carry index labels 0..len(df)-1 - confirm.
            for i in range(len(df)):
                self.assertFalse(
                    isinstance(df.loc[i, "Blah"], datetime.datetime))
                self.assertTrue(
                    isinstance(df.loc[i, "Timed Info"], datetime.datetime))

        # Change one timestamp, confirm the comparison notices, then write and re-read
        # the modified PanDat.
        pan_dat.table.loc[1, "Timed Info"] = dateutil.parser.parse(
            "2014-05-02 18:48:05.178768")
        self.assertFalse(same_data(pan_dat, pan_dat_2))
        pdf.pgsql.write_data(pan_dat, self.engine, test_schema)
        pan_dat_2 = pdf.pgsql.create_pan_dat(self.engine, test_schema)
        self.assertTrue(same_data(pan_dat, pan_dat_2))

        # The same change on the TicDat side must make it differ from the earlier read (dat_2).
        dat.table[2] = dateutil.parser.parse("2014-05-02 18:48:05.178768")
        self.assertFalse(tdf._same_data(dat, dat_2))
Exemplo n.º 5
0
 def test_pdf_2(self):
     # Reads the "bernardo_slowby" csv directory, writes it to PostgreSQL and reads it back,
     # with each step wrapped by _timeit (the numeric argument is presumably a time budget).
     full_schema = kehaar.input_schema.schema(include_ancillary_info=True)
     pan_factory = PanDatFactory.create_from_full_schema(full_schema)
     # the "N/A" infinity flag means less munging, which speeds things up
     pan_factory.set_infinity_io_flag("N/A")
     timed_csv_read = _timeit(pan_factory.csv.create_pan_dat, 5)
     pan_dat = timed_csv_read(os.path.join(_codeDir(), "bernardo_slowby"))
     pg_schema = test_schemas[2]
     pan_factory.pgsql.write_schema(
         self.engine,
         pg_schema,
         include_ancillary_info=False,
         forced_field_types=_forced_field_types(),
     )
     timed_pg_write = _timeit(pan_factory.pgsql.write_data, 90)
     timed_pg_write(pan_dat, self.engine, pg_schema)
     timed_pg_read = _timeit(pan_factory.pgsql.create_pan_dat, 5)
     timed_pg_read(self.engine, pg_schema)
Exemplo n.º 6
0
 def test_pdf(self):
     # Reads the "bernardo_slowby" csv directory, writes it to PostgreSQL and reads it back,
     # with each step wrapped by _timeit (the numeric argument is presumably a time budget).
     full_schema = kehaar.input_schema.schema(include_ancillary_info=True)
     pan_factory = PanDatFactory.create_from_full_schema(full_schema)
     timed_csv_read = _timeit(pan_factory.csv.create_pan_dat, 90)
     pan_dat = timed_csv_read(os.path.join(_codeDir(), "bernardo_slowby"))
     pg_schema = test_schemas[1]
     pan_factory.pgsql.write_schema(
         self.engine,
         pg_schema,
         include_ancillary_info=False,
         forced_field_types=_forced_field_types(),
     )
     # writing gets a larger allowance because there might be infinities to manage into PG
     timed_pg_write = _timeit(pan_factory.pgsql.write_data, 180)
     timed_pg_write(pan_dat, self.engine, pg_schema)
     timed_pg_read = _timeit(pan_factory.pgsql.create_pan_dat, 50)
     timed_pg_read(self.engine, pg_schema)
Exemplo n.º 7
0
    def write_file(self,
                   tic_dat,
                   json_file_path,
                   allow_overwrite=False,
                   verbose=False,
                   to_pandas=False):
        """
        Serialize the ticDat data as json, either into a file or into a string.

        :param tic_dat: the data object to write (typically a TicDat)

        :param json_file_path: The file path of the json file to create. If empty string,
                               then return a JSON string.

        :param allow_overwrite: boolean - are we allowed to overwrite an
                                existing file?

        :param verbose: boolean. Verbose mode writes the data rows as dicts
                        keyed by field name. Otherwise, they are lists.
                        Cannot be combined with to_pandas.

        :param to_pandas: boolean. if truthy, then use the PanDatFactory method of writing to json.

        :return: With to_pandas, whatever the PanDatFactory json writer returns.
                 Otherwise the JSON string when json_file_path is empty, and
                 None once the file has been written.
        """
        factory = self.tic_dat_factory
        _standard_verify(factory)
        verify(not (to_pandas and verbose),
               "verbose argument is inconsistent with to_pandas")
        # Only an existing target file without permission to overwrite is a problem.
        would_clobber = (bool(json_file_path)
                         and os.path.exists(json_file_path)
                         and not allow_overwrite)
        verify(not would_clobber,
               "%s exists and allow_overwrite is not enabled" % json_file_path)

        if to_pandas:
            from ticdat import PanDatFactory
            pan_factory = PanDatFactory.create_from_full_schema(
                factory.schema(include_ancillary_info=True))
            pan_dat = factory.copy_to_pandas(tic_dat, drop_pk_columns=False)
            return pan_factory.json.write_file(pan_dat, json_file_path)

        problems = []

        def _collect(message):
            # callback handed to good_tic_dat_object to gather its complaints
            problems.append(message)

        if not factory.good_tic_dat_object(tic_dat, _collect):
            raise TicDatError("Not a valid TicDat object for this schema : " +
                              " : ".join(problems))

        json_payload = make_json_dict(factory,
                                      tic_dat,
                                      verbose,
                                      use_infinity_io_flag_if_provided=True)
        if json_file_path:
            with open(json_file_path, "w") as out_file:
                json.dump(json_payload, out_file, sort_keys=True, indent=2)
            return None
        return json.dumps(json_payload, sort_keys=True, indent=2)
Exemplo n.º 8
0
    def test_pgtd_active(self):
        # Exercises the active_fld argument of the pgsql readers and writers: the schema gets
        # an extra boolean "active_fld" column on categories, and the row flagged False in the
        # database is expected to be left out when reading with active_fld.
        if not self.can_run:
            return
        schema = test_schema + "_active"

        def active_flags():
            # distinct active_fld values currently stored in the categories table
            query = f"Select active_fld from {schema}.categories"
            return {row[0] for row in self.engine.execute(query)}

        # The factory used to write the schema carries the extra field; the data factory does not.
        tables = {}
        for table, (pk_fields, data_fields) in diet_schema.schema().items():
            extra_fields = ["active_fld"] if table == "categories" else []
            tables[table] = [pk_fields, extra_fields + data_fields]
        schema_tdf = TicDatFactory(**tables)
        forced_types = {('categories', 'active_fld'): 'boolean'}
        schema_tdf.pgsql.write_schema(self.engine,
                                      schema,
                                      include_ancillary_info=False,
                                      forced_field_types=forced_types)

        tdf = diet_schema.clone()
        dat = tdf.copy_tic_dat(diet_dat)
        dat.categories["junk"] = {}
        tdf.pgsql.write_data(dat, self.engine, schema, active_fld="active_fld")
        self.assertTrue(active_flags() == {True})

        # Deactivate the junk row directly in the database; the read should now match diet_dat.
        self.engine.execute(
            f"Update {schema}.categories set active_fld = False where name = 'junk'"
        )
        tic_dat_read = tdf.pgsql.create_tic_dat(self.engine,
                                                schema,
                                                active_fld="active_fld")
        self.assertTrue(tdf._same_data(tic_dat_read, diet_dat, epsilon=1e-10))

        # Same check through the pandas flavored factory.
        pdf = PanDatFactory.create_from_full_schema(
            diet_schema.schema(include_ancillary_info=True))
        pan_dat = tdf.copy_to_pandas(diet_dat, drop_pk_columns=False)
        pan_dat_read = pdf.pgsql.create_pan_dat(self.engine,
                                                schema,
                                                active_fld="active_fld")
        self.assertTrue(pdf._same_data(pan_dat, pan_dat_read, epsilon=1e-10))
        self.assertTrue(active_flags() == {True, False})

        # Writing the pan_dat with active_fld leaves only True flags behind.
        pdf.pgsql.write_data(pan_dat,
                             self.engine,
                             schema,
                             active_fld="active_fld")
        self.assertTrue(active_flags() == {True})
Exemplo n.º 9
0
 def test_diet_no_inf_pd_flagging(self):
     # Writes the diet data through PanDatFactory without setting any infinity flag and checks
     # the raw categories rows in the database, including the infinite protein upper bound.
     full_schema = diet_schema.schema(include_ancillary_info=True)
     pdf = PanDatFactory.create_from_full_schema(full_schema)
     pan_dat = diet_schema.copy_to_pandas(diet_dat, drop_pk_columns=False)
     pg_io = pdf.pgsql
     pg_io.write_schema(self.engine, test_schema, include_ancillary_info=False)
     pg_io.write_data(pan_dat, self.engine, test_schema)

     stored_rows = sorted(
         self.engine.execute(f"Select * from {test_schema}.categories"))
     expected_rows = [
         ('calories', 1800.0, 2200.0),
         ('fat', 0.0, 65.0),
         ('protein', 91.0, float("inf")),
         ('sodium', 0.0, 1779.0),
     ]
     self.assertTrue(stored_rows == expected_rows)

     # Reading back through the factory must reproduce the original PanDat.
     round_tripped = pg_io.create_pan_dat(self.engine, test_schema)
     self.assertTrue(pdf._same_data(pan_dat, round_tripped))
Exemplo n.º 10
0
    def create_tic_dat(self,
                       json_file_path,
                       freeze_it=False,
                       from_pandas=False):
        """
        Create a TicDat object from a json file

        :param json_file_path: A json file path. It should encode a dictionary
                               with table names as keys. Could also be an actual JSON string

        :param freeze_it: boolean. should the returned object be frozen?
                          Honored for both the regular and the from_pandas reader.

        :param from_pandas: boolean.  If truthy, then use pandas json readers. See
                            PanDatFactory json readers for more details.

        :return: a TicDat object populated by the matching tables.

        caveats: Table names matches are case insensitive and also
                 underscore-space insensitive.
                 Tables that don't find a match are interpreted as an empty table.
                 Dictionary keys that don't match any table are ignored.
                 With the regular reader, the names of tables missing from the
                 json data are printed to standard output.
        """
        _standard_verify(self.tic_dat_factory)
        if from_pandas:
            from ticdat import PanDatFactory
            pdf = PanDatFactory.create_from_full_schema(
                self.tic_dat_factory.schema(include_ancillary_info=True))
            _rtn = pdf.json.create_pan_dat(json_file_path)
            rtn = pdf.copy_to_tic_dat(_rtn)
            # Previously freeze_it was dropped on this path, so a caller asking for a
            # frozen object silently got an editable one.
            if freeze_it:
                return self.tic_dat_factory.freeze_me(rtn)
            return rtn
        jdict = self._create_jdict(json_file_path)
        tic_dat_dict = self._create_tic_dat_dict(jdict)
        # Tables of the schema with no entry in the json data are reported, not rejected.
        missing_tables = set(
            self.tic_dat_factory.all_tables).difference(tic_dat_dict)
        if missing_tables:
            print(
                "The following table names could not be found in the json file/string\n%s\n"
                % "\n".join(missing_tables))
        rtn = self.tic_dat_factory.TicDat(**tic_dat_dict)
        rtn = self.tic_dat_factory._parameter_table_post_read_adjustment(rtn)
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
Exemplo n.º 11
0
 def testDietWithInfFlaggingPd(self):
     # Writes the diet data with infinity_io_flag 999999999 and checks how three factories read
     # it back: the writing factory, its clone, and a plain factory built from the bare schema.
     flagged_pdf = PanDatFactory.create_from_full_schema(
         diet_schema.schema(include_ancillary_info=True))
     original = diet_schema.copy_to_pandas(diet_dat, drop_pk_columns=False)
     flagged_pdf.set_infinity_io_flag(999999999)
     schema = test_schema + "_diet_inf_flagging_pd"
     flagged_pdf.pgsql.write_schema(self.engine, schema)
     flagged_pdf.pgsql.write_data(original, self.engine, schema)

     # The writing factory and its clone both reproduce the original data.
     read_by_writer = flagged_pdf.pgsql.create_pan_dat(self.engine, schema)
     self.assertTrue(flagged_pdf._same_data(original, read_by_writer))
     cloned_pdf = flagged_pdf.clone()
     read_by_clone = cloned_pdf.pgsql.create_pan_dat(self.engine, schema)
     self.assertTrue(cloned_pdf._same_data(original, read_by_clone))

     # A factory built from the bare schema sees the raw flag value in place of infinity.
     plain_pdf = PanDatFactory(**diet_schema.schema())
     read_plain = plain_pdf.pgsql.create_pan_dat(self.engine, schema)
     self.assertFalse(plain_pdf._same_data(original, read_plain))
     is_protein = read_plain.categories["Name"] == "protein"
     protein_max = list(read_plain.categories[is_protein]["Max Nutrition"])[0]
     self.assertTrue(protein_max == 999999999)

     # Putting infinity back by hand makes the data match again.
     read_plain.categories.loc[is_protein, "Max Nutrition"] = float("inf")
     self.assertTrue(plain_pdf._same_data(original, read_plain))
Exemplo n.º 12
0
    def testDateTime(self):
        # Round-trips datetime typed fields and datetime parameters through PostgreSQL, first
        # with TicDatFactory and then with PanDatFactory, and compares the results.
        schema = test_schema + "_datetime"
        tdf = TicDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                            parameters=[["a"], ["b"]])
        # Two datetime parameters (p2 may be null) and two datetime fields ("field two" may be null).
        tdf.add_parameter("p1", "Dec 15 1970", datetime=True)
        tdf.add_parameter("p2", None, datetime=True, nullable=True)
        tdf.set_data_type("table_with_stuffs", "field one", datetime=True)
        tdf.set_data_type("table_with_stuffs",
                          "field two",
                          datetime=True,
                          nullable=True)

        # Two table rows (one with a None in "field two"); p1 is supplied as a date string.
        dat = tdf.TicDat(table_with_stuffs=[[
            dateutil.parser.parse("July 11 1972"), None
        ], [datetime.datetime.now(),
            dateutil.parser.parse("Sept 11 2011")]],
                         parameters=[["p1", "7/11/1911"], ["p2", None]])
        self.assertFalse(
            tdf.find_data_type_failures(dat)
            or tdf.find_data_row_failures(dat))

        tdf.pgsql.write_schema(self.engine, schema)
        tdf.pgsql.write_data(dat, self.engine, schema)
        dat_1 = tdf.pgsql.create_tic_dat(self.engine, schema)
        # NOTE(review): the data read back is expected NOT to equal the original - presumably
        # because values such as the "7/11/1911" string come back as datetime objects
        # (see the isinstance check on p1 below); confirm.
        self.assertFalse(
            tdf._same_data(dat, dat_1, nans_are_same_for_data_rows=True))
        # Row counts still agree table by table, and the data read back is valid.
        self.assertTrue(
            all(
                len(getattr(dat, t)) == len(getattr(dat_1, t))
                for t in tdf.all_tables))
        self.assertFalse(
            tdf.find_data_type_failures(dat_1)
            or tdf.find_data_row_failures(dat_1))
        self.assertTrue(
            isinstance(dat_1.parameters["p1"]["b"], datetime.datetime))
        # Every key of table_with_stuffs is a non-null datetime.
        self.assertTrue(
            all(
                isinstance(_, datetime.datetime)
                for _ in dat_1.table_with_stuffs))
        self.assertTrue(
            len([_ for _ in dat_1.table_with_stuffs if pd.isnull(_)]) == 0)
        # Every data value is either a datetime or null, with exactly one null overall.
        self.assertTrue(
            all(
                isinstance(_, datetime.datetime) or pd.isnull(_)
                for v in dat_1.table_with_stuffs.values() for _ in v.values()))
        self.assertTrue(
            len([
                _ for v in dat_1.table_with_stuffs.values()
                for _ in v.values() if pd.isnull(_)
            ]) == 1)
        # Read the same database content through PanDatFactory and convert it to a TicDat.
        pdf = PanDatFactory.create_from_full_schema(
            tdf.schema(include_ancillary_info=True))
        pan_dat = pdf.pgsql.create_pan_dat(self.engine, schema)
        dat_2 = pdf.copy_to_tic_dat(pan_dat)
        # pandas can be a real PIA sometimes, hacking around some weird downcasting
        # (each key is re-inserted as a pd.Timestamp; list(...) snapshots the keys so the
        # table can be modified while looping)
        for k in list(dat_2.table_with_stuffs):
            dat_2.table_with_stuffs[pd.Timestamp(
                k)] = dat_2.table_with_stuffs.pop(k)
        self.assertTrue(
            tdf._same_data(dat_1, dat_2, nans_are_same_for_data_rows=True))

        # Write the PanDat back, read it again and repeat the same key normalization and check.
        pdf.pgsql.write_data(pan_dat, self.engine, schema)
        dat_3 = pdf.copy_to_tic_dat(
            pdf.pgsql.create_pan_dat(self.engine, schema))
        for k in list(dat_3.table_with_stuffs):
            dat_3.table_with_stuffs[pd.Timestamp(
                k)] = dat_3.table_with_stuffs.pop(k)
        self.assertTrue(
            tdf._same_data(dat_1, dat_3, nans_are_same_for_data_rows=True))
Exemplo n.º 13
0
    def test_diet_pd(self):
        # Exercises PanDatFactory pgsql I/O on the diet data: round trips under different
        # infinity_io_flag settings, appending to pre-existing rows, and duplicate detection.
        if not self.can_run:
            return
        schema = "test_pg_diet"
        tdf = diet_schema
        pdf = PanDatFactory.create_from_full_schema(
            tdf.schema(include_ancillary_info=True))
        # Stage 1: write and read with the infinity flag set to 1e12.
        pdf.set_infinity_io_flag(1e12)
        pgpf = pdf.pgsql
        pan_dat = pan_dat_maker(tdf.schema(), diet_dat)
        pgpf.write_schema(self.engine, schema, include_ancillary_info=False)
        pgpf.write_data(pan_dat, self.engine, schema)
        pg_pan_dat = pgpf.create_pan_dat(self.engine, schema)
        self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat))
        # Reading the same rows with the flag switched to None no longer matches until the
        # protein "Max Nutrition" entry is set back to infinity by hand.
        pdf.set_infinity_io_flag(None)
        pg_pan_dat_none_inf = pdf.pgsql.create_pan_dat(self.engine, schema)
        self.assertFalse(pdf._same_data(pan_dat, pg_pan_dat_none_inf))
        pg_pan_dat_none_inf.categories.loc[pg_pan_dat_none_inf.categories["Name"] == "protein", "Max Nutrition"] = \
            float("inf")
        self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat_none_inf))

        # Stage 2: with the flag at "N/A", rename the food "pizza" to "za" and append the foods
        # table to the rows already in the database.
        pdf.set_infinity_io_flag("N/A")
        dat2 = diet_schema.copy_tic_dat(diet_dat)
        dat2.foods["za"] = dat2.foods.pop("pizza")
        dat2 = pan_dat_maker(tdf.schema(), dat2)
        pgpf.write_data(dat2,
                        self.engine,
                        schema,
                        pre_existing_rows={"foods": "append"})
        dat3 = pgpf.create_pan_dat(self.engine, schema)
        # Only foods now holds duplicates; its names are the union of the old and new names.
        self.assertTrue(set(pdf.find_duplicates(dat3)) == {'foods'})
        self.assertTrue(set(dat3.foods["Name"]).issuperset(dat2.foods["Name"]))
        self.assertTrue(
            set(dat3.foods["Name"]).issuperset(pan_dat.foods["Name"]))
        self.assertTrue(
            set(dat3.foods["Name"]).difference(pan_dat.foods["Name"]) ==
            {'za'})
        self.assertTrue(
            set(dat3.foods["Name"]).difference(dat2.foods["Name"]) ==
            {'pizza'})
        # Second write appends only nutrition_quantities; afterwards that is the one table
        # reporting duplicates.
        pgpf.write_data(dat2,
                        self.engine,
                        schema,
                        pre_existing_rows={"nutrition_quantities": "append"})
        dat4 = pgpf.create_pan_dat(self.engine, schema)
        self.assertTrue(
            set(pdf.find_duplicates(dat4)) == {'nutrition_quantities'}
            and not pdf.find_duplicates(dat2))
        # Keep only the first 36 rows - presumably the size of the table before the append
        # (TODO confirm) - which removes the duplicates again.
        dat4.nutrition_quantities = dat4.nutrition_quantities[:36]
        self.assertFalse(pdf.find_duplicates(dat4))
        self.assertTrue(pdf._same_data(dat2, dat4))

        # Stage 3: a second schema written with the flag at None, then read back under
        # "N/A" and under None.
        test_schema_2 = schema + "_none_inf"
        pdf.set_infinity_io_flag(None)
        pgpf.write_schema(self.engine, test_schema_2)
        pgpf.write_data(pan_dat, self.engine, test_schema_2)
        pdf.set_infinity_io_flag("N/A")
        pg_pan_dat = pgpf.create_pan_dat(self.engine, test_schema_2)
        self.assertFalse(pdf._same_data(pan_dat, pg_pan_dat))
        pg_pan_dat.categories.loc[pg_pan_dat.categories["Name"] == "protein",
                                  "Max Nutrition"] = float("inf")
        self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat))
        pdf.set_infinity_io_flag(None)
        pg_pan_dat_none_inf = pgpf.create_pan_dat(self.engine, test_schema_2)
        self.assertTrue(pdf._same_data(pan_dat, pg_pan_dat_none_inf))

        # Stage 4: a factory built from the bare schema reads the protein bound as NaN.
        pdf_ = PanDatFactory(**diet_schema.schema())  # doesnt have data types
        pdf_.set_infinity_io_flag(None)
        pgpf_null_inf = pdf_.pgsql
        pg_pan_dat_none_inf = pgpf_null_inf.create_pan_dat(
            self.engine, test_schema_2)
        self.assertFalse(pdf._same_data(pan_dat, pg_pan_dat_none_inf))
        # NOTE(review): the trailing [0] is a lookup on the filtered Series, so this relies on
        # the protein row being reachable through 0 there - confirm.
        self.assertTrue(
            math.isnan(pg_pan_dat_none_inf.categories[
                pg_pan_dat_none_inf.categories["Name"] == "protein"]
                       ["Max Nutrition"][0]))