Example #1
    def create_from_full_schema(full_schema):
        """

        create a PanDatFactory complete with default values, data types, and foreign keys

        :param full_schema: a dictionary consistent with the data returned by a call to schema()
                            with include_ancillary_info = True

        :return: a PanDatFactory reflecting the tables, fields, default values, data types,
                 and foreign keys consistent with the full_schema argument
        """
        verify(dictish(full_schema) and set(full_schema) == {"tables_fields", "foreign_keys",
                                                             "default_values", "data_types"},
               "full_schema should be the result of calling schema(True) for some PanDatFactory")
        fks = full_schema["foreign_keys"]
        verify( (not fks) or (lupish(fks) and all(lupish(_) and len(_) >= 3 for _ in fks)),
                "foreign_keys entry poorly formed")
        dts = full_schema["data_types"]
        verify( (not dts) or (dictish(dts) and all(map(dictish, dts.values())) and
                              all(all(map(lupish, _.values())) for _ in dts.values())),
                "data_types entry poorly formatted")
        dvs = full_schema["default_values"]
        verify( (not dvs) or (dictish(dvs) and all(map(dictish, dvs.values()))),
                "default_values entry poorly formatted")

        rtn = PanDatFactory(**full_schema["tables_fields"])
        for fk in (fks or []):
            rtn.add_foreign_key(*fk[:3])
        for t,fds in (dts or {}).items():
            for f,dt in fds.items():
                rtn.set_data_type(t, f, *dt)
        for t,fdvs in (dvs or {}).items():
            for f, dv in fdvs.items():
                rtn.set_default_value(t,f,dv)
        return rtn
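For orientation, a minimal usage sketch follows: it clones a PanDatFactory by round-tripping its full schema through create_from_full_schema. The two-table diet-style schema is an illustrative assumption, not taken from the excerpt.

    from ticdat import PanDatFactory

    # Illustrative schema (an assumption): primary key fields first, then data fields.
    pdf = PanDatFactory(foods=[["Name"], ["Cost"]],
                        categories=[["Name"], ["Min Nutrition", "Max Nutrition"]])
    pdf.set_data_type("foods", "Cost", min=0, max=float("inf"), inclusive_min=True)
    full_schema = pdf.schema(include_ancillary_info=True)
    # Round trip: the clone carries the same tables, fields, and data types.
    pdf_clone = PanDatFactory.create_from_full_schema(full_schema)
    assert set(pdf_clone.all_tables) == {"foods", "categories"}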
Example #2
    def write_data(self,
                   tic_dat,
                   engine,
                   schema,
                   dsn=None,
                   pre_existing_rows=None):
        """
        write the ticDat data to a PostGres database

        :param tic_dat: the data object to write

        :param engine: a sqlalchemy database engine connected with a postgres driver

        :param schema: the postgres schema to write to (call self.write_schema explicitly as needed)

        :param dsn: optional - if truthy, a dict that can be unpacked as arguments to
                    psycopg2.connect. Will speed up bulk writing compared to engine.execute
                    If truthy and not a dict, then will be passed directly to psycopg2.connect as the sole argument.

        :param pre_existing_rows: if provided, a dict mapping table name to either "delete" or "append"
                                  default behavior is "delete"

        :return:
        """
        verify(sa, "sqlalchemy needs to be installed to use this subroutine")
        verify(engine.name == 'postgresql',
               "a sqlalchemy engine with name 'postgresql' is required")
        verify(not dsn or psycopg2,
               "need psycopg2 to use the faster dsn write option")
        self._check_good_pgtd_compatible_table_field_names()
        msg = []
        if not self.tdf.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " +
                              " : ".join(msg))
        verify(not self.tdf.generic_tables,
               "TicDat for postgres does not yet support generic tables")
        self.check_tables_fields(
            engine, schema,
            error_on_missing_table=True)  # call self.write_schema as needed
        self._handle_prexisting_rows(engine, schema, pre_existing_rows or {})
        if dsn:
            connect_kwargs = dsn if dsn and dictish(dsn) else {}
            connect_args = [dsn] if dsn and not dictish(dsn) else []
            with psycopg2.connect(*connect_args, **connect_kwargs) as db:
                with db.cursor() as cursor:
                    for k, v in self._get_data(tic_dat,
                                               schema,
                                               dump_format="dict").items():
                        psycopg2.extras.execute_values(cursor, k, v)
        else:
            all_dat = self._get_data(tic_dat, schema)
            if len(all_dat) > 1000:
                print(
                    "***pgtd.py not using most efficient data writing technique**"
                )
            for sql_str, data in all_dat:
                engine.execute(sql_str, data)
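A hedged end-to-end sketch of driving this writer. The pgsql accessor, connection URL, schema name, and dsn contents are assumptions not confirmed by the excerpt; write_schema is called first so the target tables exist.

    import sqlalchemy as sa
    from ticdat import TicDatFactory

    tdf = TicDatFactory(parameters=[["Name"], ["Value"]])
    dat = tdf.TicDat(parameters=[["max_weight", 100]])
    engine = sa.create_engine("postgresql://user:pass@localhost/mydb")  # assumed URL
    tdf.pgsql.write_schema(engine, "test_schema")   # assumed accessor for the postgres factory
    tdf.pgsql.write_data(dat, engine, "test_schema",
                         dsn={"dbname": "mydb", "user": "user",
                              "password": "pass", "host": "localhost"})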
Example #3
    def _assertSame(self, t1, t2, goodTicDatTable):

        if utils.dictish(t1) or utils.dictish(t2) :
            _ass = lambda _t1, _t2 : assertTicDatTablesSame(_t1, _t2,
                    _goodTicDatTable=  goodTicDatTable,
                    **({} if DEBUG() else {"_assertTrue":self.assertTrue, "_assertFalse":self.assertFalse}))

            _ass(t1, t2)
            _ass(t2, t1)
        else :
            setify = lambda t : set(t) if len(t) and not hasattr(t[0], "values") else {r.values() for r in t}
            self.assertTrue(setify(t1) == setify(t2))
Example #4
def convert_to_dicts_that_can_be_turned_into_DataFrames(
        tdf, dat, field_renamings=None):
    '''
    utility routine to help de-ticdat-ify small examples so that they can then be passed to
    amplpy team in a more easily understood notebook example with hard coded data.
    the inner dicts returned below can each be passed as an argument to pandas.DataFrame, and from there
    the `set_ampl_data` logic can be broken out explicitly
    :param tdf: a TicDatFactory
    :param dat: a TicDat object created by tdf
    :param field_renamings: the same argument used by copy_to_ampl
    :return:
    '''
    assert field_renamings is None or (utils.dictish(field_renamings) and
       all(utils.containerish(k) and len(k) == 2 and k[0] in tdf.all_tables and
           k[1] in tdf.primary_key_fields[k[0]] + tdf.data_fields[k[0]] and
           utils.stringish(v) and v not in tdf.primary_key_fields[k[0]] + tdf.data_fields[k[0]]
           for k,v in field_renamings.items())), "invalid field_renamings argument"
    dat = tdf.copy_to_pandas(dat, drop_pk_columns=False)

    def do_renames(t, df):
        for f in tdf.primary_key_fields[t] + tdf.data_fields[t]:
            if (t, f) in (field_renamings or []):
                df[field_renamings[t, f]] = df[f]
                df.drop(f, axis=1, inplace=True)
        return df

    rtn = {
        t: do_renames(t,
                      getattr(dat, t).reset_index(drop=True)).to_dict()
        for t in tdf.all_tables
    }
    return rtn
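The routine leans on pandas' DataFrame -> to_dict() -> DataFrame round trip; a tiny standalone illustration of that round trip (the column names are illustrative):

    import pandas as pd

    df = pd.DataFrame({"Food": ["pizza", "milk"], "Cost": [2.5, 1.0]})
    as_dict = df.reset_index(drop=True).to_dict()   # what the routine returns per table
    rebuilt = pd.DataFrame(as_dict)                 # what the notebook consumer would do
    assert list(rebuilt.columns) == ["Food", "Cost"]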
Example #5
 def _xls_write(self, tic_dat, file_path, tbl_name_mapping):
     verify(xlwt,
            "Can't write .xls files because xlwt package isn't installed.")
     tdf = self.tic_dat_factory
     book = xlwt.Workbook()
     for t in sorted(sorted(tdf.all_tables),
                     key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
         sheet = book.add_sheet(tbl_name_mapping[t][:_longest_sheet])
         for i, f in enumerate(
                 tdf.primary_key_fields.get(t, ()) +
                 tdf.data_fields.get(t, ())):
             sheet.write(0, i, f)
         _t = getattr(tic_dat, t)
         if utils.dictish(_t):
             for row_ind, (p_key, data) in enumerate(_t.items()):
                 for field_ind, cell in enumerate(
                     (p_key if containerish(p_key) else (p_key, )) +
                         tuple(data[_f]
                               for _f in tdf.data_fields.get(t, ()))):
                     sheet.write(row_ind + 1, field_ind, cell)
         else:
             for row_ind, data in enumerate(
                     _t if containerish(_t) else _t()):
                 for field_ind, cell in enumerate(
                         tuple(data[_f] for _f in tdf.data_fields[t])):
                     sheet.write(row_ind + 1, field_ind, cell)
     if os.path.exists(file_path):
         os.remove(file_path)
     book.save(file_path)
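The double sort above is easy to miss: an alphabetical pass, then a stable sort on primary-key count, so sheets are ordered by key length and alphabetized within each group. A standalone sketch with assumed table names:

    pk_len = {"parameters": 1, "foods": 1, "nutrition_quantities": 2, "categories": 1}
    order = sorted(sorted(pk_len), key=lambda t: pk_len[t])
    assert order == ["categories", "foods", "parameters", "nutrition_quantities"]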
Example #6
 def _create_tic_dat_dict(self, xls_file_path, row_offsets,
                          headers_present):
     verify(
         utils.dictish(row_offsets)
         and set(row_offsets).issubset(self.tic_dat_factory.all_tables)
         and all(
             utils.numericish(x) and (x >= 0)
             for x in row_offsets.values()),
         "row_offsets needs to map from table names to non negative row offset"
     )
     row_offsets = dict({t: 0
                         for t in self.tic_dat_factory.all_tables},
                        **row_offsets)
     tdf = self.tic_dat_factory
     rtn = {}
     sheets, field_indicies = self._get_sheets_and_fields(
         xls_file_path,
         set(tdf.all_tables).difference(tdf.generator_tables),
         row_offsets,
         headers_present,
         print_missing_tables=True)
     ho = 1 if headers_present else 0
     for tbl, sheet in sheets.items():
         fields = tdf.primary_key_fields.get(tbl, ()) + tdf.data_fields.get(
             tbl, ())
         assert fields or tbl in self.tic_dat_factory.generic_tables
         indicies = field_indicies[tbl]
         table_len = min(
             len(sheet.col_values(indicies[field]))
             for field in (fields or indicies))
         if tdf.primary_key_fields.get(tbl, ()):
             tableObj = {
                 self._sub_tuple(tbl, tdf.primary_key_fields[tbl],
                                 indicies)(x):
                 self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()),
                                 indicies)(x)
                 for x in (sheet.row_values(i)
                           for i in range(table_len)[row_offsets[tbl] +
                                                     ho:])
             }
         elif tbl in tdf.generic_tables:
             tableObj = [{
                 f: x[i]
                 for f, i in field_indicies[tbl].items()
             } for x in (sheet.row_values(i)
                         for i in range(table_len)[row_offsets[tbl] + ho:])]
         else:
             tableObj = [
                 self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()),
                                 indicies)(x)
                 for x in (sheet.row_values(i)
                           for i in range(table_len)[row_offsets[tbl] +
                                                     ho:])
             ]
         rtn[tbl] = tableObj
     for tbl in tdf.generator_tables:
         rtn[tbl] = self._create_generator_obj(xls_file_path, tbl,
                                               row_offsets[tbl],
                                               headers_present)
     return rtn
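The row_offsets normalization at the top is worth isolating: every table ends up with an explicit offset, with caller-supplied values overriding the zero default. A standalone sketch with assumed table names:

    all_tables = ("foods", "categories")
    row_offsets = dict({t: 0 for t in all_tables}, **{"foods": 2})
    assert row_offsets == {"foods": 2, "categories": 0}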
Example #7
def _apply_space_case_mapping(tdf, ticdat, mapping):
    """
    :param tdf: a TicDatFactory
    :param ticdat: a ticdat for the tdf
    :param mapping: the mapping returned by an earlier call to _try_create_space_case_mapping.
                    Should be the value stored under the "mapping" key, if a {"mapping": mapping}
                    dict was returned, and not the {"mapping": mapping} dict itself.
    :return:
    """
    assert tdf.good_tic_dat_object(
        ticdat), "ticdat not a good object for the tdf"
    assert tu.dictish(mapping)

    def apply_mapping(k):
        if containerish(k):
            return tuple(map(apply_mapping, k))
        return mapping.get(k, k)

    rtn = tdf.copy_tic_dat(ticdat)
    for t in tdf.all_tables:
        if tdf.primary_key_fields.get(t):
            for k, v in getattr(ticdat, t).items():
                del getattr(rtn, t)[k]
                getattr(rtn, t)[apply_mapping(k)] = v
    return rtn
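A standalone sketch of the key-rewriting idea (no ticdat needed): the mapping is applied directly to scalar keys and element-wise to tuple keys, and unmapped keys pass through unchanged.

    mapping = {"CHICKEN_SANDWICH": "chicken sandwich", "PROTEIN": "protein"}

    def apply_mapping(k):
        if isinstance(k, tuple):
            return tuple(apply_mapping(_) for _ in k)
        return mapping.get(k, k)

    assert apply_mapping("CHICKEN_SANDWICH") == "chicken sandwich"
    assert apply_mapping(("CHICKEN_SANDWICH", "fiber")) == ("chicken sandwich", "fiber")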
Example #8
 def writeData(insert_spaces):
     import xlwt
     book = xlwt.Workbook()
     for t in tdf.all_tables:
         sheet = book.add_sheet(
             t.replace("_", " " if insert_spaces else "_"))
         for i, f in enumerate(
                 tdf.primary_key_fields.get(t, ()) +
                 tdf.data_fields.get(t, ())):
             sheet.write(0, i, f)
         _t = getattr(ticDat, t)
         containerish = utils.containerish
         if utils.dictish(_t):
             for row_ind, (p_key, data) in enumerate(_t.items()):
                 for field_ind, cell in enumerate(
                     (p_key if containerish(p_key) else (p_key, )) +
                         tuple(data[_f]
                               for _f in tdf.data_fields.get(t, ()))):
                     sheet.write(row_ind + 1, field_ind, cell)
         else:
             for row_ind, data in enumerate(
                     _t if containerish(_t) else _t()):
                 for field_ind, cell in enumerate(
                         tuple(data[_f] for _f in tdf.data_fields[t])):
                     sheet.write(row_ind + 1, field_ind, cell)
     if os.path.exists(filePath):
         os.remove(filePath)
     book.save(filePath)
Example #9
 def _create_tic_dat_dict(self, jdict):
     tdf = self.tic_dat_factory
     rtn = {}
     table_keys = defaultdict(list)
     for t in tdf.all_tables:
         for t2 in jdict:
             if stringish(t2) and t.lower() == t2.replace(" ", "_").lower():
                 table_keys[t].append(t2)
         if len(table_keys[t]) >= 1:
             verify(
                 len(table_keys[t]) < 2,
                 "Found duplicate matching keys for table %s" % t)
             rtn[t] = jdict[table_keys[t][0]]
     orig_rtn, rtn = rtn, {}
     for t, rows in orig_rtn.items():
         all_fields = tdf.primary_key_fields.get(t,
                                                 ()) + tdf.data_fields.get(
                                                     t, ())
         rtn[t] = []
         for row in rows:
             if dictish(row):
                 rtn[t].append({
                     f: tdf._general_read_cell(t, f, x)
                     for f, x in row.items()
                 })
             else:
                 rtn[t].append([
                     tdf._general_read_cell(t, f, x)
                     for f, x in zip(all_fields, row)
                 ])
     return rtn
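The table-name matching above is case-insensitive and treats spaces as underscores; a one-line standalone check of that rule (the names are illustrative):

    jdict_keys = ["Nutrition Quantities", "foods"]
    table = "nutrition_quantities"
    matches = [k for k in jdict_keys if table.lower() == k.replace(" ", "_").lower()]
    assert matches == ["Nutrition Quantities"]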
Example #10
    def _create_jdict(self, path_or_buf):
        if stringish(path_or_buf) and os.path.exists(path_or_buf):
            reasonble_string = path_or_buf
            verify(os.path.isfile(path_or_buf),
                   "json_file_path is not a valid file path.")
            try:
                with open(path_or_buf, "r") as fp:
                    jdict = json.load(fp)
            except Exception as e:
                raise TicDatError("Unable to interpret %s as json file : %s" %
                                  (path_or_buf, e))
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            reasonble_string = path_or_buf[:10]
            try:
                jdict = json.loads(path_or_buf)
            except Exception as e:
                raise TicDatError(
                    "Unable to interpret %s as json string : %s" %
                    (reasonble_string, e))

        verify(dictish(jdict),
               "%s failed to load a dictionary" % reasonble_string)
        verify(
            all(map(stringish, jdict)),
            "The dictionary loaded from %s isn't indexed by strings" %
            reasonble_string)
        verify(
            all(map(containerish, jdict.values())),
            "The dictionary loaded from %s doesn't have containers as values" %
            reasonble_string)
        return jdict
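Stripped of the verify calls, the dispatch is: treat the argument as a file path when it names an existing file, otherwise parse it as a literal JSON string. A minimal standalone version of that pattern:

    import json
    import os

    def load_json_path_or_string(path_or_buf):
        # A path wins when it points at a real file; otherwise assume a JSON literal.
        if isinstance(path_or_buf, str) and os.path.isfile(path_or_buf):
            with open(path_or_buf, "r") as fp:
                return json.load(fp)
        return json.loads(path_or_buf)

    assert load_json_path_or_string('{"foods": []}') == {"foods": []}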
Example #11
 def _create_tic_dat(self, mdbFilePath):
     tdf = self.tic_dat_factory
     table_names = self._check_tables_fields(mdbFilePath, tdf.all_tables)
     rtn = {}
     with _connect(_connection_str(mdbFilePath)) as con:
         for table in set(tdf.all_tables).difference(tdf.generator_tables):
             fields = tdf.primary_key_fields.get(table,
                                                 ()) + tdf.data_fields.get(
                                                     table, ())
             rtn[table] = {} if tdf.primary_key_fields.get(table,
                                                           ()) else []
             with con.cursor() as cur:
                 cur.execute(
                     "Select %s from [%s]" %
                     (", ".join(_brackets(fields)), table_names[table]))
                 for row in cur.fetchall():
                     pk = row[:len(tdf.primary_key_fields.get(table, ()))]
                     data = list(
                         map(
                             _read_data,
                             row[len(tdf.primary_key_fields.get(table, (
                             ))):]))
                     if dictish(rtn[table]):
                         rtn[table][pk[0] if len(pk) ==
                                    1 else tuple(pk)] = data
                     else:
                         rtn[table].append(data)
     for table in tdf.generator_tables:
         rtn[table] = self._create_gen_obj(mdbFilePath, table,
                                           table_names[table])
     return rtn
Example #12
 def _get_data(self, tic_dat, as_sql):
     rtn = []
     for t in self.tic_dat_factory.all_tables:
         _t = getattr(tic_dat, t)
         if dictish(_t):
             primarykeys = tuple(self.tic_dat_factory.primary_key_fields[t])
             for pkrow, sqldatarow in _t.items():
                 _items = list(sqldatarow.items())
                 fields = primarykeys + tuple(x[0] for x in _items)
                 datarow = ((pkrow, ) if len(primarykeys) == 1 else
                            pkrow) + tuple(x[1] for x in _items)
                 assert len(datarow) == len(fields)
                 datarow = tuple(
                     self._write_data_cell(t, f, x)
                     for f, x in zip(fields, datarow))
                 str = "INSERT INTO [%s] (%s) VALUES (%s)" % (t, ",".join(
                     _brackets(fields)), ",".join("%s" if as_sql else "?"
                                                  for _ in fields))
                 if as_sql:
                     rtn.append((str %
                                 tuple(map(_insert_format, datarow))) + ";")
                 else:
                     rtn.append((str, datarow))
         else:
             for sqldatarow in (_t if containerish(_t) else _t()):
                 k, v = zip(*sqldatarow.items())
                 str = "INSERT INTO [%s] (%s) VALUES (%s)"%\
                          (t, ",".join(_brackets(k)), ",".join(
                             ["%s" if as_sql else "?"]*len(sqldatarow)))
                 if as_sql:
                     rtn.append((str % tuple(map(_insert_format, v))) + ";")
                 else:
                     rtn.append((str, v))
     return tuple(rtn)
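The INSERT construction is the same in both branches; here it is in isolation for the "?"-placeholder (non-as_sql) mode, with assumed table and field names:

    fields = ("Name", "Cost")
    sql = "INSERT INTO [%s] (%s) VALUES (%s)" % (
        "foods",
        ",".join("[%s]" % f for f in fields),
        ",".join("?" for _ in fields))
    assert sql == "INSERT INTO [foods] ([Name],[Cost]) VALUES (?,?)"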
Example #13
    def _create_tic_dat_from_con(self, con, table_names):
        tdf = self.tic_dat_factory
        rtn = {}
        for table in set(tdf.all_tables).difference(tdf.generator_tables):
            fields = tdf.primary_key_fields.get(table,
                                                ()) + tdf.data_fields.get(
                                                    table, ())
            if not fields:
                assert table in tdf.generic_tables
                fields = tuple(
                    x[1] for x in con.execute("PRAGMA table_info(%s)" % table))
            rtn[table] = {} if tdf.primary_key_fields.get(table, ()) else []
            for row in con.execute(
                    "Select %s from [%s]" %
                (", ".join(_brackets(fields)), table_names[table])):
                if table in tdf.generic_tables:
                    rtn[table].append(
                        {f: _read_data_format(d)
                         for f, d in zip(fields, row)})
                else:
                    pk = row[:len(tdf.primary_key_fields.get(table, ()))]
                    data = list(
                        map(_read_data_format,
                            row[len(tdf.primary_key_fields.get(table, ())):]))

                    if dictish(rtn[table]):
                        rtn[table][pk[0] if len(pk) == 1 else tuple(pk)] = data
                    else:
                        rtn[table].append(data)
        return rtn
Example #14
 def _xlsx_write(self, tic_dat, file_path, tbl_name_mapping):
     verify(xlsx, "Can't write .xlsx files because xlsxwriter package isn't installed.")
     tdf = self.tic_dat_factory
     if os.path.exists(file_path):
         os.remove(file_path)
     book = xlsx.Workbook(file_path)
     def clean_for_write(t, f, x):
         if self.tic_dat_factory.infinity_io_flag != "N/A" or \
            (t == "parameters" and self.tic_dat_factory.parameters):
             return self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
         if x in [float("inf"), -float("inf")] or isinstance(x, datetime.datetime):
             return str(x)
         return x
     for t in sorted(sorted(tdf.all_tables),
                      key=lambda x: len(tdf.primary_key_fields.get(x, ()))) :
         all_flds = self.tic_dat_factory.primary_key_fields.get(t, ()) + self.tic_dat_factory.data_fields.get(t, ())
         sheet = book.add_worksheet(tbl_name_mapping[t][:_longest_sheet])
         for i,f in enumerate(tdf.primary_key_fields.get(t,()) + tdf.data_fields.get(t, ())) :
             sheet.write(0, i, f)
         _t = getattr(tic_dat, t)
         if utils.dictish(_t) :
             for row_ind, (p_key, data) in enumerate(_t.items()) :
                 for field_ind, cell in enumerate( (p_key if containerish(p_key) else (p_key,)) +
                                     tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                     sheet.write(row_ind+1, field_ind, clean_for_write(t, all_flds[field_ind], cell))
         else :
             for row_ind, data in enumerate(_t if containerish(_t) else _t()) :
                 for field_ind, cell in enumerate(tuple(data[_f] for _f in tdf.data_fields[t])) :
                     sheet.write(row_ind+1, field_ind, clean_for_write(t, all_flds[field_ind], cell))
     book.close()
Example #15
 def write_directory(self,
                     tic_dat,
                     dir_path,
                     allow_overwrite=False,
                     dialect='excel',
                     write_header=True):
     """
     write the ticDat data to a collection of csv files
     :param tic_dat: the data object
     :param dir_path: the directory in which to write the csv files
     :param allow_overwrite: boolean - are we allowed to overwrite existing
                             files?
     :param dialect: the csv dialect. Consult csv documentation for details.
     :param write_header: Boolean. Should the header information be written
                          as the first row?
     :return:
     """
     verify(csv, "csv needs to be installed to use this subroutine")
     verify(dialect in csv.list_dialects(), "Invalid dialect %s" % dialect)
     verify(not os.path.isfile(dir_path),
            "A file is not a valid directory path")
     if self.tic_dat_factory.generic_tables:
         dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
         return tdf.csv.write_directory(dat, dir_path, allow_overwrite,
                                        dialect, write_header)
     tdf = self.tic_dat_factory
     msg = []
     if not self.tic_dat_factory.good_tic_dat_object(
             tic_dat, lambda m: msg.append(m)):
         raise TicDatError("Not a valid TicDat object for this schema : " +
                           " : ".join(msg))
     if not allow_overwrite:
         for t in tdf.all_tables:
             f = os.path.join(dir_path, t + ".csv")
             verify(not os.path.exists(f),
                    "The %s path exists and overwrite is not allowed" % f)
     if not os.path.isdir(dir_path):
         os.mkdir(dir_path)
     for t in tdf.all_tables:
         f = os.path.join(dir_path, t + ".csv")
         with open(f, 'w') as csvfile:
             writer = csv.DictWriter(
                 csvfile,
                 dialect=dialect,
                 fieldnames=tdf.primary_key_fields.get(t, ()) +
                 tdf.data_fields.get(t, ()))
             writer.writeheader() if write_header else None
             _t = getattr(tic_dat, t)
             if dictish(_t):
                 for p_key, data_row in _t.items():
                     primaryKeyDict = {
                         f: v
                         for f, v in zip(
                             tdf.primary_key_fields[t],
                             p_key if containerish(p_key) else (p_key, ))
                     }
                     writer.writerow(dict(data_row, **primaryKeyDict))
             else:
                 for data_row in (_t if containerish(_t) else _t()):
                     writer.writerow(dict(data_row))
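A minimal DictWriter round trip mirroring the per-table csv layout used above (a header row, then one dict per data row); the file name and fields are assumptions:

    import csv
    import os
    import tempfile

    path = os.path.join(tempfile.mkdtemp(), "foods.csv")
    with open(path, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["Name", "Cost"])
        writer.writeheader()
        writer.writerow({"Name": "pizza", "Cost": 2.5})
    with open(path) as csvfile:
        assert [row["Name"] for row in csv.DictReader(csvfile)] == ["pizza"]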
Example #16
    def set_ampl_data(self, ampl_dat, ampl, table_to_set_name = None):
        """
        performs bulk setData on the AMPL-esque first argument.

        :param ampl_dat: an AmplTicDat object created by calling copy_to_ampl

        :param ampl: an amplpy.AMPL object

        :param table_to_set_name: a mapping of table_name to ampl set name

        :return:
        """
        verify(all(a.startswith("_") or a in self.all_tables for a in dir(ampl_dat)),
               "bad ampl_dat argument")
        verify(hasattr(ampl, "setData"), "bad ampl argument")
        table_to_set_name = table_to_set_name or {}
        verify(dictish(table_to_set_name) and all(hasattr(ampl_dat, k) and
                   utils.stringish(v) for k,v in table_to_set_name.items()),
               "bad table_to_set_name argument")
        for t in set(self.all_tables).intersection(dir(ampl_dat)):
            try:
                ampl.setData(getattr(ampl_dat, t), *([table_to_set_name[t]]
                                                if t in table_to_set_name else []))
            except:
                raise utils.TicDatError(t + " cannot be passed as an argument to AMPL.setData()")
Example #17
 def __init__(self, **init_tables):
     superself._trigger_has_been_used()
     for t in init_tables :
         verify(t in superself.all_tables, "Unexpected table name %s"%t)
         tbl = safe_apply(DataFrame)(init_tables[t])
         if tbl is None and dictish(init_tables[t]) and all(map(stringish, init_tables[t])):
             tbl = safe_apply(DataFrame)(**init_tables[t])
         verify(isinstance(tbl, DataFrame),
                "Failed to provide a valid DataFrame or DataFrame construction argument for %s"%t)
         setattr(self, t, tbl.copy())
         df = getattr(self, t)
         if list(df.columns) == list(range(len(df.columns))) and \
            len(df.columns) >= len(superself._all_fields(t)):
             df.rename(columns={f1:f2 for f1, f2 in zip(df.columns, superself._all_fields(t))},
                       inplace=True)
     for t in set(superself.all_tables).difference(init_tables):
         setattr(self, t, DataFrame({f:[] for f in utils.all_fields(superself, t)}))
     missing_fields = {(t, f) for t in superself.all_tables for f in superself._all_fields(t)
                       if f not in getattr(self, t).columns}
     verify(not missing_fields,
            "The following are (table, field) pairs missing from the data.\n%s"%missing_fields)
     for t in superself.all_tables:
         af = list(superself._all_fields(t))
         df = getattr(self, t)
         if list(df.columns)[:len(af)] != af:
             extra_cols = [_ for _ in list(df.columns) if _ not in af]
             setattr(self, t, df[af + extra_cols])
             assert list(getattr(self, t)) == af + extra_cols
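The positional-column renaming near the middle deserves a note: a DataFrame built from raw rows gets integer column labels, which are then mapped onto the schema's field names. A standalone sketch with assumed fields:

    import pandas as pd

    all_fields = ("Name", "Cost")
    df = pd.DataFrame([["pizza", 2.5], ["milk", 1.0]])   # columns are 0, 1
    if list(df.columns) == list(range(len(df.columns))) and \
       len(df.columns) >= len(all_fields):
        df.rename(columns=dict(zip(df.columns, all_fields)), inplace=True)
    assert list(df.columns) == ["Name", "Cost"]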
Example #18
    def write_schema(self,
                     engine,
                     schema,
                     forced_field_types=None,
                     include_ancillary_info=True):
        """
        :param engine: typically a sqlalchemy database engine with a postgres driver (really just needs an .execute)

        :param schema: a string naming the postgres schema to populate (will create if needed)

        :param forced_field_types: A dictionary mapping (table, field) to a field type.
                                    Absent forcing, types are inferred from tic_dat_factory.data_types if possible,
                                    and otherwise set via the assumption that primary key fields are text and data
                                    fields are floats.
        :param include_ancillary_info: boolean. If False, no primary key or foreign key info will be written
        :return:
        """
        self._check_good_pgtd_compatible_table_field_names()
        forced_field_types = forced_field_types or {}
        all_fields = lambda t: self.tdf.primary_key_fields.get(t, (
        )) + self.tdf.data_fields.get(t, ())
        good_forced_field_type_entry = lambda k, v: isinstance(k, tuple) and len(k) == 2 \
                        and k[1] in all_fields(k[0]) and v in \
                        ["text", "integer", "float", "bool", "boolean", "timestamp"]
        verify(
            dictish(forced_field_types) and all(
                good_forced_field_type_entry(k, v)
                for k, v in forced_field_types.items()),
            "bad forced_field_types argument")
        if not include_ancillary_info:
            from ticdat import TicDatFactory
            tdf = TicDatFactory(**{
                t: [[], pks + dfs]
                for t, (pks, dfs) in self.tdf.schema().items()
            })
            for t, dts in self.tdf.data_types.items():
                for f, dt in dts.items():
                    tdf.set_data_type(t, f, *dt)
            forced_field_types_ = {
                (t, f): "text"
                for t, (pks, dfs) in self.tdf.schema().items() for f in pks
                if f not in tdf.data_types.get(t, {})
            }
            forced_field_types_.update(forced_field_types)
            return PostgresTicFactory(tdf).write_schema(
                engine, schema, forced_field_types_)

        verify(not getattr(self.tdf, "generic_tables", None),
               "TicDat for postgres does not yet support generic tables")

        if schema not in [
                row[0] for row in engine.execute(
                    "select schema_name from information_schema.schemata")
        ]:
            engine.execute(sa.schema.CreateSchema(schema))
        for str in self._get_schema_sql(self.tdf.all_tables, schema,
                                        forced_field_types):
            engine.execute(str)
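A hedged sketch of the forced_field_types argument shape: keys are (table, field) pairs and values are one of the accepted type strings; the table and field names below are assumptions, and the call itself is left commented out.

    forced_field_types = {("parameters", "Value"): "text",
                          ("foods", "Cost"): "float"}
    # pg_factory.write_schema(engine, "test_schema", forced_field_types)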
Example #19
 def find_duplicates(self,
                     xls_file_path,
                     row_offsets={},
                     headers_present=True):
     """
     Find the row counts for duplicated rows.
     :param xls_file_path: An Excel file containing sheets whose names match
                           the table names in the schema (non primary key tables ignored).
     :param row_offsets: (optional) A mapping from table names to initial
                         number of rows to skip (non primary key tables ignored)
     :param headers_present: Boolean. Does the first row of data contain the
                             column headers?
     caveats: Missing sheets resolve to an empty table, but missing primary fields
              on matching sheets throw an Exception.
              Sheet names are considered case insensitive.
     :return: A dictionary whose keys are the table names for the primary key tables.
              Each value of the return dictionary is itself a dictionary.
              The inner dictionary is keyed by the primary key values encountered
              in the table, and the value is the count of records in the
              Excel sheet with this primary key.
              Row counts smaller than 2 are pruned off, as they aren't duplicates
     """
     self._verify_differentiable_sheet_names()
     verify(xlrd, "xlrd needs to be installed to use this subroutine")
     verify(
         utils.dictish(row_offsets)
         and set(row_offsets).issubset(self.tic_dat_factory.all_tables)
         and all(
             utils.numericish(x) and (x >= 0)
             for x in row_offsets.values()),
         "row_offsets needs to map from table names to non negative row offset"
     )
     row_offsets = dict({t: 0
                         for t in self.tic_dat_factory.all_tables},
                        **row_offsets)
     tdf = self.tic_dat_factory
     pk_tables = tuple(t for t, _ in tdf.primary_key_fields.items() if _)
     rtn = {t: defaultdict(int) for t in pk_tables}
     sheets, fieldIndicies = self._get_sheets_and_fields(
         xls_file_path, pk_tables, row_offsets, headers_present)
     ho = 1 if headers_present else 0
     for table, sheet in sheets.items():
         fields = tdf.primary_key_fields[table] + tdf.data_fields.get(
             table, ())
         indicies = fieldIndicies[table]
         table_len = min(
             len(sheet.col_values(indicies[field])) for field in fields)
         for x in (sheet.row_values(i)
                   for i in range(table_len)[row_offsets[table] + ho:]):
             rtn[table][self._sub_tuple(table,
                                        tdf.primary_key_fields[table],
                                        indicies)(x)] += 1
     for t in list(rtn.keys()):
         rtn[t] = {k: v for k, v in rtn[t].items() if v > 1}
         if not rtn[t]:
             del (rtn[t])
     return rtn
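The core of find_duplicates is a defaultdict(int) count followed by pruning singletons; a standalone sketch of that pattern with assumed primary-key values:

    from collections import defaultdict

    pks = [("pizza",), ("milk",), ("pizza",)]
    counts = defaultdict(int)
    for pk in pks:
        counts[pk[0] if len(pk) == 1 else pk] += 1
    dups = {k: v for k, v in counts.items() if v > 1}
    assert dups == {"pizza": 2}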
Example #20
 def _good_inputset(self, inputset, message_writer = lambda x : x):
     if not (hasattr(inputset, "schema") and dictish(inputset.schema)):
         message_writer("Failed to find dictish schema attribute")
         return False
     if not (hasattr(inputset, "getTable") and callable(inputset.getTable)):
         message_writer("Failed to find callable getTable attribute")
         return False
     table_matchings = self._find_table_matchings(inputset)
     badly_matched = {t for t,v in table_matchings.items() if len(v) != 1}
     if badly_matched:
         message_writer("Following tables could not be uniquely resolved in inputset.schema\n%s"%
                        badly_matched)
         return False
     return True
Example #21
    def copy_to_ampl(self, pan_dat, field_renamings = None, excluded_tables = None):
        """
        copies the pan_dat object into a new pan_dat object populated with amplpy.DataFrame objects
        performs a deep copy

        :param pan_dat: a PanDat object

        :param field_renamings: dict or None. If fields are to be renamed in the copy, then
                                a mapping from (table_name, field_name) -> new_field_name
                                If a data field is to be omitted, then new_field can be falsey
                                table_name cannot refer to an excluded table. (see below)
                                field_name doesn't have to refer to an element of
                                self.data_fields[t], but it does have to refer to a column in
                                the pan_dat.table_name DataFrame

        :param excluded_tables: If truthy, a list of tables to be excluded from the copy.
                                Tables without primary key fields are always excluded.

        :return: a deep copy of the pan_dat argument into amplpy.DataFrames
        """
        verify(amplpy, "amplpy needs to be installed in order to enable AMPL functionality")
        msg  = []
        verify(self.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s"%"\n".join(msg))
        verify(not excluded_tables or (containerish(excluded_tables) and
                                       set(excluded_tables).issubset(self.all_tables)),
               "bad excluded_tables argument")
        copy_tables = {t for t in self.all_tables if self.primary_key_fields[t]}.\
                      difference(excluded_tables or [])
        field_renamings = field_renamings or {}
        verify(dictish(field_renamings), "invalid field_renamings argument")
        for k,v in field_renamings.items():
            verify(containerish(k) and len(k) == 2 and k[0] in copy_tables and
                   k[1] in getattr(pan_dat, k[0]).columns and
                   ((v and utils.stringish(v)) or (not bool(v) and k[1] not in self.primary_key_fields[k[0]])),
                   "invalid field_renamings argument %s:%s"%(k,v))
        class AmplPanDat(object):
            def __repr__(self):
                return "td:" + tuple(copy_tables).__repr__()
        rtn = AmplPanDat()
        for t in copy_tables:
            rename = lambda f : field_renamings.get((t, f), f)
            df_ampl = amplpy.DataFrame(index=tuple(map(rename, self.primary_key_fields[t])))
            for f in self.primary_key_fields[t]:
                df_ampl.setColumn(rename(f), list(getattr(pan_dat, t)[f]))
            for f in {f for _t,f in field_renamings if _t == t}.union(self.data_fields[t]):
                if rename(f):
                    df_ampl.addColumn(rename(f), list(getattr(pan_dat, t)[f]))
            setattr(rtn, t, df_ampl)
        return rtn
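A hedged usage sketch: the renaming and exclusion arguments are plain Python objects that can be assembled and sanity-checked before amplpy is touched. The table and field names here are assumptions, and the final call is shown commented out.

    field_renamings = {("nutrition_quantities", "Quantity"): "amt",
                       ("foods", "Cost"): "cost"}
    excluded_tables = ["parameters"]
    # ampl_dat = pdf.copy_to_ampl(dat, field_renamings, excluded_tables)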
Example #22
 def _create_jdict(self, json_file_path):
     verify(os.path.isfile(json_file_path), "json_file_path is not a valid file path.")
     try :
         with open(json_file_path, "r") as fp:
             jdict = json.load(fp)
     except Exception as e:
         raise TicDatError("Unable to interpret %s as json file : %s"%
                           (json_file_path, e))
     verify(dictish(jdict), "%s failed to load a dictionary"%json_file_path)
     verify(all(map(stringish, jdict)),
            "The dictionary loaded from %s isn't indexed by strings"%json_file_path)
     verify(all(map(containerish, jdict.values())),
            "The dictionary loaded from %s doesn't have containers as values"%json_file_path)
     return jdict
Example #23
    def create_pan_dat(self, path_or_buf, fill_missing_fields=False, orient='split', **kwargs):
        """
        Create a PanDat object from a JSON file or string

        :param path_or_buf:  a valid JSON string or file-like

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :param orient: Indication of expected JSON string format. See pandas.read_json for more details.

        :param kwargs: additional named arguments to pass to pandas.read_json

        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always throw an Exception.
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
                 +- "inf", "-inf" strings will be converted to +-float("inf")
        """
        if os.path.exists(path_or_buf):
            verify(os.path.isfile(path_or_buf), "%s appears to be a directory and not a file." % path_or_buf)
            with open(path_or_buf, "r") as f:
                loaded_dict = json.load(f)
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            loaded_dict = json.loads(path_or_buf)
        verify(dictish(loaded_dict), "the json.load of path_or_buf failed to resolve to a dictionary")
        verify(all(map(dictish, loaded_dict.values())),
               "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries")

        tbl_names = self._get_table_names(loaded_dict)
        verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
        rtn = {t: pd.read_json(json.dumps(loaded_dict[f]), orient=orient, **kwargs) for t,f in tbl_names.items()}
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t,f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, field) pairs are missing fields.\n%s" % [(t, f) for t,f in missing_fields])
        for v in rtn.values():
            v.replace("inf", float("inf"), inplace=True)
            v.replace("-inf", -float("inf"), inplace=True)
        rtn = self.pan_dat_factory.PanDat(**rtn)
        msg = []
        assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
        return rtn
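The reader rebuilds each table with pandas.read_json on an orient="split" payload; a standalone round trip of that step (the column names are assumed):

    import json
    import pandas as pd

    df = pd.DataFrame({"Name": ["pizza"], "Cost": [2.5]})
    payload = json.loads(df.to_json(orient="split"))
    rebuilt = pd.read_json(json.dumps(payload), orient="split")
    assert list(rebuilt.columns) == ["Name", "Cost"]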
Example #24
 def _create_tic_dat_from_con(self, con, table_names):
     missing_tables = sorted(
         set(self.tic_dat_factory.all_tables).difference(table_names))
     if missing_tables:
         print(
             "The following table names could not be found in the SQLite database.\n%s\n"
             % "\n".join(missing_tables))
     tdf = self.tic_dat_factory
     rtn = {}
     for table in set(tdf.all_tables).difference(
             tdf.generator_tables).difference(missing_tables):
         fields = tdf.primary_key_fields.get(table,
                                             ()) + tdf.data_fields.get(
                                                 table, ())
         if not fields:
             assert table in tdf.generic_tables
             fields = tuple(
                 x[1] for x in con.execute("PRAGMA table_info(%s)" % table))
         rtn[table] = {} if tdf.primary_key_fields.get(table, ()) else []
         for row in con.execute(
                 "Select %s from [%s]" %
             (", ".join(_brackets(fields)), table_names[table])):
             if table in tdf.generic_tables:
                 rtn[table].append({
                     f: self._read_data_cell(table, f, d)
                     for f, d in zip(fields, row)
                 })
             else:
                 pkfs = tdf.primary_key_fields.get(table, ())
                 pk = tuple(
                     self._read_data_cell(table, f, x)
                     for f, x in zip(pkfs, row[:len(pkfs)]))
                 data = [
                     self._read_data_cell(table, f, x)
                     for f, x in zip(fields[len(pkfs):], row[len(pkfs):])
                 ]
                 if dictish(rtn[table]):
                     rtn[table][pk[0] if len(pk) == 1 else tuple(pk)] = data
                 else:
                     rtn[table].append(data)
     return rtn
Example #25
    def _xlsx_write(self, tic_dat, file_path, tbl_name_mapping):
        verify(
            xlsx,
            "Can't write .xlsx files because xlsxwriter package isn't installed."
        )
        tdf = self.tic_dat_factory
        if os.path.exists(file_path):
            os.remove(file_path)
        book = xlsx.Workbook(file_path)

        def clean_inf(x):
            if x == float("inf"):
                return "inf"
            if x == -float("inf"):
                return "-inf"
            return x

        for t in sorted(sorted(tdf.all_tables),
                        key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
            sheet = book.add_worksheet(tbl_name_mapping[t])
            for i, f in enumerate(
                    tdf.primary_key_fields.get(t, ()) +
                    tdf.data_fields.get(t, ())):
                sheet.write(0, i, f)
            _t = getattr(tic_dat, t)
            if utils.dictish(_t):
                for row_ind, (p_key, data) in enumerate(_t.items()):
                    for field_ind, cell in enumerate(
                        (p_key if containerish(p_key) else (p_key, )) +
                            tuple(data[_f]
                                  for _f in tdf.data_fields.get(t, ()))):
                        sheet.write(row_ind + 1, field_ind, clean_inf(cell))
            else:
                for row_ind, data in enumerate(
                        _t if containerish(_t) else _t()):
                    for field_ind, cell in enumerate(
                            tuple(data[_f] for _f in tdf.data_fields[t])):
                        sheet.write(row_ind + 1, field_ind, clean_inf(cell))
        book.close()
Example #26
 def _create_tic_dat(self, mdbFilePath):
     tdf = self.tic_dat_factory
     table_names = self._check_tables_fields(mdbFilePath, tdf.all_tables)
     missing_tables = sorted(
         set(self.tic_dat_factory.all_tables).difference(table_names))
     if missing_tables:
         print(
             "The following table names could not be found in the %s Access database.\n%s\n"
             % (mdbFilePath, "\n".join(missing_tables)))
     rtn = {}
     with _connect(_connection_str(mdbFilePath)) as con:
         for table in set(tdf.all_tables).difference(
                 tdf.generator_tables).difference(missing_tables):
             fields = tdf.primary_key_fields.get(table,
                                                 ()) + tdf.data_fields.get(
                                                     table, ())
             rtn[table] = {} if tdf.primary_key_fields.get(table,
                                                           ()) else []
             with con.cursor() as cur:
                 cur.execute(
                     "Select %s from [%s]" %
                     (", ".join(_brackets(fields)), table_names[table]))
                 for row_ in cur.fetchall():
                     row = [
                         tdf._general_read_cell(table, f, x)
                         for f, x in zip(fields, row_)
                     ]
                     pk = row[:len(tdf.primary_key_fields.get(table, ()))]
                     data = row[len(tdf.primary_key_fields.get(table, ())):]
                     if dictish(rtn[table]):
                         rtn[table][pk[0] if len(pk) ==
                                    1 else tuple(pk)] = data
                     else:
                         rtn[table].append(data)
     for table in set(tdf.generator_tables).difference(missing_tables):
         rtn[table] = self._create_gen_obj(mdbFilePath, table,
                                           table_names[table])
     return rtn
Example #27
    def set_default_values(self, **tableDefaults):
        """
        sets the default values for the fields

        :param tableDefaults:
             A dictionary of named arguments. Each argument name (i.e. each key) should be a table name
             Each value should itself be a dictionary mapping data field names to default values

        Ex:

        ```tdf.set_default_values(categories = {"minNutrition":0, "maxNutrition":float("inf")},
                         foods = {"cost":0}, nutritionQuantities = {"qty":0})```

        :return:
        """
        verify(not self._has_been_used,
               "The default values can't be changed after a PanDatFactory has been used.")
        for k,v in tableDefaults.items():
            verify(k in self.all_tables, "Unrecognized table name %s"%k)
            verify(dictish(v) and set(v).issubset(self.data_fields[k] + self.primary_key_fields[k]),
                "Default values for %s should be a dictionary mapping field names to values"
                %k)
            verify(all(utils.acceptable_default(_v) for _v in v.values()), "some default values are unacceptable")
            self._default_values[k] = dict(self._default_values[k], **v)
Example #28
def _deep_anonymize(x)  :
    if not hasattr(x, "__contains__") or utils.stringish(x):
        return x
    if utils.dictish(x) :
        return {_deep_anonymize(k):_deep_anonymize(v) for k,v in x.items()}
    return list(map(_deep_anonymize,x))
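A self-contained variant of the same recursion using plain isinstance checks, to show the behavior on a small nested structure (note that tuples come back as lists):

    def deep_copy_containers(x):
        if isinstance(x, dict):
            return {deep_copy_containers(k): deep_copy_containers(v) for k, v in x.items()}
        if hasattr(x, "__contains__") and not isinstance(x, str):
            return list(map(deep_copy_containers, x))
        return x

    assert deep_copy_containers({"foods": [("pizza", 2.5)]}) == {"foods": [["pizza", 2.5]]}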
Example #29
def assertTicDatTablesSame(t1, t2, _goodTicDatTable,
                           _assertTrue = assertTrue, _assertFalse = assertFalse) :
    _assertTrue(set(t1) == set(t2))
    _assertTrue(_goodTicDatTable(t1) and _goodTicDatTable(t2))
    if not dictish(t1) and not dictish(t2) :
        return
    if dictish(t1) != dictish(t2) and dictish(t2) :
        t1,t2 = t2,t1
    if not dictish(t2) :
        _assertTrue(all(containerish(x) and len(x) == 0 for x in t1.values()))
        return
    for k1,v1 in t1.items() :
        v2 = t2[k1]
        if dictish(v1) != dictish(v2) and dictish(v2) :
            v2, v1 = v1, v2
        if dictish(v1) and dictish(v2) :
            _assertTrue(set(v1) == set(v2))
            for _k1 in v1 :
                _assertTrue(v1[_k1] == v2[_k1])
        elif dictish(v1) and containerish(v2) :
            _assertTrue(sorted(v1.values()) == sorted(v2))
        elif dictish(v1) :
            _assertTrue(len(v1) == 1 and list(v1.values())[0] == v2)
        else :
            if containerish(v1) != containerish(v2) and containerish(v2) :
                v2, v1 = v1, v2
            if containerish(v1) and containerish(v2) :
                _assertTrue(len(v1) == len(v2))
                _assertTrue(all(v1[x] == v2[x] for x in range(len(v1))))
            elif containerish(v1) :
                _assertTrue(len(v1) == 1 and v1[0] == v2)
            else :
                _assertTrue(v1 == v2)
Example #30
    def testSilly(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**sillyMeSchema())
        ticDat = tdf.TicDat(**sillyMeData())
        schema2 = sillyMeSchema()
        schema2["b"][0] = ("bField2", "bField1", "bField3")
        schema3 = sillyMeSchema()
        schema3["a"][1] = ("aData2", "aData3", "aData1")
        schema4 = sillyMeSchema()
        schema4["a"][1] = ("aData1", "aData3")
        schema5 = sillyMeSchema()
        _tuple = lambda x: tuple(x) if utils.containerish(x) else (x, )
        for t in ("a", "b"):
            schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
        schema5["a"][0], schema5["b"][0] = (), []
        schema6 = sillyMeSchema()
        schema6["d"] = [["dField"], ()]

        tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x)
                                        for x in (schema2, schema3, schema4,
                                                  schema5, schema6))
        tdf5.set_generator_tables(("a", "c"))
        filePath = os.path.join(_scratchDir, "silly.xls")
        tdf.xls.write_file(ticDat, filePath)

        ticDat2 = tdf2.xls.create_tic_dat(filePath)
        self.assertFalse(tdf._same_data(ticDat, ticDat2))

        ticDat3 = tdf3.xls.create_tic_dat(filePath)
        self.assertTrue(tdf._same_data(ticDat, ticDat3))

        ticDat4 = tdf4.xls.create_tic_dat(filePath)
        for t in ["a", "b"]:
            for k, v in getattr(ticDat4, t).items():
                for _k, _v in v.items():
                    self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
                if set(v) == set(getattr(ticDat, t)[k]):
                    self.assertTrue(t == "b")
                else:
                    self.assertTrue(t == "a")

        ticDat5 = tdf5.xls.create_tic_dat(filePath,
                                          treat_inf_as_infinity=False)
        self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
        self.assertTrue(
            callable(ticDat5.a) and callable(ticDat5.c)
            and not callable(ticDat5.b))

        ticDat6 = tdf6.xls.create_tic_dat(filePath)
        self.assertTrue(tdf._same_data(ticDat, ticDat6))
        self.assertTrue(
            firesException(lambda: tdf6._same_data(ticDat, ticDat6)))
        self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))

        def writeData(data, write_header="same"):
            assert filePath.endswith(".xls")
            assert not write_header or write_header in ("lower", "same",
                                                        "duped")
            import xlwt
            book = xlwt.Workbook()
            for t in tdf.all_tables:
                sheet = book.add_sheet(t)
                if write_header:
                    all_fields = tdf.primary_key_fields.get(
                        t, ()) + tdf.data_fields.get(t, ())
                    for i, f in enumerate(
                        (2 if write_header == "duped" else 1) * all_fields):
                        sheet.write(
                            0, i,
                            f.lower() if write_header == "lower"
                            or i >= len(all_fields) else f)
                for rowInd, row in enumerate(data):
                    for fieldInd, cellValue in enumerate(
                        (2 if write_header == "duped" else 1) * row):
                        sheet.write(rowInd + (1 if write_header else 0),
                                    fieldInd, cellValue)
            if os.path.exists(filePath):
                os.remove(filePath)
            book.save(filePath)
            if write_header in [
                    "lower", "same"
            ]:  # will use pandas to generate the xlsx file version
                file_path_x = filePath + "x"
                if os.path.exists(file_path_x):
                    os.remove(file_path_x)
                writer = utils.pd.ExcelWriter(file_path_x)
                for t, (pks, dfs) in tdf.schema().items():
                    fields = pks + dfs
                    if write_header == "lower":
                        fields = [_.lower() for _ in fields]
                    d = {f: [] for f in fields}
                    for row in data:
                        for f, c in zip(fields, row):
                            d[f].append(c)
                    utils.pd.DataFrame(d).to_excel(writer, t, index=False)
                writer.save()

        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)],
                  write_header="duped")
        self.assertTrue(
            self.firesException(
                lambda: tdf.xls.create_tic_dat(filePath, freeze_it=True)))

        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
        ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True)
        self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
        self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
        for f in [filePath, filePath + "x"]:
            rowCount = tdf.xls.find_duplicates(f)
            self.assertTrue(
                set(rowCount) == {'a'} and set(rowCount["a"]) == {1}
                and rowCount["a"][1] == 2)

        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)],
                  write_header="lower")
        ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True)
        self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
        self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
        for f in [filePath, filePath + "x"]:
            rowCount = tdf.xls.find_duplicates(f)
            self.assertTrue(
                set(rowCount) == {'a'} and set(rowCount["a"]) == {1}
                and rowCount["a"][1] == 2)

        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)],
                  write_header=False)
        self.assertTrue(
            self.firesException(
                lambda: tdf.xls.create_tic_dat(filePath, freeze_it=True)))
        ticDatMan = tdf.xls.create_tic_dat(filePath,
                                           freeze_it=True,
                                           headers_present=False)
        self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
        self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
        rowCount = tdf.xls.find_duplicates(filePath, headers_present=False)
        self.assertTrue(
            set(rowCount) == {'a'} and set(rowCount["a"]) == {1}
            and rowCount["a"][1] == 2)

        ticDat.a["theboger"] = (1, None, 12)
        tdf.xls.write_file(ticDat, filePath, allow_overwrite=True)
        ticDatNone = tdf.xls.create_tic_dat(filePath, freeze_it=True)
        # THIS IS A FLAW - but a minor one. None's are hard to represent. It is turning into the empty string here.
        # not sure how to handle this, but documenting for now.
        self.assertFalse(tdf._same_data(ticDat, ticDatNone))
        self.assertTrue(ticDatNone.a["theboger"]["aData2"] == "")
        # the workaround for this flaw is to set the data type to be nullable but not allow the empty string
        tdfwa = TicDatFactory(**sillyMeSchema())
        tdfwa.set_data_type("a", "aData2", nullable=True)
        ticDatNone = tdfwa.xls.create_tic_dat(filePath, freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, ticDatNone))
        self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)

        # checking the same thing with .xlsx - using openpyxl, None is indeed recovered even without tdfwa munging!
        tdf.xls.write_file(ticDat, filePath + "x", allow_overwrite=True)
        ticDatNone = tdf.xls.create_tic_dat(filePath + "x", freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, ticDatNone))
        self.assertTrue(ticDatNone.a["theboger"]["aData2"] is None)
        ticDatNone = tdfwa.xls.create_tic_dat(filePath + "x", freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, ticDatNone))
        self.assertTrue(ticDatNone.a["theboger"]["aData2"] is None)

        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40),
                   (1, 20, 30, 12)])
        for f in [filePath, filePath + "x"]:
            rowCount = tdf.xls.find_duplicates(f)
            self.assertTrue(
                set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1}
                and rowCount["a"][1] == 3)
            self.assertTrue(
                set(rowCount["b"]) == {(1, 20, 30)}
                and rowCount["b"][1, 20, 30] == 2)
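The duplicate checks just above rely on find_duplicates returning a nested dict keyed first by table name and then by primary key, with the number of rows that share that key. Below is a minimal, self-contained sketch of that shape; the one-table schema and the file name are hypothetical, and it assumes ticdat, pandas, and openpyxl are installed.

import pandas as pd
from ticdat import TicDatFactory

tdf = TicDatFactory(a=[["aField"], ["aData"]])
# write a sheet in which primary key 1 appears twice
pd.DataFrame({"aField": [1, 1, 10], "aData": [2, 20, 20]}).to_excel(
    "dup_demo.xlsx", sheet_name="a", index=False)

dups = tdf.xls.find_duplicates("dup_demo.xlsx")
# expected shape: {table: {primary_key: row_count}}, restricted to repeated keys,
# so here dups should be {"a": {1: 2}}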
Example #31
0
def assertTicDatTablesSame(t1, t2, _goodTicDatTable,
                           _assertTrue = assertTrue, _assertFalse = assertFalse) :
    _assertTrue(set(t1) == set(t2))
    _assertTrue(_goodTicDatTable(t1) and _goodTicDatTable(t2))
    if not dictish(t1) and not dictish(t2) :
        return
    if dictish(t1) != dictish(t2) and dictish(t2) :
        t1,t2 = t2,t1
    if not dictish(t2) :
        _assertTrue(all(containerish(x) and len(x) == 0 for x in t1.values()))
        return
    for k1,v1 in t1.items() :
        v2 = t2[k1]
        if dictish(v1) != dictish(v2) and dictish(v2) :
            v2, v1 = v1, v2
        if dictish(v1) and dictish(v2) :
            _assertTrue(set(v1) == set(v2))
            for _k1 in v1 :
                _assertTrue(v1[_k1] == v2[_k1])
        elif dictish(v1) and containerish(v2) :
            _assertTrue(sorted(map(str, v1.values())) == sorted(map(str, v2)))
        elif dictish(v1) :
            _assertTrue(len(v1) == 1 and next(iter(v1.values())) == v2)
        else :
            if containerish(v1) != containerish(v2) and containerish(v2) :
                v2, v1 = v1, v2
            if containerish(v1) and containerish(v2) :
                _assertTrue(len(v1) == len(v2))
                _assertTrue(all(v1[x] == v2[x] for x in range(len(v1))))
            elif containerish(v1) :
                _assertTrue(len(v1) == 1 and v1[0] == v2)
            else :
                _assertTrue(v1 == v2)
Example #32
0
        def doTest(headersPresent) :
            tdf = TicDatFactory(**sillyMeSchema())
            ticDat = tdf.TicDat(**sillyMeData())
            schema2 = sillyMeSchema()
            schema2["b"][0] = ("bField2", "bField1", "bField3")
            schema3 = sillyMeSchema()
            schema3["a"][1] = ("aData2", "aData3", "aData1")
            schema4 = sillyMeSchema()
            schema4["a"][1] = ("aData1", "aData3")
            schema5 = sillyMeSchema()
            _tuple = lambda x : tuple(x) if utils.containerish(x) else (x,)
            for t in ("a", "b") :
                schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
            schema5["a"][0], schema5["b"][0] = (), []
            schema5b = sillyMeSchema()
            for t in ("a", "b") :
                schema5b[t][1] = _tuple(schema5b[t][0]) + _tuple(schema5b[t][1])
            schema5b["a"][0], schema5b["b"][0] = (), []
            schema6 = sillyMeSchema()
            schema6["d"] = [("dField",),[]]

            tdf2, tdf3, tdf4, tdf5, tdf5b, tdf6 = (TicDatFactory(**x) for x in
                            (schema2, schema3, schema4, schema5, schema5b, schema6))
            tdf5.set_generator_tables(["a", "c"])
            tdf5b.set_generator_tables(("a", "c"))


            dirPath = makeCleanDir(os.path.join(_scratchDir, "silly"))
            tdf.csv.write_directory(ticDat, dirPath, write_header=headersPresent)

            ticDat2 = tdf2.csv.create_tic_dat(dirPath, headers_present=headersPresent)
            (self.assertFalse if headersPresent else self.assertTrue)(tdf._same_data(ticDat, ticDat2))

            ticDat3 = tdf3.csv.create_tic_dat(dirPath, headers_present=headersPresent)
            (self.assertTrue if headersPresent else self.assertFalse)(tdf._same_data(ticDat, ticDat3))

            if headersPresent :
                ticDat4 = tdf4.csv.create_tic_dat(dirPath, headers_present=headersPresent)
                for t in ("a", "b") :
                    for k,v in getattr(ticDat4, t).items() :
                        for _k, _v in v.items() :
                            self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
                        if set(v) == set(getattr(ticDat, t)[k]) :
                            self.assertTrue(t == "b")
                        else :
                            self.assertTrue(t == "a")
            else :
                self.assertTrue(self.firesException(lambda :
                                    tdf4.csv.create_tic_dat(dirPath, headers_present=headersPresent)))

            ticDat5 = tdf5.csv.create_tic_dat(dirPath, headers_present=headersPresent)
            (self.assertTrue if headersPresent else self.assertFalse)(
                                                    tdf5._same_data(tdf._keyless(ticDat), ticDat5))
            self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and not callable(ticDat5.b))

            ticDat5b = tdf5b.csv.create_tic_dat(dirPath, headers_present=headersPresent)
            self.assertTrue(tdf5b._same_data(tdf._keyless(ticDat), ticDat5b))
            self.assertTrue(callable(ticDat5b.a) and callable(ticDat5b.c) and not callable(ticDat5b.b))


            ticDat6 = tdf6.csv.create_tic_dat(dirPath, headers_present=headersPresent)
            self.assertTrue(tdf._same_data(ticDat, ticDat6))
            self.assertTrue(firesException(lambda : tdf6._same_data(ticDat, ticDat6)))
            self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))
            allDataTdf = TicDatFactory(**{t:[[], tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())]
                             for t in tdf.all_tables})

            def writeData(data):
                td = allDataTdf.TicDat(a = data, b=data, c=data)
                allDataTdf.csv.write_directory(td, dirPath, allow_overwrite=True, write_header=headersPresent)

            writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
            ticDatMan = tdf.csv.create_tic_dat(dirPath, headers_present=headersPresent, freeze_it=True)
            self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
            self.assertTrue(ticDatMan.b[(1, 20, 30)]["bData"] == 40)
            rowCount = tdf.csv.get_duplicates(dirPath, headers_present=headersPresent)
            self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==2)


            writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1,20,30,12)])
            rowCount = tdf.csv.get_duplicates(dirPath, headers_present=headersPresent)
            self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==3)
            self.assertTrue(set(rowCount["b"]) == {(1,20,30)} and rowCount["b"][1,20,30]==2)
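The doTest routine above repeatedly exercises the same csv round trip: write_directory followed by create_tic_dat with a matching headers_present flag. Below is a minimal, self-contained sketch of that pattern with a hypothetical two-field schema; it assumes ticdat is installed.

import tempfile
from ticdat import TicDatFactory

tdf = TicDatFactory(parts=[["Name"], ["Cost"]])
dat = tdf.TicDat(parts={"widget": [1.5], "gadget": [2.5]})

dir_path = tempfile.mkdtemp()
tdf.csv.write_directory(dat, dir_path, allow_overwrite=True, write_header=True)
dat2 = tdf.csv.create_tic_dat(dir_path, headers_present=True)
assert tdf._same_data(dat, dat2)  # the round trip preserves the data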
Example #33
0
    def create_pan_dat(self,
                       path_or_buf,
                       fill_missing_fields=False,
                       orient='split',
                       **kwargs):
        """
        Create a PanDat object from a JSON file or string

        :param path_or_buf:  a valid JSON string or file-like

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :param orient: Indication of expected JSON string format. See pandas.read_json for more details.

        :param kwargs: additional named arguments to pass to pandas.read_json

        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always resolve to an empty table.

                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.

                 (ticdat supports whitespace in field names but not table names).

        Note that if you save a DataFrame to json and then recover it, the data types might change.
        Specifically, text that looks numeric might be recovered as a number, including the loss of leading zeros.
        To address this, either use set_data_type for your
        PanDatFactory, or specify "dtype" in kwargs. (The former is generally the better choice.)
        """
        if stringish(path_or_buf) and os.path.exists(path_or_buf):
            verify(
                os.path.isfile(path_or_buf),
                "%s appears to be a directory and not a file." % path_or_buf)
            with open(path_or_buf, "r") as f:
                loaded_dict = json.load(f)
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            loaded_dict = json.loads(path_or_buf)
        verify(dictish(loaded_dict),
               "the json.load result doesn't resolve to a dictionary")
        verify(
            all(map(dictish, loaded_dict.values())),
            "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries"
        )

        tbl_names = self._get_table_names(loaded_dict)
        verify("orient" not in kwargs,
               "orient should be passed as a non-kwargs argument")
        rtn = {}
        for t, f in tbl_names.items():
            kwargs_ = dict(kwargs)
            if "dtype" not in kwargs_:
                kwargs_["dtype"] = self.pan_dat_factory._dtypes_for_pandas_read(t)
            rtn[t] = pd.read_json(json.dumps(loaded_dict[f]),
                                  orient=orient,
                                  **kwargs_)
        missing_fields = {(t, f)
                          for t in rtn
                          for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t, f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(
            fill_missing_fields or not missing_fields,
            "The following (table, field) pairs are missing fields.\n%s" %
            [(t, f) for t, f in missing_fields])
        missing_tables = sorted(
            set(self.pan_dat_factory.all_tables).difference(rtn))
        if missing_tables:
            print(
                "The following table names could not be found in the JSON data.\n%s\n"
                % "\n".join(missing_tables))
        return _clean_pandat_creator(self.pan_dat_factory, rtn, json_read=True)
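Below is a minimal usage sketch for the reader above. The schema, field names, and JSON payload are hypothetical, and it assumes ticdat is installed and that the factory exposes this reader through its json attribute, as elsewhere in the library. Declaring a text data type keeps numeric-looking strings, such as values with leading zeros, from being coerced to numbers on read, per the caveat in the docstring.

from ticdat import PanDatFactory

pdf = PanDatFactory(parts=[["Name"], ["Cost"]])
# force the Name field to be read as text so "007" does not become the number 7
pdf.set_data_type("parts", "Name", number_allowed=False, strings_allowed="*")

json_str = '{"parts": {"columns": ["Name", "Cost"], "index": [0], "data": [["007", 1.5]]}}'
pan_dat = pdf.json.create_pan_dat(json_str, orient="split")
# pan_dat.parts is a pandas DataFrame with the Name column preserved as text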
Example #34
0
    def testSilly(self):
        tdf = TicDatFactory(**sillyMeSchema())
        ticDat = tdf.TicDat(**sillyMeData())
        schema2 = sillyMeSchema()
        schema2["b"][0] = ("bField2", "bField1", "bField3")
        schema3 = sillyMeSchema()
        schema3["a"][1] = ("aData2", "aData3", "aData1")
        schema4 = sillyMeSchema()
        schema4["a"][1] = ("aData1", "aData3")
        schema5 = sillyMeSchema()
        _tuple = lambda x : tuple(x) if utils.containerish(x) else (x,)
        for t in ("a", "b") :
            schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
        schema5["a"][0], schema5["b"][0] =  (),  []
        schema6 = sillyMeSchema()
        schema6["d"] =  [["dField"],()]

        tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in (schema2, schema3, schema4, schema5, schema6))
        tdf5.set_generator_tables(("a","c"))
        filePath = os.path.join(_scratchDir, "silly.xls")
        tdf.xls.write_file(ticDat, filePath)

        ticDat2 = tdf2.xls.create_tic_dat(filePath)
        self.assertFalse(tdf._same_data(ticDat, ticDat2))

        ticDat3 = tdf3.xls.create_tic_dat(filePath)
        self.assertTrue(tdf._same_data(ticDat, ticDat3))

        ticDat4 = tdf4.xls.create_tic_dat(filePath)
        for t in ["a","b"]:
            for k,v in getattr(ticDat4, t).items() :
                for _k, _v in v.items() :
                    self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
                if set(v) == set(getattr(ticDat, t)[k]) :
                    self.assertTrue(t == "b")
                else :
                    self.assertTrue(t == "a")

        ticDat5 = tdf5.xls.create_tic_dat(filePath)
        self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
        self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and not callable(ticDat5.b))

        ticDat6 = tdf6.xls.create_tic_dat(filePath)
        self.assertTrue(tdf._same_data(ticDat, ticDat6))
        self.assertTrue(firesException(lambda : tdf6._same_data(ticDat, ticDat6)))
        self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))

        def writeData(data, write_header = True):
            import xlwt
            book = xlwt.Workbook()
            for t in tdf.all_tables :
                sheet = book.add_sheet(t)
                if write_header :
                    for i,f in enumerate(tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())) :
                        sheet.write(0, i, f)
                for rowInd, row in enumerate(data) :
                    for fieldInd, cellValue in enumerate(row):
                        sheet.write(rowInd+ (1 if write_header else 0), fieldInd, cellValue)
            if os.path.exists(filePath):
                os.remove(filePath)
            book.save(filePath)

        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
        ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True)
        self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
        self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
        rowCount = tdf.xls.get_duplicates(filePath)
        self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==2)

        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)], write_header=False)
        self.assertTrue(self.firesException(lambda  : tdf.xls.create_tic_dat(filePath, freeze_it=True)))
        ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True, headers_present=False)
        self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
        self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
        rowCount = tdf.xls.get_duplicates(filePath, headers_present=False)
        self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==2)

        ticDat.a["theboger"] = (1, None, 12)
        tdf.xls.write_file(ticDat, filePath, allow_overwrite=True)
        ticDatNone = tdf.xls.create_tic_dat(filePath, freeze_it=True)
        # THIS IS A FLAW - but a minor one. Nones are hard to represent; they turn into the empty string here.
        # Not sure how to handle this yet, so documenting it for now.
        self.assertFalse(tdf._same_data(ticDat, ticDatNone))
        self.assertTrue(ticDatNone.a["theboger"]["aData2"] == "")

        writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1,20,30,12)])
        rowCount = tdf.xls.get_duplicates(filePath)
        self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==3)
        self.assertTrue(set(rowCount["b"]) == {(1,20,30)} and rowCount["b"][1,20,30]==2)
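The generator-table assertions above (callable(ticDat5.a) and so on) reflect how set_generator_tables behaves: a registered table has no primary key fields and is exposed as a callable that yields its rows rather than as a dict. Below is a minimal sketch with a hypothetical keyless schema; it assumes ticdat is installed.

from ticdat import TicDatFactory

tdf = TicDatFactory(a=[[], ["f1", "f2"]])  # keyless table: rows are a plain collection
tdf.set_generator_tables(["a"])
dat = tdf.TicDat(a=[(1, 2), (10, 20)])

assert callable(dat.a)
for row in dat.a():  # calling the table yields its rows one at a time
    print(row)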