def create_from_full_schema(full_schema): """ create a PanDatFactory complete with default values, data types, and foreign keys :param full_schema: a dictionary consistent with the data returned by a call to schema() with include_ancillary_info = True :return: a PanDatFactory reflecting the tables, fields, default values, data types, and foreign keys consistent with the full_schema argument """ verify(dictish(full_schema) and set(full_schema) == {"tables_fields", "foreign_keys", "default_values", "data_types"}, "full_schema should be the result of calling schema(True) for some PanDatFactory") fks = full_schema["foreign_keys"] verify( (not fks) or (lupish(fks) and all(lupish(_) and len(_) >= 3 for _ in fks)), "foreign_keys entry poorly formed") dts = full_schema["data_types"] verify( (not dts) or (dictish(dts) and all(map(dictish, dts.values())) and all(all(map(lupish, _.values())) for _ in dts.values())), "data_types entry poorly formatted") dvs = full_schema["default_values"] verify( (not dvs) or (dictish(dvs) and all(map(dictish, dvs.values()))), "default_values entry poorly formatted") rtn = PanDatFactory(**full_schema["tables_fields"]) for fk in (fks or []): rtn.add_foreign_key(*fk[:3]) for t,fds in (dts or {}).items(): for f,dt in fds.items(): rtn.set_data_type(t, f, *dt) for t,fdvs in (dvs or {}).items(): for f, dv in fdvs.items(): rtn.set_default_value(t,f,dv) return rtn
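# Usage sketch for create_from_full_schema (hedged: the "roster"/"grades" schema below is
# illustrative only; the full_schema dict is filtered down to exactly the four keys the
# routine above verifies, so extra ancillary keys from newer ticdat versions won't trip it).
from ticdat import PanDatFactory

original = PanDatFactory(roster=[["Name"], ["Grade"]], grades=[["Grade"], ["Points"]])
original.set_data_type("grades", "Points", min=0, max=100, inclusive_max=True)
original.set_default_value("roster", "Grade", "B")
original.add_foreign_key("roster", "grades", ["Grade", "Grade"])
full_schema = {k: v for k, v in original.schema(include_ancillary_info=True).items()
               if k in {"tables_fields", "foreign_keys", "default_values", "data_types"}}
rebuilt = PanDatFactory.create_from_full_schema(full_schema)
assert set(rebuilt.all_tables) == set(original.all_tables)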
def write_data(self, tic_dat, engine, schema, dsn=None, pre_existing_rows=None):
    """
    write the ticDat data to a PostgreSQL database
    :param tic_dat: the data object to write
    :param engine: a sqlalchemy database engine with drivername 'postgresql'
    :param schema: the postgres schema to write to (call self.write_schema explicitly as needed)
    :param dsn: optional - if truthy, a dict that can be unpacked as arguments to psycopg2.connect.
                Will speed up bulk writing compared to engine.execute.
                If truthy and not a dict, it will be passed directly to psycopg2.connect as the sole argument.
    :param pre_existing_rows: if provided, a dict mapping table name to either "delete" or "append".
                              The default behavior is "delete".
    :return:
    """
    verify(sa, "sqlalchemy needs to be installed to use this subroutine")
    verify(engine.name == 'postgresql', "a sqlalchemy engine with drivername='postgresql' is required")
    verify(not dsn or psycopg2, "need psycopg2 to use the faster dsn write option")
    self._check_good_pgtd_compatible_table_field_names()
    msg = []
    if not self.tdf.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
    verify(not self.tdf.generic_tables, "TicDat for postgres does not yet support generic tables")
    self.check_tables_fields(engine, schema, error_on_missing_table=True)  # call self.write_schema as needed
    self._handle_prexisting_rows(engine, schema, pre_existing_rows or {})
    if dsn:
        connect_kwargs = dsn if dictish(dsn) else {}
        connect_args = [dsn] if not dictish(dsn) else []
        with psycopg2.connect(*connect_args, **connect_kwargs) as db:
            with db.cursor() as cursor:
                for k, v in self._get_data(tic_dat, schema, dump_format="dict").items():
                    psycopg2.extras.execute_values(cursor, k, v)
    else:
        all_dat = self._get_data(tic_dat, schema)
        if len(all_dat) > 1000:
            print("***pgtd.py not using most efficient data writing technique**")
        for sql_str, data in all_dat:
            engine.execute(sql_str, data)
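# Usage sketch for the PostgreSQL writer above (hedged: assumes ticdat exposes this writer as
# TicDatFactory().pgsql and that a reachable database exists; the connection string, schema
# name, and table are placeholders, not real credentials or a real deployment).
from ticdat import TicDatFactory
import sqlalchemy as sa

tdf = TicDatFactory(parameters=[["name"], ["value"]])
dat = tdf.TicDat(parameters=[["Max Minutes", 30]])
engine = sa.create_engine("postgresql://user:password@localhost/mydb")  # placeholder DSN
tdf.pgsql.write_schema(engine, "my_schema", include_ancillary_info=False)
tdf.pgsql.write_data(dat, engine, "my_schema", pre_existing_rows={"parameters": "delete"})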
def _assertSame(self, t1, t2, goodTicDatTable): if utils.dictish(t1) or utils.dictish(t2) : _ass = lambda _t1, _t2 : assertTicDatTablesSame(_t1, _t2, _goodTicDatTable= goodTicDatTable, **({} if DEBUG() else {"_assertTrue":self.assertTrue, "_assertFalse":self.assertFalse})) _ass(t1, t2) _ass(t2, t1) else : setify = lambda t : set(t) if len(t) and not hasattr(t[0], "values") else {r.values() for r in t} self.assertTrue(setify(t1) == setify(t2))
def convert_to_dicts_that_can_be_turned_into_DataFrames(tdf, dat, field_renamings=None):
    '''
    utility routine to help de-ticdat-ify small examples, so that they can then be passed to the
    amplpy team in a more easily understood notebook example with hard coded data

    the inner dicts returned below can each be passed as an argument to pandas.DataFrame, and from
    there the `set_ampl_data` logic can be broken out explicitly

    :param tdf: a TicDatFactory
    :param dat: a TicDat object created by tdf
    :param field_renamings: the same argument used by copy_to_ampl
    :return:
    '''
    assert field_renamings is None or (utils.dictish(field_renamings) and
           all(utils.containerish(k) and len(k) == 2 and k[0] in tdf.all_tables and
               k[1] in tdf.primary_key_fields[k[0]] + tdf.data_fields[k[0]] and
               utils.stringish(v) and v not in tdf.primary_key_fields[k[0]] + tdf.data_fields[k[0]]
               for k, v in field_renamings.items())), "invalid field_renamings argument"
    dat = tdf.copy_to_pandas(dat, drop_pk_columns=False)
    def do_renames(t, df):
        for f in tdf.primary_key_fields[t] + tdf.data_fields[t]:
            if (t, f) in (field_renamings or {}):
                df[field_renamings[t, f]] = df[f]
                df.drop(f, axis=1, inplace=True)
        return df
    rtn = {t: do_renames(t, getattr(dat, t).reset_index(drop=True)).to_dict() for t in tdf.all_tables}
    return rtn
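# Usage sketch for convert_to_dicts_that_can_be_turned_into_DataFrames (hedged: the
# TicDatFactory schema and the field_renamings below are made up for illustration).
import pandas as pd
from ticdat import TicDatFactory

tdf = TicDatFactory(foods=[["Name"], ["Cost"]])
dat = tdf.TicDat(foods=[["pizza", 2.5], ["salad", 3.0]])
dicts = convert_to_dicts_that_can_be_turned_into_DataFrames(
    tdf, dat, field_renamings={("foods", "Cost"): "Cost Per Serving"})
foods_df = pd.DataFrame(dicts["foods"])  # ready to paste into a hard-coded notebook example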
def _xls_write(self, tic_dat, file_path, tbl_name_mapping): verify(xlwt, "Can't write .xls files because xlwt package isn't installed.") tdf = self.tic_dat_factory book = xlwt.Workbook() for t in sorted(sorted(tdf.all_tables), key=lambda x: len(tdf.primary_key_fields.get(x, ()))): sheet = book.add_sheet(tbl_name_mapping[t][:_longest_sheet]) for i, f in enumerate( tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())): sheet.write(0, i, f) _t = getattr(tic_dat, t) if utils.dictish(_t): for row_ind, (p_key, data) in enumerate(_t.items()): for field_ind, cell in enumerate( (p_key if containerish(p_key) else (p_key, )) + tuple(data[_f] for _f in tdf.data_fields.get(t, ()))): sheet.write(row_ind + 1, field_ind, cell) else: for row_ind, data in enumerate( _t if containerish(_t) else _t()): for field_ind, cell in enumerate( tuple(data[_f] for _f in tdf.data_fields[t])): sheet.write(row_ind + 1, field_ind, cell) if os.path.exists(file_path): os.remove(file_path) book.save(file_path)
def _create_tic_dat_dict(self, xls_file_path, row_offsets, headers_present): verify( utils.dictish(row_offsets) and set(row_offsets).issubset(self.tic_dat_factory.all_tables) and all( utils.numericish(x) and (x >= 0) for x in row_offsets.values()), "row_offsets needs to map from table names to non negative row offset" ) row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables}, **row_offsets) tdf = self.tic_dat_factory rtn = {} sheets, field_indicies = self._get_sheets_and_fields( xls_file_path, set(tdf.all_tables).difference(tdf.generator_tables), row_offsets, headers_present, print_missing_tables=True) ho = 1 if headers_present else 0 for tbl, sheet in sheets.items(): fields = tdf.primary_key_fields.get(tbl, ()) + tdf.data_fields.get( tbl, ()) assert fields or tbl in self.tic_dat_factory.generic_tables indicies = field_indicies[tbl] table_len = min( len(sheet.col_values(indicies[field])) for field in (fields or indicies)) if tdf.primary_key_fields.get(tbl, ()): tableObj = { self._sub_tuple(tbl, tdf.primary_key_fields[tbl], indicies)(x): self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies)(x) for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[tbl] + ho:]) } elif tbl in tdf.generic_tables: tableObj = [{ f: x[i] for f, i in field_indicies[tbl].items() } for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[tbl] + ho:])] else: tableObj = [ self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies)(x) for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[tbl] + ho:]) ] rtn[tbl] = tableObj for tbl in tdf.generator_tables: rtn[tbl] = self._create_generator_obj(xls_file_path, tbl, row_offsets[tbl], headers_present) return rtn
def _apply_space_case_mapping(tdf, ticdat, mapping): """ :param tdf: a TicDatFactory :param ticdat: a ticdat for the tdf :param mapping: the mapping returned by an earlier call to _try_create_space_case_mapping. **Should be the value in the {"mapping":mapping} dict, if such a dict was returned, and not the {"mapping":mapping} dict itself. :return: """ assert tdf.good_tic_dat_object( ticdat), "ticdat not a good object for the tdf" assert tu.dictish(mapping) def apply_mapping(k): if containerish(k): return tuple(list(map(apply_mapping, k))) return mapping.get(k, k) rtn = tdf.copy_tic_dat(ticdat) for t in tdf.all_tables: if tdf.primary_key_fields.get(t): for k, v in getattr(ticdat, t).items(): del getattr(rtn, t)[k] getattr(rtn, t)[apply_mapping(k)] = v return rtn
def writeData(insert_spaces): import xlwt book = xlwt.Workbook() for t in tdf.all_tables: sheet = book.add_sheet( t.replace("_", " " if insert_spaces else "_")) for i, f in enumerate( tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())): sheet.write(0, i, f) _t = getattr(ticDat, t) containerish = utils.containerish if utils.dictish(_t): for row_ind, (p_key, data) in enumerate(_t.items()): for field_ind, cell in enumerate( (p_key if containerish(p_key) else (p_key, )) + tuple(data[_f] for _f in tdf.data_fields.get(t, ()))): sheet.write(row_ind + 1, field_ind, cell) else: for row_ind, data in enumerate( _t if containerish(_t) else _t()): for field_ind, cell in enumerate( tuple(data[_f] for _f in tdf.data_fields[t])): sheet.write(row_ind + 1, field_ind, cell) if os.path.exists(filePath): os.remove(filePath) book.save(filePath)
def _create_tic_dat_dict(self, jdict): tdf = self.tic_dat_factory rtn = {} table_keys = defaultdict(list) for t in tdf.all_tables: for t2 in jdict: if stringish(t2) and t.lower() == t2.replace(" ", "_").lower(): table_keys[t].append(t2) if len(table_keys[t]) >= 1: verify( len(table_keys[t]) < 2, "Found duplicate matching keys for table %s" % t) rtn[t] = jdict[table_keys[t][0]] orig_rtn, rtn = rtn, {} for t, rows in orig_rtn.items(): all_fields = tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get( t, ()) rtn[t] = [] for row in rows: if dictish(row): rtn[t].append({ f: tdf._general_read_cell(t, f, x) for f, x in row.items() }) else: rtn[t].append([ tdf._general_read_cell(t, f, x) for f, x in zip(all_fields, row) ]) return rtn
def _create_jdict(self, path_or_buf):
    if stringish(path_or_buf) and os.path.exists(path_or_buf):
        reasonable_string = path_or_buf
        verify(os.path.isfile(path_or_buf), "%s is not a valid file path." % path_or_buf)
        try:
            with open(path_or_buf, "r") as fp:
                jdict = json.load(fp)
        except Exception as e:
            raise TicDatError("Unable to interpret %s as json file : %s" % (path_or_buf, e))
    else:
        verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
        reasonable_string = path_or_buf[:10]
        try:
            jdict = json.loads(path_or_buf)
        except Exception as e:
            raise TicDatError("Unable to interpret %s as json string : %s" % (reasonable_string, e))
    verify(dictish(jdict), "%s failed to load a dictionary" % reasonable_string)
    verify(all(map(stringish, jdict)),
           "The dictionary loaded from %s isn't indexed by strings" % reasonable_string)
    verify(all(map(containerish, jdict.values())),
           "The dictionary loaded from %s doesn't have containers as values" % reasonable_string)
    return jdict
def _create_tic_dat(self, mdbFilePath): tdf = self.tic_dat_factory table_names = self._check_tables_fields(mdbFilePath, tdf.all_tables) rtn = {} with _connect(_connection_str(mdbFilePath)) as con: for table in set(tdf.all_tables).difference(tdf.generator_tables): fields = tdf.primary_key_fields.get(table, ()) + tdf.data_fields.get( table, ()) rtn[table] = {} if tdf.primary_key_fields.get(table, ()) else [] with con.cursor() as cur: cur.execute( "Select %s from [%s]" % (", ".join(_brackets(fields)), table_names[table])) for row in cur.fetchall(): pk = row[:len(tdf.primary_key_fields.get(table, ()))] data = list( map( _read_data, row[len(tdf.primary_key_fields.get(table, ( ))):])) if dictish(rtn[table]): rtn[table][pk[0] if len(pk) == 1 else tuple(pk)] = data else: rtn[table].append(data) for table in tdf.generator_tables: rtn[table] = self._create_gen_obj(mdbFilePath, table, table_names[table]) return rtn
def _get_data(self, tic_dat, as_sql): rtn = [] for t in self.tic_dat_factory.all_tables: _t = getattr(tic_dat, t) if dictish(_t): primarykeys = tuple(self.tic_dat_factory.primary_key_fields[t]) for pkrow, sqldatarow in _t.items(): _items = list(sqldatarow.items()) fields = primarykeys + tuple(x[0] for x in _items) datarow = ((pkrow, ) if len(primarykeys) == 1 else pkrow) + tuple(x[1] for x in _items) assert len(datarow) == len(fields) datarow = tuple( self._write_data_cell(t, f, x) for f, x in zip(fields, datarow)) str = "INSERT INTO [%s] (%s) VALUES (%s)" % (t, ",".join( _brackets(fields)), ",".join("%s" if as_sql else "?" for _ in fields)) if as_sql: rtn.append((str % tuple(map(_insert_format, datarow))) + ";") else: rtn.append((str, datarow)) else: for sqldatarow in (_t if containerish(_t) else _t()): k, v = zip(*sqldatarow.items()) str = "INSERT INTO [%s] (%s) VALUES (%s)"%\ (t, ",".join(_brackets(k)), ",".join( ["%s" if as_sql else "?"]*len(sqldatarow))) if as_sql: rtn.append((str % tuple(map(_insert_format, v))) + ";") else: rtn.append((str, v)) return tuple(rtn)
def _create_tic_dat_from_con(self, con, table_names): tdf = self.tic_dat_factory rtn = {} for table in set(tdf.all_tables).difference(tdf.generator_tables): fields = tdf.primary_key_fields.get(table, ()) + tdf.data_fields.get( table, ()) if not fields: assert table in tdf.generic_tables fields = tuple( x[1] for x in con.execute("PRAGMA table_info(%s)" % table)) rtn[table] = {} if tdf.primary_key_fields.get(table, ()) else [] for row in con.execute( "Select %s from [%s]" % (", ".join(_brackets(fields)), table_names[table])): if table in tdf.generic_tables: rtn[table].append( {f: _read_data_format(d) for f, d in zip(fields, row)}) else: pk = row[:len(tdf.primary_key_fields.get(table, ()))] data = list( map(_read_data_format, row[len(tdf.primary_key_fields.get(table, ())):])) if dictish(rtn[table]): rtn[table][pk[0] if len(pk) == 1 else tuple(pk)] = data else: rtn[table].append(data) return rtn
def _xlsx_write(self, tic_dat, file_path, tbl_name_mapping): verify(xlsx, "Can't write .xlsx files because xlsxwriter package isn't installed.") tdf = self.tic_dat_factory if os.path.exists(file_path): os.remove(file_path) book = xlsx.Workbook(file_path) def clean_for_write(t, f, x): if self.tic_dat_factory.infinity_io_flag != "N/A" or \ (t == "parameters" and self.tic_dat_factory.parameters): return self.tic_dat_factory._infinity_flag_write_cell(t, f, x) if x in [float("inf"), -float("inf")] or isinstance(x, datetime.datetime): return str(x) return x for t in sorted(sorted(tdf.all_tables), key=lambda x: len(tdf.primary_key_fields.get(x, ()))) : all_flds = self.tic_dat_factory.primary_key_fields.get(t, ()) + self.tic_dat_factory.data_fields.get(t, ()) sheet = book.add_worksheet(tbl_name_mapping[t][:_longest_sheet]) for i,f in enumerate(tdf.primary_key_fields.get(t,()) + tdf.data_fields.get(t, ())) : sheet.write(0, i, f) _t = getattr(tic_dat, t) if utils.dictish(_t) : for row_ind, (p_key, data) in enumerate(_t.items()) : for field_ind, cell in enumerate( (p_key if containerish(p_key) else (p_key,)) + tuple(data[_f] for _f in tdf.data_fields.get(t, ()))): sheet.write(row_ind+1, field_ind, clean_for_write(t, all_flds[field_ind], cell)) else : for row_ind, data in enumerate(_t if containerish(_t) else _t()) : for field_ind, cell in enumerate(tuple(data[_f] for _f in tdf.data_fields[t])) : sheet.write(row_ind+1, field_ind, clean_for_write(t, all_flds[field_ind], cell)) book.close()
def write_directory(self, tic_dat, dir_path, allow_overwrite=False, dialect='excel', write_header=True): """ write the ticDat data to a collection of csv files :param tic_dat: the data object :param dir_path: the directory in which to write the csv files :param allow_overwrite: boolean - are we allowed to overwrite existing files? :param dialect: the csv dialect. Consult csv documentation for details. :param write_header: Boolean. Should the header information be written as the first row? :return: """ verify(csv, "csv needs to be installed to use this subroutine") verify(dialect in csv.list_dialects(), "Invalid dialect %s" % dialect) verify(not os.path.isfile(dir_path), "A file is not a valid directory path") if self.tic_dat_factory.generic_tables: dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory) return tdf.csv.write_directory(dat, dir_path, allow_overwrite, dialect, write_header) tdf = self.tic_dat_factory msg = [] if not self.tic_dat_factory.good_tic_dat_object( tic_dat, lambda m: msg.append(m)): raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg)) if not allow_overwrite: for t in tdf.all_tables: f = os.path.join(dir_path, t + ".csv") verify(not os.path.exists(f), "The %s path exists and overwrite is not allowed" % f) if not os.path.isdir(dir_path): os.mkdir(dir_path) for t in tdf.all_tables: f = os.path.join(dir_path, t + ".csv") with open(f, 'w') as csvfile: writer = csv.DictWriter( csvfile, dialect=dialect, fieldnames=tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())) writer.writeheader() if write_header else None _t = getattr(tic_dat, t) if dictish(_t): for p_key, data_row in _t.items(): primaryKeyDict = { f: v for f, v in zip( tdf.primary_key_fields[t], p_key if containerish(p_key) else (p_key, )) } writer.writerow(dict(data_row, **primaryKeyDict)) else: for data_row in (_t if containerish(_t) else _t()): writer.writerow(dict(data_row))
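# Usage sketch for the csv write_directory routine above (hedged: assumes the standard
# ticdat csv facade, i.e. TicDatFactory().csv; the directory name is illustrative).
from ticdat import TicDatFactory

tdf = TicDatFactory(foods=[["Name"], ["Cost"]])
dat = tdf.TicDat(foods=[["pizza", 2.5], ["salad", 3.0]])
tdf.csv.write_directory(dat, "diet_csv_files", allow_overwrite=True, write_header=True)
round_tripped = tdf.csv.create_tic_dat("diet_csv_files")
assert tdf._same_data(dat, round_tripped)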
def set_ampl_data(self, ampl_dat, ampl, table_to_set_name = None): """ performs bulk setData on the AMPL-esque first argument. :param ampl_dat: an AmplTicDat object created by calling copy_to_ampl :param ampl: an amplpy.AMPL object :param table_to_set_name: a mapping of table_name to ampl set name :return: """ verify(all(a.startswith("_") or a in self.all_tables for a in dir(ampl_dat)), "bad ampl_dat argument") verify(hasattr(ampl, "setData"), "bad ampl argument") table_to_set_name = table_to_set_name or {} verify(dictish(table_to_set_name) and all(hasattr(ampl_dat, k) and utils.stringish(v) for k,v in table_to_set_name.items()), "bad table_to_set_name argument") for t in set(self.all_tables).intersection(dir(ampl_dat)): try: ampl.setData(getattr(ampl_dat, t), *([table_to_set_name[t]] if t in table_to_set_name else [])) except: raise utils.TicDatError(t + " cannot be passed as an argument to AMPL.setData()")
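# Usage sketch for set_ampl_data (hedged: requires amplpy and a local AMPL installation;
# the model snippet and the table-to-set mapping are illustrative assumptions).
import amplpy
from ticdat import TicDatFactory

tdf = TicDatFactory(foods=[["Name"], ["Cost"]])
dat = tdf.TicDat(foods=[["pizza", 2.5]])
ampl = amplpy.AMPL()
ampl.eval("set FOODS; param Cost{FOODS} >= 0;")
ampl_dat = tdf.copy_to_ampl(dat)
tdf.set_ampl_data(ampl_dat, ampl, {"foods": "FOODS"})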
def __init__(self, **init_tables): superself._trigger_has_been_used() for t in init_tables : verify(t in superself.all_tables, "Unexpected table name %s"%t) tbl = safe_apply(DataFrame)(init_tables[t]) if tbl is None and dictish(init_tables[t]) and all(map(stringish, init_tables[t])): tbl = safe_apply(DataFrame)(**init_tables[t]) verify(isinstance(tbl, DataFrame), "Failed to provide a valid DataFrame or DataFrame construction argument for %s"%t) setattr(self, t, tbl.copy()) df = getattr(self, t) if list(df.columns) == list(range(len(df.columns))) and \ len(df.columns) >= len(superself._all_fields(t)): df.rename(columns={f1:f2 for f1, f2 in zip(df.columns, superself._all_fields(t))}, inplace=True) for t in set(superself.all_tables).difference(init_tables): setattr(self, t, DataFrame({f:[] for f in utils.all_fields(superself, t)})) missing_fields = {(t, f) for t in superself.all_tables for f in superself._all_fields(t) if f not in getattr(self, t).columns} verify(not missing_fields, "The following are (table, field) pairs missing from the data.\n%s"%missing_fields) for t in superself.all_tables: af = list(superself._all_fields(t)) df = getattr(self, t) if list(df.columns)[:len(af)] != af: extra_cols = [_ for _ in list(df.columns) if _ not in af] setattr(self, t, df[af + extra_cols]) assert list(getattr(self, t)) == af + extra_cols
def write_schema(self, engine, schema, forced_field_types=None, include_ancillary_info=True):
    """
    :param engine: typically a sqlalchemy database engine with drivername 'postgresql'
                   (really just needs an .execute)
    :param schema: a string naming the postgres schema to populate (will create if needed)
    :param forced_field_types: A dictionary mapping (table, field) to a field type.
                               Absent forcing, types are inferred from tic_dat_factory.data_types if possible,
                               and otherwise set via the assumption that PK fields are text and data fields are floats.
    :param include_ancillary_info: boolean. If False, no primary key or foreign key info will be written.
    :return:
    """
    self._check_good_pgtd_compatible_table_field_names()
    forced_field_types = forced_field_types or {}
    all_fields = lambda t: self.tdf.primary_key_fields.get(t, ()) + self.tdf.data_fields.get(t, ())
    good_forced_field_type_entry = lambda k, v: isinstance(k, tuple) and len(k) == 2 \
        and k[1] in all_fields(k[0]) and v in ["text", "integer", "float", "bool", "boolean", "timestamp"]
    verify(dictish(forced_field_types) and
           all(good_forced_field_type_entry(k, v) for k, v in forced_field_types.items()),
           "bad forced_field_types argument")
    if not include_ancillary_info:
        from ticdat import TicDatFactory
        tdf = TicDatFactory(**{t: [[], pks + dfs] for t, (pks, dfs) in self.tdf.schema().items()})
        for t, dts in self.tdf.data_types.items():
            for f, dt in dts.items():
                tdf.set_data_type(t, f, *dt)
        forced_field_types_ = {(t, f): "text" for t, (pks, dfs) in self.tdf.schema().items()
                               for f in pks if f not in tdf.data_types.get(t, {})}
        forced_field_types_.update(forced_field_types)
        return PostgresTicFactory(tdf).write_schema(engine, schema, forced_field_types_)
    verify(not getattr(self.tdf, "generic_tables", None),
           "TicDat for postgres does not yet support generic tables")
    if schema not in [row[0] for row in
                      engine.execute("select schema_name from information_schema.schemata")]:
        engine.execute(sa.schema.CreateSchema(schema))
    for sql_str in self._get_schema_sql(self.tdf.all_tables, schema, forced_field_types):
        engine.execute(sql_str)
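# Usage sketch for write_schema with forced column types (hedged: the connection string,
# schema name, and field types are placeholders; assumes the pgsql facade used above).
from ticdat import TicDatFactory
import sqlalchemy as sa

tdf = TicDatFactory(jobs=[["name"], ["due_date", "priority"]])
engine = sa.create_engine("postgresql://user:password@localhost/mydb")  # placeholder DSN
tdf.pgsql.write_schema(engine, "scheduling",
                       forced_field_types={("jobs", "due_date"): "timestamp",
                                           ("jobs", "priority"): "integer"})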
def find_duplicates(self, xls_file_path, row_offsets={}, headers_present=True): """ Find the row counts for duplicated rows. :param xls_file_path: An Excel file containing sheets whose names match the table names in the schema (non primary key tables ignored). :param row_offsets: (optional) A mapping from table names to initial number of rows to skip (non primary key tables ignored) :param headers_present: Boolean. Does the first row of data contain the column headers? caveats: Missing sheets resolve to an empty table, but missing primary fields on matching sheets throw an Exception. Sheet names are considered case insensitive. :return: A dictionary whose keys are the table names for the primary key tables. Each value of the return dictionary is itself a dictionary. The inner dictionary is keyed by the primary key values encountered in the table, and the value is the count of records in the Excel sheet with this primary key. Row counts smaller than 2 are pruned off, as they aren't duplicates """ self._verify_differentiable_sheet_names() verify(xlrd, "xlrd needs to be installed to use this subroutine") verify( utils.dictish(row_offsets) and set(row_offsets).issubset(self.tic_dat_factory.all_tables) and all( utils.numericish(x) and (x >= 0) for x in row_offsets.values()), "row_offsets needs to map from table names to non negative row offset" ) row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables}, **row_offsets) tdf = self.tic_dat_factory pk_tables = tuple(t for t, _ in tdf.primary_key_fields.items() if _) rtn = {t: defaultdict(int) for t in pk_tables} sheets, fieldIndicies = self._get_sheets_and_fields( xls_file_path, pk_tables, row_offsets, headers_present) ho = 1 if headers_present else 0 for table, sheet in sheets.items(): fields = tdf.primary_key_fields[table] + tdf.data_fields.get( table, ()) indicies = fieldIndicies[table] table_len = min( len(sheet.col_values(indicies[field])) for field in fields) for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[table] + ho:]): rtn[table][self._sub_tuple(table, tdf.primary_key_fields[table], indicies)(x)] += 1 for t in list(rtn.keys()): rtn[t] = {k: v for k, v in rtn[t].items() if v > 1} if not rtn[t]: del (rtn[t]) return rtn
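# Usage sketch for find_duplicates (hedged: assumes the xls facade on TicDatFactory and an
# existing workbook; "diet.xls" and the expected counts are illustrative).
from ticdat import TicDatFactory

tdf = TicDatFactory(foods=[["Name"], ["Cost"]])
dups = tdf.xls.find_duplicates("diet.xls")
# e.g. {"foods": {"pizza": 2}} would mean the pizza row appears twice on the foods sheet
for table, key_counts in dups.items():
    print(table, key_counts)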
def _good_inputset(self, inputset, message_writer=lambda x: x):
    if not (hasattr(inputset, "schema") and dictish(inputset.schema)):
        message_writer("Failed to find dictish schema attribute")
        return False
    if not (hasattr(inputset, "getTable") and callable(inputset.getTable)):
        message_writer("Failed to find callable getTable attribute")
        return False
    table_matchings = self._find_table_matchings(inputset)
    badly_matched = {t for t, v in table_matchings.items() if len(v) != 1}
    if badly_matched:
        message_writer("Following tables could not be uniquely resolved in inputset.schema\n%s" %
                       badly_matched)
        return False
    return True
def copy_to_ampl(self, pan_dat, field_renamings=None, excluded_tables=None):
    """
    copies the pan_dat object into a new pan_dat object populated with amplpy.DataFrame objects
    performs a deep copy

    :param pan_dat: a PanDat object
    :param field_renamings: dict or None. If fields are to be renamed in the copy, then
                            a mapping from (table_name, field_name) -> new_field_name.
                            If a data field is to be omitted, then new_field_name can be falsey.
                            table_name cannot refer to an excluded table (see below).
                            field_name doesn't have to refer to an element of self.data_fields[t],
                            but it does have to refer to a column in the pan_dat.table_name DataFrame.
    :param excluded_tables: If truthy, a list of tables to be excluded from the copy.
                            Tables without primary key fields are always excluded.
    :return: a deep copy of the pan_dat argument into amplpy.DataFrames
    """
    verify(amplpy, "amplpy needs to be installed in order to enable AMPL functionality")
    msg = []
    verify(self.good_pan_dat_object(pan_dat, msg.append),
           "pan_dat not a good object for this factory : %s" % "\n".join(msg))
    verify(not excluded_tables or (containerish(excluded_tables) and
                                   set(excluded_tables).issubset(self.all_tables)),
           "bad excluded_tables argument")
    copy_tables = {t for t in self.all_tables if self.primary_key_fields[t]}.\
                  difference(excluded_tables or [])
    field_renamings = field_renamings or {}
    verify(dictish(field_renamings), "invalid field_renamings argument")
    for k, v in field_renamings.items():
        verify(containerish(k) and len(k) == 2 and k[0] in copy_tables and
               k[1] in getattr(pan_dat, k[0]).columns and
               ((v and utils.stringish(v)) or (not bool(v) and k[1] not in self.primary_key_fields[k[0]])),
               "invalid field_renamings argument %s:%s" % (k, v))
    class AmplPanDat(object):
        def __repr__(self):
            return "td:" + tuple(copy_tables).__repr__()
    rtn = AmplPanDat()
    for t in copy_tables:
        rename = lambda f: field_renamings.get((t, f), f)
        df_ampl = amplpy.DataFrame(index=tuple(map(rename, self.primary_key_fields[t])))
        for f in self.primary_key_fields[t]:
            df_ampl.setColumn(rename(f), list(getattr(pan_dat, t)[f]))
        for f in {f for _t, f in field_renamings if _t == t}.union(self.data_fields[t]):
            if rename(f):
                df_ampl.addColumn(rename(f), list(getattr(pan_dat, t)[f]))
        setattr(rtn, t, df_ampl)
    return rtn
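# Usage sketch for the PanDatFactory copy_to_ampl above (hedged: requires amplpy; the schema,
# the renaming, and the dropped field are illustrative assumptions).
from ticdat import PanDatFactory

pdf = PanDatFactory(foods=[["Name"], ["Cost", "Notes"]])
pan_dat = pdf.PanDat(foods=[["pizza", 2.5, "frozen"], ["salad", 3.0, ""]])
ampl_dat = pdf.copy_to_ampl(pan_dat,
                            field_renamings={("foods", "Cost"): "cost",
                                             ("foods", "Notes"): ""})  # falsey -> omit Notes
# ampl_dat.foods is now an amplpy.DataFrame indexed by Name with a single "cost" column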
def _create_jdict(self, json_file_path):
    verify(os.path.isfile(json_file_path), "json_file_path is not a valid file path.")
    try:
        with open(json_file_path, "r") as fp:
            jdict = json.load(fp)
    except Exception as e:
        raise TicDatError("Unable to interpret %s as json file : %s" % (json_file_path, e))
    verify(dictish(jdict), "%s failed to load a dictionary" % json_file_path)
    verify(all(map(stringish, jdict)),
           "The dictionary loaded from %s isn't indexed by strings" % json_file_path)
    verify(all(map(containerish, jdict.values())),
           "The dictionary loaded from %s doesn't have containers as values" % json_file_path)
    return jdict
def create_pan_dat(self, path_or_buf, fill_missing_fields=False, orient='split', **kwargs):
    """
    Create a PanDat object from a JSON file or string

    :param path_or_buf: a valid JSON string or file-like
    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in with their default value.
                                Otherwise, missing fields throw an Exception.
    :param orient: Indication of expected JSON string format. See pandas.read_json for more details.
    :param kwargs: additional named arguments to pass to pandas.read_json
    :return: a PanDat object populated by the matching tables.

    caveats: Missing tables always throw an Exception.
             Table names are matched with case-space insensitivity, but spaces are respected
             for field names. (ticdat supports whitespace in field names but not table names).
             +- "inf", "-inf" strings will be converted to +-float("inf")
    """
    if stringish(path_or_buf) and os.path.exists(path_or_buf):
        verify(os.path.isfile(path_or_buf), "%s appears to be a directory and not a file." % path_or_buf)
        with open(path_or_buf, "r") as f:
            loaded_dict = json.load(f)
    else:
        verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
        loaded_dict = json.loads(path_or_buf)
    verify(dictish(loaded_dict), "the json.load result doesn't resolve to a dictionary")
    verify(all(map(dictish, loaded_dict.values())),
           "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries")
    tbl_names = self._get_table_names(loaded_dict)
    verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
    rtn = {t: pd.read_json(json.dumps(loaded_dict[f]), orient=orient, **kwargs) for t, f in tbl_names.items()}
    missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                      if f not in rtn[t].columns}
    if fill_missing_fields:
        for t, f in missing_fields:
            rtn[t][f] = self.pan_dat_factory.default_values[t][f]
    verify(fill_missing_fields or not missing_fields,
           "The following (table, field) pairs are missing fields.\n%s" % [(t, f) for t, f in missing_fields])
    for v in rtn.values():
        v.replace("inf", float("inf"), inplace=True)
        v.replace("-inf", -float("inf"), inplace=True)
    rtn = self.pan_dat_factory.PanDat(**rtn)
    msg = []
    assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
    return rtn
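# Usage sketch for the JSON create_pan_dat above (hedged: assumes this reader is exposed as
# PanDatFactory().json; the inline JSON string uses the pandas 'split' orient and the schema
# is illustrative).
from ticdat import PanDatFactory

pdf = PanDatFactory(foods=[["Name"], ["Cost"]])
json_str = ('{"foods": {"columns": ["Name", "Cost"], "index": [0, 1], '
            '"data": [["pizza", 2.5], ["salad", 3.0]]}}')
pan_dat = pdf.json.create_pan_dat(json_str)
print(pan_dat.foods)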
def _create_tic_dat_from_con(self, con, table_names): missing_tables = sorted( set(self.tic_dat_factory.all_tables).difference(table_names)) if missing_tables: print( "The following table names could not be found in the SQLite database.\n%s\n" % "\n".join(missing_tables)) tdf = self.tic_dat_factory rtn = {} for table in set(tdf.all_tables).difference( tdf.generator_tables).difference(missing_tables): fields = tdf.primary_key_fields.get(table, ()) + tdf.data_fields.get( table, ()) if not fields: assert table in tdf.generic_tables fields = tuple( x[1] for x in con.execute("PRAGMA table_info(%s)" % table)) rtn[table] = {} if tdf.primary_key_fields.get(table, ()) else [] for row in con.execute( "Select %s from [%s]" % (", ".join(_brackets(fields)), table_names[table])): if table in tdf.generic_tables: rtn[table].append({ f: self._read_data_cell(table, f, d) for f, d in zip(fields, row) }) else: pkfs = tdf.primary_key_fields.get(table, ()) pk = tuple( self._read_data_cell(table, f, x) for f, x in zip(pkfs, row[:len(pkfs)])) data = [ self._read_data_cell(table, f, x) for f, x in zip(fields[len(pkfs):], row[len(pkfs):]) ] if dictish(rtn[table]): rtn[table][pk[0] if len(pk) == 1 else tuple(pk)] = data else: rtn[table].append(data) return rtn
def _xlsx_write(self, tic_dat, file_path, tbl_name_mapping): verify( xlsx, "Can't write .xlsx files because xlsxwriter package isn't installed." ) tdf = self.tic_dat_factory if os.path.exists(file_path): os.remove(file_path) book = xlsx.Workbook(file_path) def clean_inf(x): if x == float("inf"): return "inf" if x == -float("inf"): return "-inf" return x for t in sorted(sorted(tdf.all_tables), key=lambda x: len(tdf.primary_key_fields.get(x, ()))): sheet = book.add_worksheet(tbl_name_mapping[t]) for i, f in enumerate( tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())): sheet.write(0, i, f) _t = getattr(tic_dat, t) if utils.dictish(_t): for row_ind, (p_key, data) in enumerate(_t.items()): for field_ind, cell in enumerate( (p_key if containerish(p_key) else (p_key, )) + tuple(data[_f] for _f in tdf.data_fields.get(t, ()))): sheet.write(row_ind + 1, field_ind, clean_inf(cell)) else: for row_ind, data in enumerate( _t if containerish(_t) else _t()): for field_ind, cell in enumerate( tuple(data[_f] for _f in tdf.data_fields[t])): sheet.write(row_ind + 1, field_ind, clean_inf(cell)) book.close()
def _create_tic_dat(self, mdbFilePath): tdf = self.tic_dat_factory table_names = self._check_tables_fields(mdbFilePath, tdf.all_tables) missing_tables = sorted( set(self.tic_dat_factory.all_tables).difference(table_names)) if missing_tables: print( "The following table names could not be found in the %s Access database.\n%s\n" % (mdbFilePath, "\n".join(missing_tables))) rtn = {} with _connect(_connection_str(mdbFilePath)) as con: for table in set(tdf.all_tables).difference( tdf.generator_tables).difference(missing_tables): fields = tdf.primary_key_fields.get(table, ()) + tdf.data_fields.get( table, ()) rtn[table] = {} if tdf.primary_key_fields.get(table, ()) else [] with con.cursor() as cur: cur.execute( "Select %s from [%s]" % (", ".join(_brackets(fields)), table_names[table])) for row_ in cur.fetchall(): row = [ tdf._general_read_cell(table, f, x) for f, x in zip(fields, row_) ] pk = row[:len(tdf.primary_key_fields.get(table, ()))] data = row[len(tdf.primary_key_fields.get(table, ())):] if dictish(rtn[table]): rtn[table][pk[0] if len(pk) == 1 else tuple(pk)] = data else: rtn[table].append(data) for table in set(tdf.generator_tables).difference(missing_tables): rtn[table] = self._create_gen_obj(mdbFilePath, table, table_names[table]) return rtn
def set_default_values(self, **tableDefaults): """ sets the default values for the fields :param tableDefaults: A dictionary of named arguments. Each argument name (i.e. each key) should be a table name Each value should itself be a dictionary mapping data field names to default values Ex: ```tdf.set_default_values(categories = {"minNutrition":0, "maxNutrition":float("inf")}, foods = {"cost":0}, nutritionQuantities = {"qty":0})``` :return: """ verify(not self._has_been_used, "The default values can't be changed after a PanDatFactory has been used.") for k,v in tableDefaults.items(): verify(k in self.all_tables, "Unrecognized table name %s"%k) verify(dictish(v) and set(v).issubset(self.data_fields[k] + self.primary_key_fields[k]), "Default values for %s should be a dictionary mapping field names to values" %k) verify(all(utils.acceptable_default(_v) for _v in v.values()), "some default values are unacceptable") self._default_values[k] = dict(self._default_values[k], **v)
def _deep_anonymize(x) : if not hasattr(x, "__contains__") or utils.stringish(x): return x if utils.dictish(x) : return {_deep_anonymize(k):_deep_anonymize(v) for k,v in x.items()} return list(map(_deep_anonymize,x))
def assertTicDatTablesSame(t1, t2, _goodTicDatTable, _assertTrue = assertTrue, _assertFalse = assertFalse) : _assertTrue(set(t1) == set(t2)) _assertTrue(_goodTicDatTable(t1) and _goodTicDatTable(t2)) if not dictish(t1) and not dictish(t2) : return if dictish(t1) != dictish(t2) and dictish(t2) : t1,t2 = t2,t1 if not dictish(t2) : _assertTrue(all(containerish(x) and len(x) == 0 for x in t1.values())) return for k1,v1 in t1.items() : v2 = t2[k1] if dictish(v1) != dictish(v2) and dictish(v2) : v2, v1 = v1, v2 if dictish(v1) and dictish(v2) : _assertTrue(set(v1) == set(v2)) for _k1 in v1 : _assertTrue(v1[_k1] == v2[_k1]) elif dictish(v1) and containerish(v2) : _assertTrue(sorted(v1.values()) == sorted(v2)) elif dictish(v1) : _assertTrue(len(v1) == 1 and v1.values()[0] == v2) else : if containerish(v1) != containerish(v2) and containerish(v2) : v2, v1 = v1, v2 if containerish(v1) and containerish(v2) : _assertTrue(len(v1) == len(v2)) _assertTrue(all(v1[x] == v2[x] for x in range(len(v1)))) elif containerish(v1) : _assertTrue(len(v1) == 1 and v1[0] == v2) else : _assertTrue(v1 == v2)
def testSilly(self): if not self.can_run: return tdf = TicDatFactory(**sillyMeSchema()) ticDat = tdf.TicDat(**sillyMeData()) schema2 = sillyMeSchema() schema2["b"][0] = ("bField2", "bField1", "bField3") schema3 = sillyMeSchema() schema3["a"][1] = ("aData2", "aData3", "aData1") schema4 = sillyMeSchema() schema4["a"][1] = ("aData1", "aData3") schema5 = sillyMeSchema() _tuple = lambda x: tuple(x) if utils.containerish(x) else (x, ) for t in ("a", "b"): schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0]) schema5["a"][0], schema5["b"][0] = (), [] schema6 = sillyMeSchema() schema6["d"] = [["dField"], ()] tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in (schema2, schema3, schema4, schema5, schema6)) tdf5.set_generator_tables(("a", "c")) filePath = os.path.join(_scratchDir, "silly.xls") tdf.xls.write_file(ticDat, filePath) ticDat2 = tdf2.xls.create_tic_dat(filePath) self.assertFalse(tdf._same_data(ticDat, ticDat2)) ticDat3 = tdf3.xls.create_tic_dat(filePath) self.assertTrue(tdf._same_data(ticDat, ticDat3)) ticDat4 = tdf4.xls.create_tic_dat(filePath) for t in ["a", "b"]: for k, v in getattr(ticDat4, t).items(): for _k, _v in v.items(): self.assertTrue(getattr(ticDat, t)[k][_k] == _v) if set(v) == set(getattr(ticDat, t)[k]): self.assertTrue(t == "b") else: self.assertTrue(t == "a") ticDat5 = tdf5.xls.create_tic_dat(filePath, treat_inf_as_infinity=False) self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5)) self.assertTrue( callable(ticDat5.a) and callable(ticDat5.c) and not callable(ticDat5.b)) ticDat6 = tdf6.xls.create_tic_dat(filePath) self.assertTrue(tdf._same_data(ticDat, ticDat6)) self.assertTrue( firesException(lambda: tdf6._same_data(ticDat, ticDat6))) self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d)) def writeData(data, write_header="same"): assert filePath.endswith(".xls") assert not write_header or write_header in ("lower", "same", "duped") import xlwt book = xlwt.Workbook() for t in tdf.all_tables: sheet = book.add_sheet(t) if write_header: all_fields = tdf.primary_key_fields.get( t, ()) + tdf.data_fields.get(t, ()) for i, f in enumerate( (2 if write_header == "duped" else 1) * all_fields): sheet.write( 0, i, f.lower() if write_header == "lower" or i >= len(all_fields) else f) for rowInd, row in enumerate(data): for fieldInd, cellValue in enumerate( (2 if write_header == "duped" else 1) * row): sheet.write(rowInd + (1 if write_header else 0), fieldInd, cellValue) if os.path.exists(filePath): os.remove(filePath) book.save(filePath) if write_header in [ "lower", "same" ]: # will use pandas to generate the xlsx file version file_path_x = filePath + "x" if os.path.exists(file_path_x): os.remove(file_path_x) writer = utils.pd.ExcelWriter(file_path_x) for t, (pks, dfs) in tdf.schema().items(): fields = pks + dfs if write_header == "lower": fields = [_.lower() for _ in fields] d = {f: [] for f in fields} for row in data: for f, c in zip(fields, row): d[f].append(c) utils.pd.DataFrame(d).to_excel(writer, t, index=False) writer.save() writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)], write_header="duped") self.assertTrue( self.firesException( lambda: tdf.xls.create_tic_dat(filePath, freeze_it=True))) writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)]) ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True) self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3) self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40) for f in [filePath, filePath + "x"]: rowCount = tdf.xls.find_duplicates(f) self.assertTrue( 
set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1] == 2) writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)], write_header="lower") ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True) self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3) self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40) for f in [filePath, filePath + "x"]: rowCount = tdf.xls.find_duplicates(f) self.assertTrue( set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1] == 2) writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)], write_header=False) self.assertTrue( self.firesException( lambda: tdf.xls.create_tic_dat(filePath, freeze_it=True))) ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True, headers_present=False) self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3) self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40) rowCount = tdf.xls.find_duplicates(filePath, headers_present=False) self.assertTrue( set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1] == 2) ticDat.a["theboger"] = (1, None, 12) tdf.xls.write_file(ticDat, filePath, allow_overwrite=True) ticDatNone = tdf.xls.create_tic_dat(filePath, freeze_it=True) # THIS IS A FLAW - but a minor one. None's are hard to represent. It is turning into the empty string here. # not sure how to handle this, but documenting for now. self.assertFalse(tdf._same_data(ticDat, ticDatNone)) self.assertTrue(ticDatNone.a["theboger"]["aData2"] == "") # the workaround for this flaw is to set the data type to be nullabe but not allow the empty string tdfwa = TicDatFactory(**sillyMeSchema()) tdfwa.set_data_type("a", "aData2", nullable=True) ticDatNone = tdfwa.xls.create_tic_dat(filePath, freeze_it=True) self.assertTrue(tdf._same_data(ticDat, ticDatNone)) self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None) # checking the same thing with .xlsx - using openpyxl, None is indeed recovered even without tdfwa munging! tdf.xls.write_file(ticDat, filePath + "x", allow_overwrite=True) ticDatNone = tdf.xls.create_tic_dat(filePath + "x", freeze_it=True) self.assertTrue(tdf._same_data(ticDat, ticDatNone)) self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None) ticDatNone = tdfwa.xls.create_tic_dat(filePath + "x", freeze_it=True) self.assertTrue(tdf._same_data(ticDat, ticDatNone)) self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None) writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 20, 30, 12)]) for f in [filePath, filePath + "x"]: rowCount = tdf.xls.find_duplicates(f) self.assertTrue( set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1} and rowCount["a"][1] == 3) self.assertTrue( set(rowCount["b"]) == {(1, 20, 30)} and rowCount["b"][1, 20, 30] == 2)
def assertTicDatTablesSame(t1, t2, _goodTicDatTable, _assertTrue = assertTrue, _assertFalse = assertFalse) : _assertTrue(set(t1) == set(t2)) _assertTrue(_goodTicDatTable(t1) and _goodTicDatTable(t2)) if not dictish(t1) and not dictish(t2) : return if dictish(t1) != dictish(t2) and dictish(t2) : t1,t2 = t2,t1 if not dictish(t2) : _assertTrue(all(containerish(x) and len(x) == 0 for x in t1.values())) return for k1,v1 in t1.items() : v2 = t2[k1] if dictish(v1) != dictish(v2) and dictish(v2) : v2, v1 = v1, v2 if dictish(v1) and dictish(v2) : _assertTrue(set(v1) == set(v2)) for _k1 in v1 : _assertTrue(v1[_k1] == v2[_k1]) elif dictish(v1) and containerish(v2) : _assertTrue(sorted(map(str, v1.values())) == sorted(map(str, v2))) elif dictish(v1) : _assertTrue(len(v1) == 1 and v1.values()[0] == v2) else : if containerish(v1) != containerish(v2) and containerish(v2) : v2, v1 = v1, v2 if containerish(v1) and containerish(v2) : _assertTrue(len(v1) == len(v2)) _assertTrue(all(v1[x] == v2[x] for x in range(len(v1)))) elif containerish(v1) : _assertTrue(len(v1) == 1 and v1[0] == v2) else : _assertTrue(v1 == v2)
def doTest(headersPresent) : tdf = TicDatFactory(**sillyMeSchema()) ticDat = tdf.TicDat(**sillyMeData()) schema2 = sillyMeSchema() schema2["b"][0] = ("bField2", "bField1", "bField3") schema3 = sillyMeSchema() schema3["a"][1] = ("aData2", "aData3", "aData1") schema4 = sillyMeSchema() schema4["a"][1] = ("aData1", "aData3") schema5 = sillyMeSchema() _tuple = lambda x : tuple(x) if utils.containerish(x) else (x,) for t in ("a", "b") : schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0]) schema5["a"][0], schema5["b"][0] = (), [] schema5b = sillyMeSchema() for t in ("a", "b") : schema5b[t][1] = _tuple(schema5b[t][0]) + _tuple(schema5b[t][1]) schema5b["a"][0], schema5b["b"][0] = (), [] schema6 = sillyMeSchema() schema6["d"] = [("dField",),[]] tdf2, tdf3, tdf4, tdf5, tdf5b, tdf6 = (TicDatFactory(**x) for x in (schema2, schema3, schema4, schema5, schema5b, schema6)) tdf5.set_generator_tables(["a", "c"]) tdf5b.set_generator_tables(("a", "c")) dirPath = makeCleanDir(os.path.join(_scratchDir, "silly")) tdf.csv.write_directory(ticDat, dirPath, write_header=headersPresent) ticDat2 = tdf2.csv.create_tic_dat(dirPath, headers_present=headersPresent) (self.assertFalse if headersPresent else self.assertTrue)(tdf._same_data(ticDat, ticDat2)) ticDat3 = tdf3.csv.create_tic_dat(dirPath, headers_present=headersPresent) (self.assertTrue if headersPresent else self.assertFalse)(tdf._same_data(ticDat, ticDat3)) if headersPresent : ticDat4 = tdf4.csv.create_tic_dat(dirPath, headers_present=headersPresent) for t in ("a", "b") : for k,v in getattr(ticDat4, t).items() : for _k, _v in v.items() : self.assertTrue(getattr(ticDat, t)[k][_k] == _v) if set(v) == set(getattr(ticDat, t)[k]) : self.assertTrue(t == "b") else : self.assertTrue(t == "a") else : self.assertTrue(self.firesException(lambda : tdf4.csv.create_tic_dat(dirPath, headers_present=headersPresent))) ticDat5 = tdf5.csv.create_tic_dat(dirPath, headers_present=headersPresent) (self.assertTrue if headersPresent else self.assertFalse)( tdf5._same_data(tdf._keyless(ticDat), ticDat5)) self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and not callable(ticDat5.b)) ticDat5b = tdf5b.csv.create_tic_dat(dirPath, headers_present=headersPresent) self.assertTrue(tdf5b._same_data(tdf._keyless(ticDat), ticDat5b)) self.assertTrue(callable(ticDat5b.a) and callable(ticDat5b.c) and not callable(ticDat5b.b)) ticDat6 = tdf6.csv.create_tic_dat(dirPath, headers_present=headersPresent) self.assertTrue(tdf._same_data(ticDat, ticDat6)) self.assertTrue(firesException(lambda : tdf6._same_data(ticDat, ticDat6))) self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d)) allDataTdf = TicDatFactory(**{t:[[], tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())] for t in tdf.all_tables}) def writeData(data): td = allDataTdf.TicDat(a = data, b=data, c=data) allDataTdf.csv.write_directory(td, dirPath, allow_overwrite=True, write_header=headersPresent) writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)]) ticDatMan = tdf.csv.create_tic_dat(dirPath, headers_present=headersPresent, freeze_it=True) self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3) self.assertTrue(ticDatMan.b[(1, 20, 30)]["bData"] == 40) rowCount = tdf.csv.get_duplicates(dirPath, headers_present= headersPresent) self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==2) writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1,20,30,12)]) rowCount = tdf.csv.get_duplicates(dirPath, headers_present=headersPresent) 
self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==3) self.assertTrue(set(rowCount["b"]) == {(1,20,30)} and rowCount["b"][1,20,30]==2)
def create_pan_dat(self, path_or_buf, fill_missing_fields=False, orient='split', **kwargs):
    """
    Create a PanDat object from a JSON file or string

    :param path_or_buf: a valid JSON string or file-like
    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in with their default value.
                                Otherwise, missing fields throw an Exception.
    :param orient: Indication of expected JSON string format. See pandas.read_json for more details.
    :param kwargs: additional named arguments to pass to pandas.read_json
    :return: a PanDat object populated by the matching tables.

    caveats: Missing tables always resolve to an empty table.
             Table names are matched with case-space insensitivity, but spaces are respected
             for field names. (ticdat supports whitespace in field names but not table names).
             Note that if you save a DataFrame to json and then recover it, the type of data might change.
             Specifically, text that looks numeric might be recovered as a number, to include the loss
             of leading zeros. To address this, you need to either use set_data_type for your
             PanDatFactory, or specify "dtype" in kwargs. (The former is obviously better).
    """
    if stringish(path_or_buf) and os.path.exists(path_or_buf):
        verify(os.path.isfile(path_or_buf), "%s appears to be a directory and not a file." % path_or_buf)
        with open(path_or_buf, "r") as f:
            loaded_dict = json.load(f)
    else:
        verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
        loaded_dict = json.loads(path_or_buf)
    verify(dictish(loaded_dict), "the json.load result doesn't resolve to a dictionary")
    verify(all(map(dictish, loaded_dict.values())),
           "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries")
    tbl_names = self._get_table_names(loaded_dict)
    verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
    rtn = {}
    for t, f in tbl_names.items():
        kwargs_ = dict(kwargs)
        if "dtype" not in kwargs_:
            kwargs_["dtype"] = self.pan_dat_factory._dtypes_for_pandas_read(t)
        rtn[t] = pd.read_json(json.dumps(loaded_dict[f]), orient=orient, **kwargs_)
    missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                      if f not in rtn[t].columns}
    if fill_missing_fields:
        for t, f in missing_fields:
            rtn[t][f] = self.pan_dat_factory.default_values[t][f]
    verify(fill_missing_fields or not missing_fields,
           "The following (table, field) pairs are missing fields.\n%s" % [(t, f) for t, f in missing_fields])
    missing_tables = sorted(set(self.pan_dat_factory.all_tables).difference(rtn))
    if missing_tables:
        print("The following table names could not be found in the json file/string.\n%s\n" %
              "\n".join(missing_tables))
    return _clean_pandat_creator(self.pan_dat_factory, rtn, json_read=True)
def testSilly(self): tdf = TicDatFactory(**sillyMeSchema()) ticDat = tdf.TicDat(**sillyMeData()) schema2 = sillyMeSchema() schema2["b"][0] = ("bField2", "bField1", "bField3") schema3 = sillyMeSchema() schema3["a"][1] = ("aData2", "aData3", "aData1") schema4 = sillyMeSchema() schema4["a"][1] = ("aData1", "aData3") schema5 = sillyMeSchema() _tuple = lambda x : tuple(x) if utils.containerish(x) else (x,) for t in ("a", "b") : schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0]) schema5["a"][0], schema5["b"][0] = (), [] schema6 = sillyMeSchema() schema6["d"] = [["dField"],()] tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in (schema2, schema3, schema4, schema5, schema6)) tdf5.set_generator_tables(("a","c")) filePath = os.path.join(_scratchDir, "silly.xls") tdf.xls.write_file(ticDat, filePath) ticDat2 = tdf2.xls.create_tic_dat(filePath) self.assertFalse(tdf._same_data(ticDat, ticDat2)) ticDat3 = tdf3.xls.create_tic_dat(filePath) self.assertTrue(tdf._same_data(ticDat, ticDat3)) ticDat4 = tdf4.xls.create_tic_dat(filePath) for t in ["a","b"]: for k,v in getattr(ticDat4, t).items() : for _k, _v in v.items() : self.assertTrue(getattr(ticDat, t)[k][_k] == _v) if set(v) == set(getattr(ticDat, t)[k]) : self.assertTrue(t == "b") else : self.assertTrue(t == "a") ticDat5 = tdf5.xls.create_tic_dat(filePath) self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5)) self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and not callable(ticDat5.b)) ticDat6 = tdf6.xls.create_tic_dat(filePath) self.assertTrue(tdf._same_data(ticDat, ticDat6)) self.assertTrue(firesException(lambda : tdf6._same_data(ticDat, ticDat6))) self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d)) def writeData(data, write_header = True): import xlwt book = xlwt.Workbook() for t in tdf.all_tables : sheet = book.add_sheet(t) if write_header : for i,f in enumerate(tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())) : sheet.write(0, i, f) for rowInd, row in enumerate(data) : for fieldInd, cellValue in enumerate(row): sheet.write(rowInd+ (1 if write_header else 0), fieldInd, cellValue) if os.path.exists(filePath): os.remove(filePath) book.save(filePath) writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)]) ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True) self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3) self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40) rowCount = tdf.xls.get_duplicates(filePath) self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==2) writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)], write_header=False) self.assertTrue(self.firesException(lambda : tdf.xls.create_tic_dat(filePath, freeze_it=True))) ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True, headers_present=False) self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3) self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40) rowCount = tdf.xls.get_duplicates(filePath, headers_present=False) self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==2) ticDat.a["theboger"] = (1, None, 12) tdf.xls.write_file(ticDat, filePath, allow_overwrite=True) ticDatNone = tdf.xls.create_tic_dat(filePath, freeze_it=True) # THIS IS A FLAW - but a minor one. None's are hard to represent. It is turning into the empty string here. # not sure how to handle this, but documenting for now. 
self.assertFalse(tdf._same_data(ticDat, ticDatNone)) self.assertTrue(ticDatNone.a["theboger"]["aData2"] == "") writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1,20,30,12)]) rowCount = tdf.xls.get_duplicates(filePath) self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1} and rowCount["a"][1]==3) self.assertTrue(set(rowCount["b"]) == {(1,20,30)} and rowCount["b"][1,20,30]==2)