def get_table_as_mod_text(tdf, tbn):
    p_tbn = prepend + tbn
    rtn = p_tbn
    if len(tdf.primary_key_fields[tbn]) > 1:
        fkr = []
        for i in range(len(tdf.primary_key_fields[tbn])):
            pk = tdf.primary_key_fields[tbn][i]
            fk = list(filter(lambda k: k.native_table == tbn and k.mapping.native_field == pk,
                             tdf.foreign_keys))
            verify(len(fk) == 1,
                   "Table '%s' needs to fully link its primary key fields to parent tables via"
                   " foreign keys." % tbn)
            fkr.append(prepend + fk[0].foreign_table)
        rtn += '(' + ','.join(fkr) + ')'
    rtn += ':'
    fields = []
    for df in tdf.data_fields[tbn]:
        df_m = p_tbn + '_' + df.replace(' ', '_').lower()
        fields.append(df_m)
    rtn += ','.join(fields)
    rtn += ';\n'
    return rtn
def write_file(self, tic_dat, json_file_path, allow_overwrite=False, verbose=False):
    """
    write the ticDat data to a json file
    :param tic_dat: the data object to write (typically a TicDat)
    :param json_file_path: The file path of the json file to create.
    :param allow_overwrite: boolean - are we allowed to overwrite an existing file?
    :param verbose: boolean. Verbose mode writes the data rows as dicts keyed by field name.
                    Otherwise, they are lists.
    :return:
    """
    _standard_verify(self.tic_dat_factory)
    verify(not (os.path.exists(json_file_path) and not allow_overwrite),
           "%s exists and allow_overwrite is not enabled" % json_file_path)
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
    jdict = make_json_dict(self.tic_dat_factory, tic_dat, verbose)
    with open(json_file_path, "w") as fp:
        json.dump(jdict, fp, sort_keys=True, indent=2)
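# Usage sketch (not part of the library): a minimal round-trip through this json writer.
# The schema and file name below are illustrative assumptions, not taken from this module.
#
#   from ticdat import TicDatFactory
#   tdf = TicDatFactory(parameters=[["Name"], ["Value"]])
#   dat = tdf.TicDat(parameters={"Max Capacity": {"Value": 100}})
#   tdf.json.write_file(dat, "parameters.json", allow_overwrite=True, verbose=True)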
def write_data(self, pan_dat, engine, schema, pre_existing_rows=None):
    '''
    write the PanDat data to a postgres database
    :param pan_dat: a PanDat object
    :param engine: A sqlalchemy connection to the PostGres database
    :param schema: The postgres schema to write to (call self.write_schema explicitly as needed)
    :param pre_existing_rows: if provided, a dict mapping table name to either "delete" or "append"
                              default behavior is "delete"
    :return:
    '''
    self._check_good_pgtd_compatible_table_field_names()
    msg = []
    verify(self.tdf.good_pan_dat_object(pan_dat, msg.append),
           "pan_dat not a good object for this factory : %s" % "\n".join(msg))
    self.check_tables_fields(engine, schema, error_on_missing_table=True)  # call self.write_schema as needed
    self._handle_prexisting_rows(engine, schema, pre_existing_rows or {})
    pan_dat = self.tdf._pre_write_adjustment(pan_dat)
    for table in self._ordered_tables():
        df = getattr(pan_dat, table).copy(deep=True)
        fields = self.tdf.primary_key_fields.get(table, ()) + self.tdf.data_fields.get(table, ())
        df.rename(columns={f: _pg_name(f) for f in fields}, inplace=True)
        df.to_sql(name=table, schema=schema, con=engine, if_exists="append", index=False)
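# Usage sketch, under assumptions: a PanDatFactory `pdf`, a PostgresPanFactory-style wrapper
# (constructed directly, mirroring how PostgresTicFactory is used elsewhere in this source),
# and a placeholder connection string. Names here are illustrative.
#
#   import sqlalchemy as sa
#   from ticdat import PanDatFactory
#   pdf = PanDatFactory(cities=[["Name"], ["Population"]])
#   pan_dat = pdf.PanDat(cities=[["Springfield", 30000]])
#   engine = sa.create_engine("postgresql://user:password@localhost/mydb")
#   pg_factory = PostgresPanFactory(pdf)
#   pg_factory.write_schema(engine, "my_schema")  # create the schema/tables first, if needed
#   pg_factory.write_data(pan_dat, engine, "my_schema", pre_existing_rows={"cities": "append"})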
def _create_jdict(self, path_or_buf):
    if stringish(path_or_buf) and os.path.exists(path_or_buf):
        reasonable_string = path_or_buf
        verify(os.path.isfile(path_or_buf), "json_file_path is not a valid file path.")
        try:
            with open(path_or_buf, "r") as fp:
                jdict = json.load(fp)
        except Exception as e:
            raise TicDatError("Unable to interpret %s as json file : %s" % (path_or_buf, e))
    else:
        verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
        reasonable_string = path_or_buf[:10]
        try:
            jdict = json.loads(path_or_buf)
        except Exception as e:
            raise TicDatError("Unable to interpret %s as json string : %s" % (reasonable_string, e))
    verify(dictish(jdict), "%s failed to load a dictionary" % reasonable_string)
    verify(all(map(stringish, jdict)),
           "The dictionary loaded from %s isn't indexed by strings" % reasonable_string)
    verify(all(map(containerish, jdict.values())),
           "The dictionary loaded from %s doesn't have containers as values" % reasonable_string)
    return jdict
def create_pan_dat(self, dir_path, fill_missing_fields=False, **kwargs):
    """
    Create a PanDat object from a directory of csv files.
    :param dir_path: the directory containing the .csv files.
    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in with their
                                default value. Otherwise, missing fields throw an Exception.
    :param kwargs: additional named arguments to pass to pandas.read_csv
    :return: a PanDat object populated by the matching tables.
    caveats: Missing tables issue a print warning and resolve to empty tables.
             Table names are matched with case-space insensitivity, but spaces are respected
             for field names. (ticdat supports whitespace in field names but not table names).

    Note that if you save a DataFrame to csv and then recover it, the type of data might change.
    For example

        df = pd.DataFrame({"a": ["100", "200", "300"]})
        df.to_csv("something.csv")
        df2 = pd.read_csv("something.csv")

    results in a numeric column in df2. To address this, you need to either use set_data_type
    for your PanDatFactory, or specify "dtype" in kwargs. (The former is obviously better).
    This problem is even worse with df = pd.DataFrame({"a": ["0100", "1200", "2300"]})
    """
    verify(os.path.isdir(dir_path), "%s not a directory path" % dir_path)
    tbl_names = self._get_table_names(dir_path)
    rtn = {}
    for t, f in tbl_names.items():
        kwargs_ = dict(kwargs)
        if "dtype" not in kwargs_:
            kwargs_["dtype"] = self.pan_dat_factory._dtypes_for_pandas_read(t)
        rtn[t] = pd.read_csv(f, **kwargs_)
    missing_tables = {t for t in self.pan_dat_factory.all_tables if t not in rtn}
    if missing_tables:
        print("The following table names could not be found in the %s directory.\n%s\n" %
              (dir_path, "\n".join(missing_tables)))
    missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                      if f not in rtn[t].columns}
    if fill_missing_fields:
        for t, f in missing_fields:
            rtn[t][f] = self.pan_dat_factory.default_values[t][f]
    verify(fill_missing_fields or not missing_fields,
           "The following (table, file_name, field) triplets are missing fields.\n%s" %
           [(t, os.path.basename(tbl_names[t]), f) for t, f in missing_fields])
    return _clean_pandat_creator(self.pan_dat_factory, rtn)
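# Usage sketch for the dtype caveat above. Assumptions: an illustrative PanDatFactory `pdf`
# with a table "items" whose "Code" field should stay text, and a directory "my_csv_dir".
#
#   from ticdat import PanDatFactory
#   pdf = PanDatFactory(items=[["Code"], ["Quantity"]])
#   # keep leading zeros such as "0100" by declaring the field as text ...
#   pdf.set_data_type("items", "Code", number_allowed=False, strings_allowed='*')
#   dat = pdf.csv.create_pan_dat("my_csv_dir")
#   # ... or, alternatively, force the dtype directly through pandas.read_csv
#   dat = pdf.csv.create_pan_dat("my_csv_dir", dtype={"Code": str})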
def find_duplicates(self, engine, schema, active_fld=""):
    """
    Find the row counts for duplicated rows.
    :param engine: A sqlalchemy Engine object that can connect to our postgres instance
    :param schema: Name of the schema within the engine's database to use
    :param active_fld: if provided, a string for a boolean filter field. Must be compliant with
                       PG naming conventions, which are different from ticdat field naming
                       conventions. Typically a developer can ignore this argument; it is
                       designed for expert support.
    :return: A dictionary whose keys are table names for the primary-keyed tables.
             Each value of the return dictionary is itself a dictionary.
             The inner dictionary is keyed by the primary key values encountered in the table,
             and the value is the count of records in the postgres table with this primary key.
             Row counts smaller than 2 are pruned off, as they aren't duplicates
    """
    verify(sa, "sqlalchemy needs to be installed to use this subroutine")
    self._check_good_pgtd_compatible_table_field_names()
    if not self._duplicate_focused_tdf:
        return {}
    return find_duplicates(PostgresTicFactory(self._duplicate_focused_tdf).create_tic_dat(
        engine, schema, active_fld=active_fld), self._duplicate_focused_tdf)
def __init__(self, model_type='gurobi', model_name="model"):
    """
    :param model_type: either gurobi, cplex or xpress
    :return: a Model object that encapsulates the appropriate engine model object
    """
    engines = {"gurobi": gurobi, "cplex": cplex, "xpress": xpress}
    verify(model_type in engines, "model_type needs to be one of 'gurobi', 'cplex', 'xpress'")
    verify(not utils.stringish(engines[model_type]),
           "You need to have the %s package installed to build this model type." % engines[model_type])
    env = {"env": gurobi_env()} if model_type == "gurobi" else {}
    self._core_model = getattr(engines[model_type],
                               {"gurobi": "Model", "cplex": "Model", "xpress": "problem"}[model_type])(
                               model_name, **env)
    self._model_type = model_type
    self._sum = ({"gurobi": lambda: gurobi.quicksum,
                  "cplex": lambda: self.core_model.sum,
                  "xpress": lambda: xpress.Sum}[model_type])()
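# Usage sketch, under assumptions: the gurobipy package is installed and licensed, and the
# variable/constraint helper calls follow this Model wrapper's conventions as I understand them.
#
#   m = Model(model_type="gurobi", model_name="diet")
#   x = m.add_var(name="servings", lb=0)
#   m.add_constraint(x >= 2, name="min_servings")
#   m.set_objective(3 * x, sense="minimize")
#   m.optimize()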
def find_data_type_failures(self, pan_dat, as_table=True):
    """
    Finds the data type failures for a pandat object
    :param pan_dat: pandat object
    :param as_table: boolean - if truthy then the values of the return dictionary will be the
                     data type failure rows themselves. Otherwise will return the boolean Series
                     that indicates which rows have data type failures.
    :return: A dictionary constructed as follows:
             The keys are namedtuples with members "table", "field". Each (table, field) pair
             has data values that are inconsistent with its data type. (table, field) pairs
             with no data type at all are never part of the returned dictionary.
             The values are DataFrames that contain the subset of rows that exhibit data failures
             for this specific table, field pair (or the boolean Series that identifies these rows).
    """
    msg = []
    verify(self.good_pan_dat_object(pan_dat, msg.append),
           "pan_dat not a good object for this factory : %s" % "\n".join(msg))
    rtn = {}
    TableField = clt.namedtuple("TableField", ["table", "field"])
    for table, type_row in self._data_types.items():
        _table = getattr(pan_dat, table)
        for field, data_type in type_row.items():
            def bad_row(row):
                data = row[field]
                # pandas turns None into nan
                return not data_type.valid_data(None if safe_apply(isnan)(data) else data)
            where_bad_rows = _table.apply(bad_row, axis=1)
            if where_bad_rows.any():
                rtn[TableField(table, field)] = _table[where_bad_rows].copy() if as_table else where_bad_rows
    return rtn
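# Usage sketch (illustrative schema built purely for this example):
#
#   from ticdat import PanDatFactory
#   pdf = PanDatFactory(orders=[["ID"], ["Quantity"]])
#   pdf.set_data_type("orders", "Quantity", must_be_int=True, min=0, inclusive_min=True)
#   dat = pdf.PanDat(orders=[[1, 10], [2, -5.5]])
#   failures = pdf.find_data_type_failures(dat)
#   # failures[("orders", "Quantity")] is a DataFrame holding the offending second row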
def find_data_row_failures(self, pan_dat, as_table=True):
    """
    Finds the data row failures for a pandat object
    :param pan_dat: a pandat object
    :param as_table: boolean - if truthy then the values of the return dictionary will be the
                     predicate failure rows themselves. Otherwise will return the boolean Series
                     that indicates which rows have predicate failures.
    :return: A dictionary constructed as follows:
             The keys are namedtuples with members "table", "predicate_name".
             The values are DataFrames that contain the subset of rows that exhibit data failures
             for this specific table, predicate pair (or the Series that identifies these rows).
    """
    msg = []
    verify(self.good_pan_dat_object(pan_dat, msg.append),
           "pan_dat not a good object for this factory : %s" % "\n".join(msg))
    rtn = {}
    TPN = clt.namedtuple("TablePredicateName", ["table", "predicate_name"])
    for tbl, row_predicates in self._data_row_predicates.items():
        for pn, p in row_predicates.items():
            _table = getattr(pan_dat, tbl)
            bad_row = lambda row: not p(row)
            where_bad_rows = _table.apply(bad_row, axis=1)
            if where_bad_rows.any():
                rtn[TPN(tbl, pn)] = _table[where_bad_rows].copy() if as_table else where_bad_rows
    return rtn
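# Usage sketch, under the assumption that add_data_row_predicate is the PanDatFactory hook
# that registers the row predicates checked here (schema and names are illustrative):
#
#   from ticdat import PanDatFactory
#   pdf = PanDatFactory(shipments=[["ID"], ["Origin", "Destination"]])
#   pdf.add_data_row_predicate("shipments", predicate_name="distinct endpoints",
#                              predicate=lambda row: row["Origin"] != row["Destination"])
#   dat = pdf.PanDat(shipments=[[1, "NYC", "BOS"], [2, "LAX", "LAX"]])
#   failures = pdf.find_data_row_failures(dat)
#   # failures[("shipments", "distinct endpoints")] holds the second row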
def write_data(self, pan_dat, engine, schema, pre_existing_rows=None, active_fld=""):
    '''
    write the PanDat data to a postgres database
    :param pan_dat: a PanDat object
    :param engine: A sqlalchemy connection to the PostGres database
    :param schema: The postgres schema to write to (call self.write_schema explicitly as needed)
    :param pre_existing_rows: if provided, a dict mapping table name to either "delete" or "append"
                              default behavior is "delete"
    :param active_fld: if provided, a string for a boolean filter field which will be populated
                       with True. Must be compliant with PG naming conventions, which are
                       different from ticdat field naming conventions. Typically a developer can
                       ignore this argument; it is designed for expert support.
    :return:
    '''
    verify(_pg_name(active_fld) == active_fld, "active_fld needs to be compliant with PG naming conventions")
    active_field_tables = _active_fld_tables(engine, schema, active_fld) if active_fld else set()
    self._check_good_pgtd_compatible_table_field_names()
    msg = []
    verify(self.tdf.good_pan_dat_object(pan_dat, msg.append),
           "pan_dat not a good object for this factory : %s" % "\n".join(msg))
    self.check_tables_fields(engine, schema, error_on_missing_table=True)  # call self.write_schema as needed
    self._handle_prexisting_rows(engine, schema, pre_existing_rows or {})
    pan_dat = self.tdf._pre_write_adjustment(pan_dat)
    for table in self._ordered_tables():
        df = getattr(pan_dat, table).copy(deep=True)
        fields = self.tdf.primary_key_fields.get(table, ()) + self.tdf.data_fields.get(table, ())
        df.rename(columns={f: _pg_name(f) for f in fields}, inplace=True)
        if table in active_field_tables:
            df[active_fld] = True
        df.to_sql(name=table, schema=schema, con=engine, if_exists="append", index=False)
def good_pan_dat_object(self, data_obj, bad_message_handler=lambda x: None):
    """
    determines if an object is a valid PanDat object for this schema
    :param data_obj: the object to verify
    :param bad_message_handler: a callback function to receive a description of any failure message
    :return: True if the data_obj can be recognized as a PanDat data object. False otherwise.
    """
    verify(DataFrame and pd, "Need to install pandas")
    for t in self.all_tables:
        if not hasattr(data_obj, t):
            bad_message_handler(t + " not an attribute.")
            return False
        if not isinstance(getattr(data_obj, t), DataFrame):
            bad_message_handler(t + " is not a DataFrame")
            return False
    missing_fields = {(t, f) for t in self.all_tables
                      for f in self.primary_key_fields.get(t, ()) + self.data_fields.get(t, ())
                      if f not in getattr(data_obj, t).columns}
    if missing_fields:
        bad_message_handler("The following are (table, field) pairs missing from the data.\n%s" % missing_fields)
        return False
    return True
def create_pan_dat(self, engine, schema, active_fld=""):
    """
    Create a PanDat object from a PostGres connection
    :param engine: A sqlalchemy connection to the PostGres database
    :param schema: The name of the schema to read from
    :param active_fld: if provided, a string for a boolean filter field. Must be compliant with
                       PG naming conventions, which are different from ticdat field naming
                       conventions. Typically a developer can ignore this argument; it is
                       designed for expert support.
    :return: a PanDat object populated by the matching tables. Missing tables issue a warning
             and resolve to empty.
    """
    self._check_good_pgtd_compatible_table_field_names()
    verify(_pg_name(active_fld) == active_fld, "active_fld needs to be compliant with PG naming conventions")
    missing_tables = self.check_tables_fields(engine, schema)
    active_fld_tables = _active_fld_tables(engine, schema, active_fld) if active_fld else set()
    rtn = {}
    for table in set(self.tdf.all_tables).difference(missing_tables):
        fields = [(f, _pg_name(f)) for f in
                  self.tdf.primary_key_fields.get(table, ()) + self.tdf.data_fields.get(table, ())]
        rtn[table] = pd.read_sql(sql=f"Select {', '.join([pgf for f, pgf in fields])} from {schema}.{table}" +
                                     (f" where {active_fld} is True" if table in active_fld_tables else ""),
                                 con=engine)
        rtn[table].rename(columns={pgf: f for f, pgf in fields}, inplace=True)
    rtn = self.tdf.PanDat(**rtn)
    msg = []
    assert self.tdf.good_pan_dat_object(rtn, msg.append), str(msg)
    return self.tdf._general_post_read_adjustment(rtn, push_parameters_to_be_valid=True)
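# Usage sketch, continuing the write_data sketch above (assumptions: the schema was previously
# created and populated; `pg_factory` is the same illustrative wrapper; the connection string
# is a placeholder):
#
#   import sqlalchemy as sa
#   engine = sa.create_engine("postgresql://user:password@localhost/mydb")
#   pan_dat = pg_factory.create_pan_dat(engine, "my_schema")
#   print(pan_dat.cities.head())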
def _xls_write(self, tic_dat, file_path, tbl_name_mapping):
    verify(xlwt, "Can't write .xls files because xlwt package isn't installed.")
    tdf = self.tic_dat_factory
    book = xlwt.Workbook()
    for t in sorted(sorted(tdf.all_tables), key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
        sheet = book.add_sheet(tbl_name_mapping[t][:_longest_sheet])
        for i, f in enumerate(tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())):
            sheet.write(0, i, f)
        _t = getattr(tic_dat, t)
        if utils.dictish(_t):
            for row_ind, (p_key, data) in enumerate(_t.items()):
                for field_ind, cell in enumerate((p_key if containerish(p_key) else (p_key,)) +
                                                 tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                    sheet.write(row_ind + 1, field_ind, cell)
        else:
            for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                for field_ind, cell in enumerate(tuple(data[_f] for _f in tdf.data_fields[t])):
                    sheet.write(row_ind + 1, field_ind, cell)
    if os.path.exists(file_path):
        os.remove(file_path)
    book.save(file_path)
def _create_tic_dat_dict(self, xls_file_path, row_offsets, headers_present):
    verify(utils.dictish(row_offsets) and set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
           all(utils.numericish(x) and (x >= 0) for x in row_offsets.values()),
           "row_offsets needs to map from table names to non negative row offset")
    row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables}, **row_offsets)
    tdf = self.tic_dat_factory
    rtn = {}
    sheets, field_indicies = self._get_sheets_and_fields(
        xls_file_path, set(tdf.all_tables).difference(tdf.generator_tables),
        row_offsets, headers_present, print_missing_tables=True)
    ho = 1 if headers_present else 0
    for tbl, sheet in sheets.items():
        fields = tdf.primary_key_fields.get(tbl, ()) + tdf.data_fields.get(tbl, ())
        assert fields or tbl in self.tic_dat_factory.generic_tables
        indicies = field_indicies[tbl]
        table_len = min(len(sheet.col_values(indicies[field])) for field in (fields or indicies))
        if tdf.primary_key_fields.get(tbl, ()):
            tableObj = {self._sub_tuple(tbl, tdf.primary_key_fields[tbl], indicies)(x):
                        self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies)(x)
                        for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[tbl] + ho:])}
        elif tbl in tdf.generic_tables:
            tableObj = [{f: x[i] for f, i in field_indicies[tbl].items()}
                        for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[tbl] + ho:])]
        else:
            tableObj = [self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies)(x)
                        for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[tbl] + ho:])]
        rtn[tbl] = tableObj
    for tbl in tdf.generator_tables:
        rtn[tbl] = self._create_generator_obj(xls_file_path, tbl, row_offsets[tbl], headers_present)
    return rtn
def write_sql_file(self, tic_dat, sql_file_path, include_schema=False, allow_overwrite=False):
    """
    write the sql for the ticDat data to a text file
    :param tic_dat: the data object to write
    :param sql_file_path: the path of the text file to hold the sql statements for the data
    :param include_schema: boolean - should we write the schema sql first?
    :param allow_overwrite: boolean - are we allowed to overwrite a pre-existing file?
    :return:
    caveats: float("inf"), float("-inf") are written as "inf", "-inf" (unless infinity_io_flag
             is being applied). True/False are written as "True", "False"
    """
    verify(sql, "sqlite3 needs to be installed to use this subroutine")
    verify(allow_overwrite or not os.path.exists(sql_file_path),
           "The %s path exists and overwrite is not allowed" % sql_file_path)
    must_schema = set(self.tic_dat_factory.all_tables if include_schema else [])
    if self.tic_dat_factory.generic_tables:
        gt = self.tic_dat_factory.generic_tables
        dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
        return tdf.sql._write_sql_file(dat, sql_file_path, must_schema.union(gt))
    return self._write_sql_file(tic_dat, sql_file_path, must_schema)
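# Usage sketch (illustrative schema and file name; tdf.sql is the TicDatFactory SQLite interface):
#
#   from ticdat import TicDatFactory
#   tdf = TicDatFactory(cities=[["Name"], ["Population"]])
#   dat = tdf.TicDat(cities={"Springfield": {"Population": 30000}})
#   tdf.sql.write_sql_file(dat, "cities.sql", include_schema=True, allow_overwrite=True)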
def _find_foreign_key_failure_rows(self, pan_dat):
    msg = []
    verify(self.good_pan_dat_object(pan_dat, msg.append),
           "pan_dat not a good object for this factory : %s" % "\n".join(msg))
    rtn = {}
    for fk in self.foreign_keys:
        native, foreign, mappings, card = fk
        child = getattr(pan_dat, native).copy(deep=True)
        # makes sense to deep copy the possibly smaller drop_duplicates slice of the parent table
        parent = getattr(pan_dat, foreign)
        _ = 0
        while any("_%s_" % _ in c for c in set(parent.columns).union(child.columns)):
            _ += 1
        magic_field = "_%s_" % _
        if all(hasattr(mappings, _) for _ in ["native_field", "foreign_field"]):
            parent = parent.drop_duplicates(mappings.foreign_field, inplace=False).copy(deep=True)
            parent[mappings.native_field] = parent[mappings.foreign_field]
            new_index = mappings.native_field
        else:
            parent = parent.drop_duplicates([_.foreign_field for _ in mappings], inplace=False).copy(deep=True)
            for _ in mappings:
                parent[_.native_field] = parent[_.foreign_field]
            new_index = [_.native_field for _ in mappings]
        # sadly a join might knacker the row order, hence the ugliness with magic_field*2 for child
        parent[magic_field] = True
        child.insert(0, magic_field * 2, range(0, len(child)))
        parent.set_index(new_index, drop=True, inplace=True)
        child.set_index(new_index, drop=True, inplace=True)
        joined = child.join(parent, rsuffix=magic_field)
        bad_rows = set(joined[joined[magic_field] != True][magic_field * 2])
        if bad_rows:
            # for weird reasons I can't totally figure out, need to cast to list
            rtn[fk] = list(child.apply(lambda row: row[magic_field * 2] in bad_rows, axis=1))
    return rtn
def create_tic_dat(self, engine, schema, freeze_it=False, active_fld=""):
    """
    Create a TicDat object from a PostGres connection
    :param engine: A sqlalchemy connection to the PostGres database
    :param schema: The name of the schema to read from
    :param freeze_it: boolean. should the returned object be frozen?
    :param active_fld: if provided, a string for a boolean filter field. Must be compliant with
                       PG naming conventions, which are different from ticdat field naming
                       conventions. Typically a developer can ignore this argument; it is
                       designed for expert support.
    :return: a TicDat object populated by the matching tables. Missing tables issue a warning
             and resolve to empty.
    """
    verify(sa, "sqlalchemy needs to be installed to use this subroutine")
    verify(_pg_name(active_fld) == active_fld, "active_fld needs to be compliant with PG naming conventions")
    self._check_good_pgtd_compatible_table_field_names()
    return self._Rtn(freeze_it)(**self._create_tic_dat(engine, schema, active_fld))
def find_duplicates(self, pan_dat, keep="first", as_table=True):
    """
    Find the duplicated rows based on the primary key fields.
    :param pan_dat: pandat object
    :param keep: 'first': Treat all duplicated rows as duplicates except for the first occurrence.
                 'last': Treat all duplicated rows as duplicates except for the last occurrence.
                 False: Treat all duplicated rows as duplicates
    :param as_table: boolean - if truthy then the values of the return dictionary will be the
                     duplicated rows themselves. Otherwise will return the boolean Series that
                     indicates which rows are duplicated rows.
    :return: A dictionary whose keys are the table names and whose values are duplicated rows
             (or the Series that identifies these rows)
    """
    msg = []
    verify(self.good_pan_dat_object(pan_dat, msg.append),
           "pan_dat not a good object for this factory : %s" % "\n".join(msg))
    rtn = {}
    for t in self.all_tables:
        if self.primary_key_fields.get(t):
            dups = getattr(pan_dat, t).duplicated(list(self.primary_key_fields[t]), keep=keep)
            if dups.any():
                rtn[t] = getattr(pan_dat, t)[list(dups)] if as_table else dups
    return rtn
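# Usage sketch (illustrative schema; the second and third rows share a primary key):
#
#   from ticdat import PanDatFactory
#   pdf = PanDatFactory(cities=[["Name"], ["Population"]])
#   dat = pdf.PanDat(cities=[["Springfield", 30000], ["Shelbyville", 20000], ["Shelbyville", 25000]])
#   dups = pdf.find_duplicates(dat)
#   # dups["cities"] is a DataFrame holding the later "Shelbyville" row (since keep="first")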
def set_objective(self, expression, sense="minimize"):
    """
    Set the objective for the model.
    :param expression: A linear or quadratic combination of variables and numbers.
                       Be sure to use Model.sum for summing over iterables.
    :param sense: Either 'minimize' or 'maximize'
    :return: None
    """
    verify(sense in ["maximize", "minimize"], "sense needs to be 'maximize' or 'minimize'")
    if self.model_type == "gurobi":
        self.core_model.setObjective(expression,
                                     sense={"maximize": gurobi.GRB.MAXIMIZE,
                                            "minimize": gurobi.GRB.MINIMIZE}[sense])
    if self.model_type == "cplex":
        ({"maximize": self.core_model.maximize,
          "minimize": self.core_model.minimize}[sense])(expression)
    if self.model_type == "xpress":
        self.core_model.setObjective(expression,
                                     sense={"maximize": xpress.maximize,
                                            "minimize": xpress.minimize}[sense])
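# Usage sketch, continuing the Model sketch above. It assumes `m.sum` is the engine-neutral
# summation mentioned in the docstring and that add_var accepts the keywords shown.
#
#   xs = {i: m.add_var(name="x_%s" % i, lb=0) for i in range(3)}
#   m.set_objective(m.sum(2 * xs[i] for i in range(3)), sense="maximize")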
def _get_file_path(self, dir_path, table):
    rtn = [path for f in os.listdir(dir_path) for path in [os.path.join(dir_path, f)]
           if os.path.isfile(path) and f.lower().replace(" ", "_") == "%s.csv" % table.lower()]
    verify(len(rtn) <= 1, "duplicate .csv files found for %s" % table)
    if rtn:
        return rtn[0]
def _get_table_names(self, db_file_path, tables):
    rtn = {}
    with _connect(_connection_str(db_file_path)) as con:
        def try_name(name):
            with con.cursor() as cur:
                try:
                    cur.execute("Select * from [%s]" % name)
                except:
                    return False
            return True
        for table in tables:
            rtn[table] = [t for t in all_underscore_replacements(table) if try_name(t)]
            verify(len(rtn[table]) >= 1,
                   "Unable to recognize table %s in MS Access file %s" % (table, db_file_path))
            verify(len(rtn[table]) <= 1,
                   "Duplicate tables found for table %s in MS Access file %s" % (table, db_file_path))
            rtn[table] = rtn[table][0]
    return rtn
def _verify_differentiable_sheet_names(self):
    rtn = defaultdict(set)
    for t in self.tic_dat_factory.all_tables:
        rtn[t[:_longest_sheet]].add(t)
    rtn = [v for k, v in rtn.items() if len(v) > 1]
    verify(not rtn, "The following tables collide when names are truncated to %s characters.\n%s" %
           (_longest_sheet, sorted(map(sorted, rtn))))
def _create_tic_dat_dict(self, jdict):
    tdf = self.tic_dat_factory
    rtn = {}
    table_keys = defaultdict(list)
    for t in tdf.all_tables:
        for t2 in jdict:
            if stringish(t2) and t.lower() == t2.replace(" ", "_").lower():
                table_keys[t].append(t2)
        if len(table_keys[t]) >= 1:
            verify(len(table_keys[t]) < 2, "Found duplicate matching keys for table %s" % t)
            rtn[t] = jdict[table_keys[t][0]]
    orig_rtn, rtn = rtn, {}
    for t, rows in orig_rtn.items():
        all_fields = tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())
        rtn[t] = []
        for row in rows:
            if dictish(row):
                rtn[t].append({f: tdf._general_read_cell(t, f, x) for f, x in row.items()})
            else:
                rtn[t].append([tdf._general_read_cell(t, f, x) for f, x in zip(all_fields, row)])
    return rtn
def _xlsx_write(self, tic_dat, file_path, tbl_name_mapping):
    verify(xlsx, "Can't write .xlsx files because xlsxwriter package isn't installed.")
    tdf = self.tic_dat_factory
    if os.path.exists(file_path):
        os.remove(file_path)
    book = xlsx.Workbook(file_path)
    def clean_for_write(t, f, x):
        if self.tic_dat_factory.infinity_io_flag != "N/A" or \
           (t == "parameters" and self.tic_dat_factory.parameters):
            return self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
        if x in [float("inf"), -float("inf")] or isinstance(x, datetime.datetime):
            return str(x)
        return x
    for t in sorted(sorted(tdf.all_tables), key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
        all_flds = self.tic_dat_factory.primary_key_fields.get(t, ()) + self.tic_dat_factory.data_fields.get(t, ())
        sheet = book.add_worksheet(tbl_name_mapping[t][:_longest_sheet])
        for i, f in enumerate(tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())):
            sheet.write(0, i, f)
        _t = getattr(tic_dat, t)
        if utils.dictish(_t):
            for row_ind, (p_key, data) in enumerate(_t.items()):
                for field_ind, cell in enumerate((p_key if containerish(p_key) else (p_key,)) +
                                                 tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                    sheet.write(row_ind + 1, field_ind, clean_for_write(t, all_flds[field_ind], cell))
        else:
            for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                for field_ind, cell in enumerate(tuple(data[_f] for _f in tdf.data_fields[t])):
                    sheet.write(row_ind + 1, field_ind, clean_for_write(t, all_flds[field_ind], cell))
    book.close()
def write_file(self, pan_dat, file_path, case_space_sheet_names=False):
    """
    write the panDat data to an excel file
    :param pan_dat: the PanDat object to write
    :param file_path: The file path of the excel file to create
    :param case_space_sheet_names: boolean - make best guesses how to add spaces and upper case
                                   characters to sheet names
    :return:
    caveats: The row names (index) aren't written.
    """
    msg = []
    verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
           "pan_dat not a good object for this factory : %s" % "\n".join(msg))
    pan_dat = self.pan_dat_factory._pre_write_adjustment(pan_dat)
    verify(not os.path.isdir(file_path), "A directory is not a valid xls file path")
    case_space_sheet_names = case_space_sheet_names and \
        len(set(self.pan_dat_factory.all_tables)) == \
        len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
    writer = pd.ExcelWriter(file_path)
    for t in self.pan_dat_factory.all_tables:
        getattr(pan_dat, t).to_excel(writer, case_space_to_pretty(t) if case_space_sheet_names else t,
                                     index=False)
    writer.save()
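# Usage sketch (illustrative schema and file name; pdf.xls is the PanDatFactory Excel interface):
#
#   from ticdat import PanDatFactory
#   pdf = PanDatFactory(cities=[["Name"], ["Population"]])
#   dat = pdf.PanDat(cities=[["Springfield", 30000]])
#   pdf.xls.write_file(dat, "cities.xlsx", case_space_sheet_names=True)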
def read_lingo_text(tdf, results_text):
    """
    Read Lingo .ldt strings
    :param tdf: A TicDatFactory defining the schema
    :param results_text: A dict mapping table names to strings that define the Lingo tables
    :return: A TicDat object consistent with tdf
    """
    for i in results_text.values():
        verify(stringish(i), "text needs to be a string")
    def _get_as_type(val):
        try:
            return float(val)
        except ValueError:
            return val
    dict_with_lists = defaultdict(list)
    for tbn in results_text:
        rows = []
        text = results_text[tbn].strip().split("\n")
        for line in text:
            rows.append(list(map(lambda k: _get_as_type(k), line.strip().split())))
        dict_with_lists[tbn] = rows
    assert not find_duplicates_from_dict_ticdat(tdf, dict_with_lists), \
        "duplicates were found - if asserts are disabled, duplicate rows will overwrite"
    return tdf.TicDat(**{k.replace(tdf.lingo_prepend, "", 1): v for k, v in dict_with_lists.items()})
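# Usage sketch (illustrative schema and a hand-written Lingo-style text block; in practice the
# strings would come from solver output, and table keys may carry tdf.lingo_prepend):
#
#   from ticdat import TicDatFactory
#   tdf = TicDatFactory(cities=[["Name"], ["Population"]])
#   text = {"cities": "Springfield 30000\nShelbyville 20000"}
#   dat = read_lingo_text(tdf, text)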
def get_fld_type(t, f, default_type):
    if (t, f) in forced_field_types:
        return forced_field_types[t, f]
    if t == "parameters" and self.tdf.parameters:
        return "text"
    fld_type = self.tdf.data_types.get(t, {}).get(f)
    if not fld_type:
        return default_type
    if fld_type.datetime:
        return "timestamp"
    verify(not (fld_type.number_allowed and fld_type.strings_allowed),
           f"Select one of string or numeric for {t}.{f} if declaring type and using postgres")
    if fld_type.strings_allowed:
        return 'text'
    if fld_type.number_allowed:
        if fld_type.must_be_int:
            return 'integer'
        else:
            return 'float'
    else:
        raise TicDatError(f"Allow one of text or numeric for {t}.{f} if declaring type and using postgres")
def _get_table_names(self, db_file_path, tables):
    rtn = {}
    with sql.connect(db_file_path) as con:
        def try_name(name):
            try:
                con.execute("Select * from [%s]" % name)
            except:
                return False
            return True
        for table in tables:
            rtn[table] = [t for t in all_underscore_replacements(table) if try_name(t)]
            verify(len(rtn[table]) <= 1,
                   "Duplicate tables found for table %s in SQLite file %s" % (table, db_file_path))
            if rtn[table]:
                rtn[table] = rtn[table][0]
            else:
                rtn.pop(table)
    return rtn
def create_pan_dat(self, dir_path, fill_missing_fields=False, **kwargs):
    """
    Create a PanDat object from a directory of csv files.
    :param dir_path: the directory containing the .csv files.
    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in with their
                                default value. Otherwise, missing fields throw an Exception.
    :param kwargs: additional named arguments to pass to pandas.read_csv
    :return: a PanDat object populated by the matching tables.
    caveats: Missing tables always throw an Exception.
             Table names are matched with case-space insensitivity, but spaces are respected
             for field names. (ticdat supports whitespace in field names but not table names).
    """
    verify(os.path.isdir(dir_path), "%s not a directory path" % dir_path)
    tbl_names = self._get_table_names(dir_path)
    rtn = {t: pd.read_csv(f, **kwargs) for t, f in tbl_names.items()}
    missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                      if f not in rtn[t].columns}
    if fill_missing_fields:
        for t, f in missing_fields:
            rtn[t][f] = self.pan_dat_factory.default_values[t][f]
    verify(fill_missing_fields or not missing_fields,
           "The following (table, file_name, field) triplets are missing fields.\n%s" %
           [(t, os.path.basename(tbl_names[t]), f) for t, f in missing_fields])
    rtn = self.pan_dat_factory.PanDat(**rtn)
    msg = []
    assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
    return rtn
def to_number(st, pos):
    try:
        return float(st)
    except ValueError:
        verify(False,
               "Badly formatted string - Field '%s' is not a valid number. Character position [%s]." % (st, pos))