示例#1
0
文件: xls.py 项目: adampkehoe/ticdat
 def _read_cell(x, field):
     """
     Translate one raw spreadsheet cell into the value stored for ``field``.

     :param x: a row of raw cell values; the cell is located through
               ``field_indicies[field]``

     :param field: name of the field whose cell is being read

     :return: None for an empty cell when None is acceptable, a float for the
              "inf"/"-inf" strings (if treat_inf_as_infinity), a datetime-like
              object for numeric cells of datetime fields, otherwise whatever
              ``_general_read_cell`` produces for the (possibly int-converted) cell.
     """
     dv, dt = self._get_dv_dt(table, field)
     cell = x[field_indicies[field]]
     if cell == "":
         # an empty cell reads as None when the data type is nullable, or when
         # there is no data type and the default value is None
         if (dt and dt.nullable) or (not dt and dv is None):
             return None
     if treat_inf_as_infinity and utils.stringish(cell):
         lowered = cell.lower()
         if lowered in ("inf", "-inf"):
             return float(lowered)
     if utils.numericish(cell) and utils.safe_apply(int)(cell) == cell:
         if dt and dt.must_be_int:
             cell = int(cell)
     if cell == "":
         # None as infinity flagging
         fallback = self.tic_dat_factory._general_read_cell(table, field, None)
         if utils.numericish(fallback):
             return fallback
     if utils.numericish(cell) and dt and dt.datetime:
         date_parts = utils.safe_apply(
             lambda: xlrd.xldate_as_tuple(cell, datemode))()
         if date_parts is None:
             # NOTE(review): a failed date conversion forwards None rather than
             # the original number - confirm this is intended
             cell = None
         else:
             year, month, day, hour, minute, second = date_parts
             make_dt = utils.pd.Timestamp if utils.pd else datetime.datetime
             return make_dt(year=year, month=month, day=day,
                            hour=hour, minute=minute, second=second)
     return self.tic_dat_factory._general_read_cell(table, field, cell)
示例#2
0
文件: xls.py 项目: nandi6uc/ticdat
 def _read_cell(x, field):
     """
     Clean up a single raw spreadsheet cell for ``field`` of the current table.

     :param x: a row of raw cell values; ``field_indicies[field]`` gives the position

     :param field: name of the field being read

     :return: the cell value after the empty string, infinity, must_be_int and
              datetime adjustments have been applied.
     """
     factory = self.tic_dat_factory
     # reminder - data fields have a default default of zero, primary keys don't get a default default
     table_defaults = factory.default_values.get(table, {})
     dv = table_defaults.get(field, ["LIST", "NOT", "POSSIBLE"])
     dt = factory.data_types.get(table, {}).get(field)
     value = x[field_indicies[field]]
     if value == "" and ((dt and dt.nullable) or (not dt and dv is None)):
         return None
     if treat_inf_as_infinity and utils.stringish(value):
         as_lower = value.lower()
         if as_lower in ("inf", "-inf"):
             return float(as_lower)
     whole_number = utils.numericish(value) and utils.safe_apply(int)(value) == value
     if whole_number and dt and dt.must_be_int:
         value = int(value)
     if value == "":
         # None as infinity flagging
         flagged = self.tic_dat_factory._general_read_cell(table, field, None)
         if utils.numericish(flagged):
             return flagged
     if utils.numericish(value) and dt and dt.datetime:
         as_tuple = utils.safe_apply(
             lambda: xlrd.xldate_as_tuple(value, datemode))()
         if as_tuple is not None:
             builder = datetime.datetime
             if utils.pd:
                 builder = utils.pd.Timestamp
             year, month, day, hour, minute, second = as_tuple
             return builder(year=year, month=month, day=day,
                            hour=hour, minute=minute, second=second)
         # NOTE(review): the conversion failed, so None (not the original number)
         # is what gets forwarded below - confirm this is intended
         value = None
     return self.tic_dat_factory._general_read_cell(table, field, value)
示例#3
0
 def __init__(self, **init_tables):
     """
     Populate one DataFrame attribute per table.

     :param init_tables: maps table names to a DataFrame, or to an argument
                         (or string keyed dict of keyword arguments) from which
                         a DataFrame can be constructed

     Tables not passed in are created empty. Every table must end up with all
     of its fields as columns, and those columns are moved to the front in
     field order.
     """
     superself._trigger_has_been_used()
     for name, raw in init_tables.items():
         verify(name in superself.all_tables, "Unexpected table name %s"%name)
         frame = safe_apply(DataFrame)(raw)
         if frame is None and dictish(raw) and all(map(stringish, raw)):
             # fall back to treating the argument as DataFrame keyword arguments
             frame = safe_apply(DataFrame)(**raw)
         verify(isinstance(frame, DataFrame),
                "Failed to provide a valid DataFrame or DataFrame construction argument for %s"%name)
         setattr(self, name, frame.copy())
         stored = getattr(self, name)
         positional = list(stored.columns) == list(range(len(stored.columns)))
         if positional and len(stored.columns) >= len(superself._all_fields(name)):
             # columns are just 0..n-1, so label them with the field names
             stored.rename(columns=dict(zip(stored.columns, superself._all_fields(name))),
                           inplace=True)
     for name in set(superself.all_tables).difference(init_tables):
         empty_columns = {fld: [] for fld in utils.all_fields(superself, name)}
         setattr(self, name, DataFrame(empty_columns))
     missing_fields = set()
     for name in superself.all_tables:
         for fld in superself._all_fields(name):
             if fld not in getattr(self, name).columns:
                 missing_fields.add((name, fld))
     verify(not missing_fields,
            "The following are (table, field) pairs missing from the data.\n%s"%missing_fields)
     for name in superself.all_tables:
         wanted = list(superself._all_fields(name))
         frame = getattr(self, name)
         current = list(frame.columns)
         if current[:len(wanted)] != wanted:
             # schema fields first (in schema order), then any extra columns
             extras = [col for col in current if col not in wanted]
             setattr(self, name, frame[wanted + extras])
             assert list(getattr(self, name)) == wanted + extras
示例#4
0
文件: xls.py 项目: austin-bren/ticdat
 def xldate_as_tuple_munge(self, x): # only needed for xlrd
     """
     Convert an Excel date number into a datetime-like object.

     :param x: the value handed to xlrd.xldate_as_tuple (along with self._datemode)

     :return: a pandas Timestamp if utils.pd is truthy, otherwise a
              datetime.datetime. None if the xlrd conversion fails.
     """
     parts = utils.safe_apply(lambda: xlrd.xldate_as_tuple(x, self._datemode))()
     if parts is None:
         return None
     year, month, day, hour, minute, second = parts
     maker = utils.pd.Timestamp if utils.pd else datetime.datetime
     return maker(year=year, month=month, day=day, hour=hour, minute=minute, second=second)
示例#5
0
 def _convert_float(x, field):
     """
     Read the cell for ``field`` out of row ``x``, converting whole numbers to
     int when the field's data type has must_be_int set.

     :param x: a row of raw cell values; ``field_indicies[field]`` gives the position

     :param field: name of the field being read

     :return: int(cell) for whole-number cells of must_be_int fields,
              otherwise the cell unchanged.
     """
     cell = x[field_indicies[field]]
     if not (utils.numericish(cell) and utils.safe_apply(int)(cell) == cell):
         return cell
     if table not in data_types or field not in data_types[table]:
         return cell
     return int(cell) if data_types[table][field].must_be_int else cell
示例#6
0
 def data_type(t, f):
     """
     Choose the data type string for field ``f`` of table ``t`` from its default value.

     :return: "INT" for a whole-number default, "FLOAT" for any other numeric
              default, and the empty string otherwise.
     """
     def_ = default_(t, f)
     if not numericish(def_):
         # the TEXT data type doesn't seem to have much value for my purposes.
         return ""
     return "INT" if safe_apply(int)(def_) == def_ else "FLOAT"
示例#7
0
 def data_type(t, f):
     """
     Choose the data type string for field ``f`` of table ``t`` from its default value.

     :return: "INT" for a whole-number default, "FLOAT" for any other numeric
              default, and the empty string otherwise. The "parameters" table
              always gets the empty string when the factory has parameters.
     """
     # the TEXT data type doesn't seem to have much value for my purposes.
     untyped = ""
     if t == "parameters" and self.tic_dat_factory.parameters:
         return untyped
     def_ = default_(t, f)
     if not numericish(def_):
         return untyped
     is_whole = safe_apply(int)(def_) == def_
     return "INT" if is_whole else "FLOAT"
示例#8
0
    def test_datetime(self):
        """
        Write datetime data (table fields and parameters) with each of the csv,
        xls, sql and json writers, read it back, and check the datetime and
        null values that come out.
        """
        core_path = os.path.join(_scratchDir, "parameters")
        pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                            parameters=[["a"], ["b"]])
        pdf.add_parameter("p1", "Dec 15 1970", datetime=True)
        pdf.add_parameter("p2", None, datetime=True, nullable=True)
        pdf.set_data_type("table_with_stuffs", "field one", datetime=True)
        pdf.set_data_type("table_with_stuffs", "field two", datetime=True, nullable=True)

        stuff_rows = [[dateutil.parser.parse("July 11 1972"), None],
                      [datetime.datetime.now(), dateutil.parser.parse("Sept 11 2011")]]
        parameter_rows = [["p1", "7/11/1911"], ["p2", None]]
        tic_dat = TicDatFactory(**pdf.schema()).TicDat(table_with_stuffs=stuff_rows,
                                                       parameters=parameter_rows)
        dat = TicDatFactory(**pdf.schema()).copy_to_pandas(tic_dat, drop_pk_columns=False)
        self.assertFalse(pdf.find_data_type_failures(dat) or pdf.find_data_row_failures(dat))

        datetime_types = (datetime.datetime, numpy.datetime64)
        targets = {"csv": core_path + "_csv",
                   "xls": core_path + ".xlsx",
                   "sql": core_path + ".db",
                   "json": core_path + ".json"}
        for attr, path in targets.items():
            # csv is the only format written to a directory
            writer_name = "write_file" if attr != "csv" else "write_directory"
            getattr(getattr(pdf, attr), writer_name)(dat, path)
            dat_1 = getattr(pdf, attr).create_pan_dat(path)
            # NOTE(review): the re-read data is asserted to differ from what was
            # written - presumably the parameter strings come back as datetimes; confirm
            self.assertFalse(pdf._same_data(dat, dat_1))
            self.assertFalse(pdf.find_data_type_failures(dat_1) or pdf.find_data_row_failures(dat_1))
            dat_1 = pdf.copy_to_tic_dat(dat_1)
            self.assertTrue(set(dat_1.parameters) == {'p1', 'p2'})

            p1_value = dat_1.parameters["p1"]["b"]
            self.assertTrue(isinstance(p1_value, datetime_types) and not pd.isnull(p1_value))
            self.assertTrue(pd.isnull(dat_1.parameters["p2"]["b"]))

            # primary keys must all be real (non null) datetimes
            self.assertTrue(all(isinstance(pk, datetime_types) and not pd.isnull(pk)
                                for pk in dat_1.table_with_stuffs))
            # data cells are datetimes or some flavor of null, with both kinds present
            cells = [cell for data_row in dat_1.table_with_stuffs.values()
                     for cell in data_row.values()]
            self.assertTrue(all(isinstance(cell, datetime_types) or cell is None
                                or utils.safe_apply(math.isnan)(cell)
                                for cell in cells))
            self.assertTrue({pd.isnull(cell) for cell in cells} == {True, False})
示例#9
0
 def setUp(self):
     """
     Start a temporary PostgreSQL instance with an engine connected to it, and
     drop any of the test schemas that already exist.

     If anything goes wrong while starting up, self.postgresql and self.engine
     are set to None and the exception is kept in self.engine_fail.
     """
     try:
         self.postgresql = testing_postgresql.Postgresql()
         self.engine = sa.create_engine(self.postgresql.url())
     except Exception as e:
         self.postgresql = self.engine = None
         self.engine_fail = e
     else:
         self.engine_fail = None
     if self.engine_fail:
         print(f"!!!!Engine failed to load due to {self.engine_fail}")
     if not self.engine:
         return
     for schema_name in test_schemas:
         schema_found = utils.safe_apply(
             lambda: schema_name in sa.inspect(self.engine).get_schema_names())()
         if schema_found:
             self.engine.execute(sa.schema.DropSchema(schema_name, cascade=True))
示例#10
0
文件: xls.py 项目: austin-bren/ticdat
 def _read_cell(x, field):
     """
     Translate one raw spreadsheet cell into the value stored for ``field``.

     :param x: a row of raw cell values; the cell is located through
               ``field_indicies[field]``

     :param field: name of the field whose cell is being read

     :return: the cell value after the empty string, infinity, must_be_int and
              datetime adjustments have been applied.
     """
     dv, dt = self._get_dv_dt(table, field)
     cell = x[field_indicies[field]]
     if cell == "":
         # an empty cell reads as None when the data type is nullable, or when
         # there is no data type and the default value is None
         if (dt and dt.nullable) or (not dt and dv is None):
             return None
     if treat_inf_as_infinity and utils.stringish(cell):
         lowered = cell.lower()
         if lowered in ("inf", "-inf"):
             return float(lowered)
     if utils.numericish(cell) and utils.safe_apply(int)(cell) == cell:
         if dt and dt.must_be_int:
             cell = int(cell)
     if cell == "":
         # None as infinity flagging
         fallback = self.tic_dat_factory._general_read_cell(table, field, None)
         if utils.numericish(fallback):
             return fallback
     needs_date = utils.numericish(cell) and dt and dt.datetime
     if needs_date and hasattr(sheet, "xldate_as_tuple_munge"):
         # only sheets that offer the munge routine need the conversion
         cell = sheet.xldate_as_tuple_munge(cell)
     return self.tic_dat_factory._general_read_cell(table, field, cell)
示例#11
0
 def _write_data_cell(self, t, f, x):
     """
     Prepare the value ``x`` of field ``f`` in table ``t`` for writing.

     :return: the result of the infinity flag adjustment; numeric results are
              returned as int when they are whole numbers and as float otherwise.
     """
     cell = self.tdf._infinity_flag_write_cell(t, f, x)
     if not numericish(cell):
         return cell
     if safe_apply(int)(cell) != cell:
         return float(cell)
     return int(cell)
示例#12
0
文件: xls.py 项目: austin-bren/ticdat
    def create_tic_dat(self, xls_file_path, row_offsets=None, headers_present = True,
                       treat_inf_as_infinity = True,
                       freeze_it = False):
        """
        Read an Excel file into a TicDat object.

        :param xls_file_path: Path to an Excel file. Sheet names are expected to
                              match the table names of the schema.

        :param row_offsets: (optional) Maps table names to the number of
                            leading rows to skip.

        :param headers_present: Boolean. True if the first row of data holds
                                the column headers.

        :param treat_inf_as_infinity: Boolean. If True, the string "inf" (case
                                      insensitive) is read as infinity, and
                                      "-inf" as negative infinity.

        :param freeze_it: Boolean. If True, the returned object is frozen.

        :return: a TicDat object populated from the matching sheets.

        caveats: A missing sheet yields an empty table, whereas a matching
                 sheet with a missing field throws an Exception.
                 Sheet names are matched to table names case insensitively,
                 with white space replaced by underscore.
                 Field names are matched case insensitively, but white space
                 is respected (ticdat supports whitespace in field names but
                 not table names).
                 The next two caveats apply only if data_types are used.
                 --> For a field where the empty string is invalid data and
                     None is valid data, the empty string is replaced with None.
                 --> For a field with must_be_int true, numeric data that
                     satisfies int(x)==x is replaced with int(x). That is,
                     setting must_be_int in data_types is the ticdat
                     equivalent of pandas.read_excel convert_float.
        """
        self._verify_differentiable_sheet_names()
        verify(utils.safe_apply(os.path.isfile)(xls_file_path), f"{xls_file_path} not a file path")
        is_legacy_xls = xls_file_path.endswith(".xls")
        verify(is_legacy_xls or _is_openpyxl_ext(xls_file_path),
               f"invalid file extension for {xls_file_path}")
        # .xls files are read with xlrd, everything else with openpyxl
        if is_legacy_xls:
            verify(xlrd, "xlrd needs to be installed to use this subroutine")
        else:
            verify(openpyxl, "openpyxl needs to be installed to use this subroutine")
        tdf = self.tic_dat_factory
        verify(not(treat_inf_as_infinity and tdf.generator_tables),
               "treat_inf_as_infinity not implemented for generator tables")
        verify(headers_present or not tdf.generic_tables,
               "headers need to be present to read generic tables")
        verify(utils.DataFrame or not tdf.generic_tables,
               "Strange absence of pandas despite presence of generic tables")
        if tdf.generic_tables:
            verify(headers_present and treat_inf_as_infinity and not row_offsets,
                   "headers_present, treat_inf_as_infinity and row_offsets must all be at default values\n" +
                   "to use generic tables")
        rtn = self._create_tic_dat_dict(xls_file_path, row_offsets or {}, headers_present, treat_inf_as_infinity)
        if tdf.generic_tables:
            if is_legacy_xls:
                print("** Warning : pandas doesn't always play well with older Excel formats.")
            # generic tables are read through a PanDatFactory and spliced into the result
            pdf = PanDatFactory(**dict.fromkeys(tdf.generic_tables, '*'))
            pandat = pdf.xls.create_pan_dat(xls_file_path)
            for generic_table in tdf.generic_tables:
                rtn[generic_table] = getattr(pandat, generic_table)
        tic_dat = tdf._parameter_table_post_read_adjustment(tdf.TicDat(**rtn))
        return tdf.freeze_me(tic_dat) if freeze_it else tic_dat
示例#13
0
 def bad_row(row):
     """
     Check whether the ``field`` entry of ``row`` fails the data type.

     :return: True if data_type.valid_data rejects the entry, with nan
              entries checked as None.
     """
     data = row[field]
     # pandas turns None into nan
     if safe_apply(isnan)(data):
         data = None
     return not data_type.valid_data(data)