Exemplo n.º 1
0
 def __init__(self, **init_tables):
     """
     Build one DataFrame attribute on this object for every table in superself.all_tables.

     :param init_tables: keyword arguments keyed by table name. Each value is either
                         something DataFrame accepts as its first positional argument, or
                         (as a fallback) a dict with string keys that is unpacked as
                         keyword arguments to DataFrame.

     Tables that are not passed in are created as empty DataFrames with one column per
     schema field. Every check below goes through verify (defined elsewhere; presumably
     it raises when its first argument is falsey).
     """
     superself._trigger_has_been_used()
     for name, raw in init_tables.items():
         verify(name in superself.all_tables, "Unexpected table name %s" % name)
         frame = safe_apply(DataFrame)(raw)
         if frame is None:
             # Second chance: a dict with only string keys is tried as keyword arguments.
             if dictish(raw) and all(map(stringish, raw)):
                 frame = safe_apply(DataFrame)(**raw)
         verify(isinstance(frame, DataFrame),
                "Failed to provide a valid DataFrame or DataFrame construction argument for %s" % name)
         # Store a copy so later in-place edits never touch the caller's object.
         setattr(self, name, frame.copy())
         stored = getattr(self, name)
         positional_labels = list(stored.columns) == list(range(len(stored.columns)))
         if positional_labels and len(stored.columns) >= len(superself._all_fields(name)):
             # Columns labelled 0..n-1 are renamed, left to right, to the schema field names.
             stored.rename(columns=dict(zip(stored.columns, superself._all_fields(name))),
                           inplace=True)
     for name in set(superself.all_tables).difference(init_tables):
         empty_columns = {fld: [] for fld in utils.all_fields(superself, name)}
         setattr(self, name, DataFrame(empty_columns))
     missing_fields = set()
     for name in superself.all_tables:
         for fld in superself._all_fields(name):
             if fld not in getattr(self, name).columns:
                 missing_fields.add((name, fld))
     verify(not missing_fields,
            "The following are (table, field) pairs missing from the data.\n%s" % missing_fields)
     for name in superself.all_tables:
         wanted = list(superself._all_fields(name))
         current = getattr(self, name)
         if list(current.columns)[:len(wanted)] == wanted:
             continue
         # Put the schema fields first, in schema order, followed by any extra columns.
         extras = [col for col in list(current.columns) if col not in wanted]
         setattr(self, name, current[wanted + extras])
         assert list(getattr(self, name)) == wanted + extras
Exemplo n.º 2
0
    def create_pan_dat(self, dir_path, fill_missing_fields=False, **kwargs):
        """
        Create a PanDat object from a directory of csv files.

        :param dir_path: the directory containing the .csv files.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    fail the verify check.

        :param kwargs: additional named arguments to pass to pandas.read_csv. If "dtype"
                       is not among them, it is taken from the PanDatFactory.

        :return: the result of _clean_pandat_creator applied to the tables that were read.

        caveats: Schema tables with no matching file are reported with a printed message;
                 this method does not raise for them.
                 Which file matches which table is decided by self._get_table_names
                 (not shown here). Field names must match column names exactly.

        Note that if you save a DataFrame to csv and then recover it, the type of data might change. For example

            df = pd.DataFrame({"a":["100", "200", "300"]})
            df.to_csv("something.csv")
            df2 = pd.read_csv("something.csv")

        results in a numeric column in df2. To address this, you need to either use set_data_type for your
        PanDatFactory, or specify "dtype" in kwargs. (The former is obviously better).

        This problem is even worse with df = pd.DataFrame({"a":["0100", "1200", "2300"]})
        """
        verify(os.path.isdir(dir_path), "%s not a directory path" % dir_path)
        tbl_names = self._get_table_names(dir_path)
        rtn = {}
        for t, csv_path in tbl_names.items():
            read_args = dict(kwargs)
            # A caller-supplied dtype wins; otherwise use the factory's dtypes for this table.
            if "dtype" not in read_args:
                read_args["dtype"] = self.pan_dat_factory._dtypes_for_pandas_read(t)
            rtn[t] = pd.read_csv(csv_path, **read_args)
        missing_tables = {t for t in self.pan_dat_factory.all_tables if t not in rtn}
        if missing_tables:
            print("The following table names could not be found in the %s directory.\n%s\n"
                  % (dir_path, "\n".join(missing_tables)))
        missing_fields = set()
        for t, frame in rtn.items():
            for fld in all_fields(self.pan_dat_factory, t):
                if fld not in frame.columns:
                    missing_fields.add((t, fld))
        if fill_missing_fields:
            for t, fld in missing_fields:
                rtn[t][fld] = self.pan_dat_factory.default_values[t][fld]
        triplets = [(t, os.path.basename(tbl_names[t]), fld) for t, fld in missing_fields]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, file_name, field) triplets are missing fields.\n%s" % triplets)
        return _clean_pandat_creator(self.pan_dat_factory, rtn)
Exemplo n.º 3
0
    def create_pan_dat(self, dir_path, fill_missing_fields=False, **kwargs):
        """
        Create a PanDat object from a directory of csv files.

        :param dir_path: the directory containing the .csv files.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    fail the verify check.

        :param kwargs: additional named arguments to pass to pandas.read_csv

        :return: a PanDat object built from the tables that were read.

        caveats: Which file matches which table is decided by self._get_table_names
                 (not shown here). Field names must match column names exactly.
                 NOTE(review): how tables with no matching file are treated depends on
                 the PanDat constructor, which this method does not show.
        """
        verify(os.path.isdir(dir_path), "%s not a directory path" % dir_path)
        tbl_names = self._get_table_names(dir_path)
        rtn = {}
        for t, csv_path in tbl_names.items():
            rtn[t] = pd.read_csv(csv_path, **kwargs)
        missing_fields = set()
        for t, frame in rtn.items():
            for fld in all_fields(self.pan_dat_factory, t):
                if fld not in frame.columns:
                    missing_fields.add((t, fld))
        if fill_missing_fields:
            for t, fld in missing_fields:
                rtn[t][fld] = self.pan_dat_factory.default_values[t][fld]
        triplets = [(t, os.path.basename(tbl_names[t]), fld) for t, fld in missing_fields]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, file_name, field) triplets are missing fields.\n%s" % triplets)
        rtn = self.pan_dat_factory.PanDat(**rtn)
        # Sanity check on the finished object; any complaints are collected for the message.
        msg = []
        assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
        return rtn
Exemplo n.º 4
0
    def create_pan_dat(self,
                       db_file_path,
                       con=None,
                       fill_missing_fields=False):
        """
        Create a PanDat object from a SQLite database file or an open connection.

        :param db_file_path: A SQLite DB File. Set to falsey if using con argument

        :param con: A connection object that can be passed to pandas read_sql.
                    Set to falsey if using db_file_path argument.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    fail the verify check.

        :return: the result of _clean_pandat_creator applied to the tables that were read.

        caveats: Exactly one of db_file_path and con must be truthy.
                 Schema tables not found in the database are reported with a printed
                 message; this method does not raise for them.
                 Which database table matches which schema table is decided by
                 self._get_table_names (not shown here). Field names must match column
                 names exactly.
        """
        verify(bool(db_file_path) != bool(con),
               "use either the con argument or the db_file_path argument but not both")
        if db_file_path:
            is_plain_file = os.path.exists(db_file_path) and not os.path.isdir(db_file_path)
            verify(is_plain_file, "%s not a file path" % db_file_path)
        rtn = {}
        # Open our own connection for a file path; otherwise wrap the caller's connection.
        with (_sql_con(db_file_path) if db_file_path else _DummyContextManager(con)) as opened:
            con_ = con or opened
            for t, sql_name in self._get_table_names(con_).items():
                rtn[t] = pd.read_sql(sql="Select * from [%s]" % sql_name, con=con_)
        missing_fields = set()
        for t, frame in rtn.items():
            for fld in all_fields(self.pan_dat_factory, t):
                if fld not in frame.columns:
                    missing_fields.add((t, fld))
        if fill_missing_fields:
            for t, fld in missing_fields:
                rtn[t][fld] = self.pan_dat_factory.default_values[t][fld]
        verify(fill_missing_fields or not missing_fields,
               "The following are (table, field) pairs missing from the %s file.\n%s"
               % (db_file_path, missing_fields))
        missing_tables = sorted(set(self.pan_dat_factory.all_tables).difference(rtn))
        if missing_tables:
            print("The following table names could not be found in the SQLite database.\n%s\n"
                  % "\n".join(missing_tables))
        return _clean_pandat_creator(self.pan_dat_factory, rtn)
Exemplo n.º 5
0
    def create_pan_dat(self, xls_file_path, fill_missing_fields=False):
        """
        Create a PanDat object from an Excel file

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    fail the verify check.

        :return: the result of _clean_pandat_creator applied to the sheets that were read.

        caveats: Schema tables with no matching sheet are reported with a printed message;
                 this method does not raise for them.
                 Which sheet matches which table is decided by self._get_sheet_names
                 (not shown here). Field names must match column names exactly.

        Note that if you save a DataFrame to excel and then recover it, the type of data might change. For example

            df = pd.DataFrame({"a":["100", "200", "300"]})
            df.to_excel("something.xlsx")
            df2 = pd.read_excel("something.xlsx")

        results in a numeric column in df2. To address this, use set_data_type for your
        PanDatFactory; the dtypes it yields are passed to pandas.read_excel here.

        This problem is even worse with df = pd.DataFrame({"a":["0100", "1200", "2300"]})
        """
        rtn = {}
        for t, sheet in self._get_sheet_names(xls_file_path).items():
            table_dtypes = self.pan_dat_factory._dtypes_for_pandas_read(t)
            rtn[t] = pd.read_excel(xls_file_path, sheet, dtype=table_dtypes)
        missing_tables = {t for t in self.pan_dat_factory.all_tables if t not in rtn}
        if missing_tables:
            print("The following table names could not be found in the %s file.\n%s\n"
                  % (xls_file_path, "\n".join(missing_tables)))
        missing_fields = set()
        for t, frame in rtn.items():
            for fld in all_fields(self.pan_dat_factory, t):
                if fld not in frame.columns:
                    missing_fields.add((t, fld))
        if fill_missing_fields:
            for t, fld in missing_fields:
                rtn[t][fld] = self.pan_dat_factory.default_values[t][fld]
        verify(fill_missing_fields or not missing_fields,
               "The following are (table, field) pairs missing from the %s file.\n%s"
               % (xls_file_path, missing_fields))
        return _clean_pandat_creator(self.pan_dat_factory, rtn, print_missing_tables=True)
Exemplo n.º 6
0
    def create_pan_dat(self, path_or_buf, fill_missing_fields=False, orient='split', **kwargs):
        """
        Create a PanDat object from a JSON file or a JSON string

        :param path_or_buf: a string holding either the path to a JSON file or the JSON
                            text itself. Anything that is not a string is rejected.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    fail the verify check.

        :param orient: Indication of expected JSON string format. See pandas.read_json for more details.

        :param kwargs: additional named arguments to pass to pandas.read_json

        :return: a PanDat object built from the tables that were read.

        caveats: The JSON must decode to a dictionary whose values are themselves dictionaries.
                 Which JSON key matches which table is decided by self._get_table_names
                 (not shown here). Field names must match column names exactly.
                 "inf" and "-inf" strings will be converted to +-float("inf")
                 NOTE(review): how tables absent from the JSON are treated depends on
                 the PanDat constructor, which this method does not show.
        """
        # Check the type before touching the file system: os.path.exists raises TypeError
        # for arbitrary objects, which would hide the intended "isn't a string" message.
        if stringish(path_or_buf) and os.path.exists(path_or_buf):
            verify(os.path.isfile(path_or_buf), "%s appears to be a directory and not a file." % path_or_buf)
            with open(path_or_buf, "r") as f:
                loaded_dict = json.load(f)
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            loaded_dict = json.loads(path_or_buf)
        verify(dictish(loaded_dict), "the json.load result doesn't resolve to a dictionary")
        verify(all(map(dictish, loaded_dict.values())),
               "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries")

        tbl_names = self._get_table_names(loaded_dict)
        verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
        # Each table is re-serialized so pandas.read_json can parse it on its own.
        rtn = {t: pd.read_json(json.dumps(loaded_dict[f]), orient=orient, **kwargs) for t,f in tbl_names.items()}
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t,f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, field) pairs are missing fields.\n%s" % [(t, f) for t,f in missing_fields])
        # Turn the textual infinity markers back into float infinities.
        for v in rtn.values():
            v.replace("inf", float("inf"), inplace=True)
            v.replace("-inf", -float("inf"), inplace=True)
        rtn = self.pan_dat_factory.PanDat(**rtn)
        # Sanity check on the finished object; any complaints are collected for the message.
        msg = []
        assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
        return rtn
Exemplo n.º 7
0
    def create_pan_dat(self, db_file_path, con=None, fill_missing_fields=False):
        """
        Create a PanDat object from a SQLite database file or an open connection.

        :param db_file_path: A SQLite DB File. Set to falsey if using con argument

        :param con: sqlalchemy.engine.Engine or sqlite3.Connection.
                    Set to falsey if using db_file_path argument.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    fail the verify check.

        :return: a PanDat object built from the tables that were read.

        caveats: Exactly one of db_file_path and con must be truthy.
                 Which database table matches which schema table is decided by
                 self._get_table_names (not shown here). Field names must match column
                 names exactly.
                 NOTE(review): how tables absent from the database are treated depends
                 on the PanDat constructor, which this method does not show.
        """
        verify(bool(db_file_path) != bool(con),
               "use either the con argument or the db_file_path argument but not both")
        if db_file_path:
            is_plain_file = os.path.exists(db_file_path) and not os.path.isdir(db_file_path)
            verify(is_plain_file, "%s not a file path" % db_file_path)
        rtn = {}
        # Open our own connection for a file path; otherwise wrap the caller's connection.
        with (_sql_con(db_file_path) if db_file_path else _DummyContextManager(con)) as opened:
            con_ = con or opened
            for t, sql_name in self._get_table_names(con_).items():
                rtn[t] = pd.read_sql(sql="Select * from [%s]" % sql_name, con=con_)
        missing_fields = set()
        for t, frame in rtn.items():
            for fld in all_fields(self.pan_dat_factory, t):
                if fld not in frame.columns:
                    missing_fields.add((t, fld))
        if fill_missing_fields:
            for t, fld in missing_fields:
                rtn[t][fld] = self.pan_dat_factory.default_values[t][fld]
        verify(fill_missing_fields or not missing_fields,
               "The following are (table, field) pairs missing from the %s file.\n%s"
               % (db_file_path, missing_fields))
        rtn = self.pan_dat_factory.PanDat(**rtn)
        # Sanity check on the finished object; any complaints are collected for the message.
        msg = []
        assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
        return rtn
Exemplo n.º 8
0
    def create_pan_dat(self, xls_file_path, fill_missing_fields=False):
        """
        Create a PanDat object from an Excel file

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    fail the verify check.

        :return: a PanDat object built from the sheets that were read.

        caveats: Schema tables with no matching sheet are reported with a printed message;
                 this method does not raise for them. Missing fields on matching sheets
                 fail the verify check (unless fill_missing_fields is truthy).
                 Which sheet matches which table is decided by self._get_sheet_names
                 (not shown here). Field names must match column names exactly.
        """
        rtn = {}
        for t, sheet in self._get_sheet_names(xls_file_path).items():
            rtn[t] = pd.read_excel(xls_file_path, sheet)
        missing_tables = {t for t in self.pan_dat_factory.all_tables if t not in rtn}
        if missing_tables:
            print("The following table names could not be found in the %s file.\n%s\n"
                  % (xls_file_path, "\n".join(missing_tables)))
        missing_fields = set()
        for t, frame in rtn.items():
            for fld in all_fields(self.pan_dat_factory, t):
                if fld not in frame.columns:
                    missing_fields.add((t, fld))
        if fill_missing_fields:
            for t, fld in missing_fields:
                rtn[t][fld] = self.pan_dat_factory.default_values[t][fld]
        verify(fill_missing_fields or not missing_fields,
               "The following are (table, field) pairs missing from the %s file.\n%s"
               % (xls_file_path, missing_fields))
        rtn = self.pan_dat_factory.PanDat(**rtn)
        # Sanity check on the finished object; any complaints are collected for the message.
        msg = []
        assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
        return rtn
Exemplo n.º 9
0
    def create_pan_dat(self,
                       path_or_buf,
                       fill_missing_fields=False,
                       orient='split',
                       **kwargs):
        """
        Create a PanDat object from a JSON file or string

        :param path_or_buf: a string holding either the path to a JSON file or the JSON
                            text itself. Anything that is not a string is rejected.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    fail the verify check.

        :param orient: Indication of expected JSON string format. See pandas.read_json for more details.

        :param kwargs: additional named arguments to pass to pandas.read_json. If "dtype"
                       is not among them, it is taken from the PanDatFactory.

        :return: the result of _clean_pandat_creator applied to the tables that were read.

        caveats: The JSON must decode to a dictionary whose values are themselves dictionaries.

                 Schema tables absent from the JSON are reported with a printed message;
                 this method does not raise for them.

                 Which JSON key matches which table is decided by self._get_table_names
                 (not shown here). Field names must match column names exactly.

        Note that if you save a DataFrame to json and then recover it, the type of data might change.
        Specifically, text that looks numeric might be recovered as a number, to include the loss of leading zeros.
        To address this, you need to either use set_data_type for your
        PanDatFactory, or specify "dtype" in kwargs. (The former is obviously better).
        """
        if stringish(path_or_buf) and os.path.exists(path_or_buf):
            verify(
                os.path.isfile(path_or_buf),
                "%s appears to be a directory and not a file." % path_or_buf)
            with open(path_or_buf, "r") as f:
                loaded_dict = json.load(f)
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            loaded_dict = json.loads(path_or_buf)
        verify(dictish(loaded_dict),
               "the json.load result doesn't resolve to a dictionary")
        verify(
            all(map(dictish, loaded_dict.values())),
            "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries"
        )

        tbl_names = self._get_table_names(loaded_dict)
        verify("orient" not in kwargs,
               "orient should be passed as a non-kwargs argument")
        rtn = {}
        for t, f in tbl_names.items():
            kwargs_ = dict(kwargs)
            # A caller-supplied dtype wins; otherwise use the factory's dtypes for this table.
            if "dtype" not in kwargs_:
                kwargs_[
                    "dtype"] = self.pan_dat_factory._dtypes_for_pandas_read(t)
            # Each table is re-serialized so pandas.read_json can parse it on its own.
            rtn[t] = pd.read_json(json.dumps(loaded_dict[f]),
                                  orient=orient,
                                  **kwargs_)
        missing_fields = {(t, f)
                          for t in rtn
                          for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t, f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(
            fill_missing_fields or not missing_fields,
            "The following (table, field) pairs are missing fields.\n%s" %
            [(t, f) for t, f in missing_fields])
        missing_tables = sorted(
            set(self.pan_dat_factory.all_tables).difference(rtn))
        if missing_tables:
            # The source here is JSON; the message previously named SQLite by mistake.
            print(
                "The following table names could not be found in the json data.\n%s\n"
                % "\n".join(missing_tables))
        return _clean_pandat_creator(self.pan_dat_factory, rtn, json_read=True)