Example #1
    def pretty_print(
        self,
        fmt: Union[None, str, TableFormat] = None,
        *,
        to: Optional[PathLike] = None,
        mode: str = "w",
        **kwargs,
    ) -> str:
        """
        Outputs a pretty table using the `tabulate <https://pypi.org/project/tabulate/>`_ package.

        Args:
            fmt: A tabulate format; if None, chooses according to ``to``, falling back to ``"plain"``
            to: Write to this path (.gz, .zip, etc. is inferred)
            mode: Write mode: 'w', 'a', or 'x'
            kwargs: Passed to tabulate

        Returns:
            The formatted string
        """
        fmt = Utils.choose_table_format(path=to, fmt=fmt)
        s = self._tabulate(fmt, **kwargs)
        if to is not None:
            Utils.write(to, s, mode=mode)
        return s
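A minimal usage sketch (``MyDf`` and the path are hypothetical; any typeddfs DataFrame exposing ``pretty_print`` behaves the same way):

    # hypothetical typeddfs subclass instance
    df = MyDf({"name": ["a", "b"], "value": [1, 2]})
    s = df.pretty_print("github")        # returns the formatted string
    df.pretty_print(to="table.txt.gz")   # fmt chosen from the suffix; also writes the file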
Example #2
    def sort_natural(
        self, column: str, *, alg: Union[None, int, Set[str]] = None, reverse: bool = False
    ) -> __qualname__:
        """
        Calls ``natsorted`` on a single column.

        Args:
            column: The name of the (single) column to sort by
            alg: Input as the ``alg`` argument to ``natsorted``
                 If ``None``, the "best" algorithm is chosen from the dtype of ``column``
                 via :meth:`typeddfs.utils.Utils.guess_natsort_alg`.
                 Otherwise, :meth:`typeddfs.utils.Utils.exact_natsort_alg`
                 is called with ``Utils.exact_natsort_alg(alg)``.
            reverse: Reverse the sort order (e.g. 'z' before 'a')
        """
        df = self.vanilla_reset()
        if alg is None:
            _, alg = Utils.guess_natsort_alg(self[column].dtype)
        else:
            _, alg = Utils.exact_natsort_alg(alg)
        zzz = natsorted([s for s in df[column]], alg=alg, reverse=reverse)
        df["__sort"] = df[column].map(lambda s: zzz.index(s))
        df.__class__ = self.__class__
        df = df.sort_values("__sort").drop("__sort", axis=1)
        return self.__class__._change(df)
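For example (hypothetical data), natural sorting orders numbers embedded in strings numerically rather than lexicographically:

    df = MyDf({"id": ["x10", "x2", "x1"]})   # hypothetical typeddfs subclass
    df.sort_values("id")["id"].tolist()      # ['x1', 'x10', 'x2'] (lexicographic)
    df.sort_natural("id")["id"].tolist()     # ['x1', 'x2', 'x10'] (natural)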
Example #3
 def test_dots_and_dicts(self):
     dct = dict(abc=dict(xyz="123"), zzz=["456", "789"])
     dots = {"abc.xyz": "123", "zzz": ["456", "789"]}
     act_dots = Utils.dict_to_dots(dct)
     assert act_dots == dots
     act_dct = Utils.dots_to_dict(act_dots)
     assert act_dct == dct
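A minimal sketch of the flattening that ``dict_to_dots`` performs, consistent with the test above (a hypothetical reimplementation, not the library's actual code):

    def dict_to_dots_sketch(dct: dict, prefix: str = "") -> dict:
        # flatten nested dicts into dotted keys; lists and scalars stay as values
        out = {}
        for k, v in dct.items():
            key = f"{prefix}.{k}" if prefix else k
            if isinstance(v, dict):
                out.update(dict_to_dots_sketch(v, key))
            else:
                out[key] = v
        return out

    assert dict_to_dots_sketch({"abc": {"xyz": "123"}, "zzz": ["456", "789"]}) == {
        "abc.xyz": "123",
        "zzz": ["456", "789"],
    }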
Example #4
    def get_short_text(self, *, recommended_only: bool = False) -> str:
        """
        Returns a single-line text listing of allowed file formats.

        Args:
            recommended_only: Skip non-recommended file formats

        Returns:
            Something like::

                .csv, .tsv/.tab, or .flexwf [.gz/.xz/.zip/.bz2]; .feather, .pickle, or .snappy ...
        """
        fmts = [
            f for f in self if not recommended_only or f.fmt.is_recommended
        ]
        text_fmts = Utils.natsort(
            ["/".join(f.bare_suffixes) for f in fmts if f.fmt.is_text],
            dtype=str)
        bin_fmts = Utils.natsort(
            ["/".join(f.bare_suffixes) for f in fmts if f.fmt.is_binary],
            dtype=str)
        txt = ""
        if len(text_fmts) > 0:
            txt += (Utils.join_to_str(*text_fmts, last="or") + " [" + "/".join(
                [s.suffix for s in CompressionFormat.list_non_empty()]) + "]")
        if len(bin_fmts) > 0:
            txt += ("; " if len(text_fmts) > 0 else "") + Utils.join_to_str(
                *bin_fmts, last="or")
        return txt
Example #5
 def bare_suffixes(self) -> Sequence[str]:
     """
      Returns all suffixes, excluding compressed variants (e.g. ``.gz``), naturally sorted.
     """
     suffixes = {
         CompressionFormat.strip_suffix(s).name
         for s in self.fmt.suffixes
     }
     return Utils.natsort(suffixes, str)
Example #6
 def strip_control_chars(self) -> __qualname__:
     """
     Removes all control characters (Unicode group 'C') from all string-typed columns.
     """
     df = self.vanilla_reset()
     for c in df.columns:
         if Utils.is_string_dtype(df[c]):
             df[c] = df[c].map(Utils.strip_control_chars)
     return self.__class__._convert_typed(df)
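Unicode category "C" covers control, format, surrogate, private-use, and unassigned code points. A compatible sketch of what ``Utils.strip_control_chars`` presumably does per character (an assumption, consistent with the tests in Example #24):

    import unicodedata

    def strip_control_chars_sketch(s: str) -> str:
        # drop every character whose Unicode category starts with "C"
        return "".join(c for c in s if not unicodedata.category(c).startswith("C"))

    assert strip_control_chars_sketch("ab\ncd") == "abcd"
    assert strip_control_chars_sketch("\u202C") == ""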
Example #7
 def test_list(self):
     x: FrozeList = Utils.freeze([1, 2, 3])
     assert isinstance(x, FrozeList)
     assert x.to_list() == [1, 2, 3]
     assert str(x) == str(x.to_list())
     assert repr(x) == repr(x.to_list())
     y: FrozeList = Utils.freeze([1, 2, 1])
     assert x == x and y == y
     assert not x < x and not y < y
     assert x > y
     assert hash(x) == hash(x)
     assert hash(x) != hash(y)
     assert x.get(1) == 1
     assert x.get(5) is None
     assert x.get(5, 100) == 100
     assert x.req(1) == 1
     assert x.req(5, 100) == 100
     with pytest.raises(KeyError):
         x.req(5)
Example #8
    def _build(self) -> Type[BaseDf]:
        if self._secure and self._hash_alg in Utils.insecure_hash_functions():
            raise DfTypeConstructionError(
                f"Hash algorithm {self._hash_alg} forbidden by .secure()")
        self._check_final()

        _io_typing = IoTyping[BaseDf](
            _remap_suffixes=dict(self._remapped_suffixes),
            _text_encoding=self._encoding,
            _read_kwargs=dict(self._read_kwargs),
            _write_kwargs=dict(self._write_kwargs),
            _hash_alg=self._hash_alg,
            _save_hash_file=self._hash_file,
            _save_hash_dir=self._hash_dir,
            _secure=self._secure,
            _recommended=self._recommended,
            _attrs_suffix=_DEFAULT_ATTRS_SUFFIX
            if self._attr_suffix is None else self._attr_suffix,
            _use_attrs=self._attr_suffix is not None,
            _attrs_json_kwargs=self._attr_json_kwargs,
            _custom_readers={k: v[0]
                             for k, v in self._custom_formats.items()},
            _custom_writers={k: v[1]
                             for k, v in self._custom_formats.items()},
        )

        _typing = DfTyping(
            _io_typing=_io_typing,
            _auto_dtypes=dict(self._dtypes),
            _post_processing=self._post_processing,
            _verifications=self._verifications,
            _more_index_names_allowed=not self._strict_meta,
            _more_columns_allowed=not self._strict_cols,
            _required_columns=list(self._req_cols),
            _required_index_names=list(self._req_meta),
            _reserved_columns=list(self._res_cols),
            _reserved_index_names=list(self._res_meta),
            _columns_to_drop=set(self._drop),
            _index_series_name=self._index_series_name,
            _column_series_name=self._column_series_name,
            _value_dtype=self._value_dtype,
        )

        class New(self._clazz, *self._classes):
            @classmethod
            def get_typing(cls) -> DfTyping:
                return _typing

        New.__name__ = self._name
        New.__doc__ = self._doc
        for k, v in self._methods.items():
            setattr(New, k, v)
        for k, v in self._classmethods.items():
            setattr(New, k, classmethod(v))
        return New
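``_build`` is the terminal step of the library's builder API. A hedged usage sketch (the entry point ``TypedDfs.typed`` and the chained calls are assumptions based on typeddfs' documented style, not taken from this snippet):

    import pandas as pd
    from typeddfs import TypedDfs  # assumed import path

    MyDf = (
        TypedDfs.typed("MyDf")          # builder whose .build() ends in _build()
        .require("name", dtype=str)     # required column
        .reserve("score", dtype=float)  # optional (reserved) column
        .secure()                       # forbid insecure hash algorithms
        .build()
    )
    df = MyDf.convert(pd.DataFrame({"name": ["a"], "score": [1.0]}))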
Example #9
 def _cols(self, which: Sequence[str], *, short: bool) -> Sequence[str]:
     lst = []
     for c in which:
         t = self.typing.auto_dtypes.get(c)
         if t is not None:
             t = Utils.describe_dtype(t, short=short)
         if t is None:
             lst.append(c)
         else:
             lst.append(f"{c} ({t})")
     return lst
Example #10
 def get_short_typing_text(self) -> str:
     """
     Returns a short text description of the required format for a matrix.
     """
     t = self.typing
     if t.value_dtype is None:
         s = "Matrix. "
     else:
         s = Utils.describe_dtype(t.value_dtype).capitalize()
         s += f" ({t.value_dtype.__name__}) matrix. "
     s += "List row names in the index or a special column 'row'."
     return s
Example #11
 def test_set(self):
     x: FrozeSet = Utils.freeze({1, 2, 3})
     assert isinstance(x, FrozeSet)
     assert x.to_set() == {1, 2, 3}
     assert str(x) == str(x.to_set())
     assert repr(x) == repr(x.to_set())
     assert x.to_frozenset() == frozenset({1, 2, 3})
     y: FrozeSet = Utils.freeze({1, 2, 1})
     assert x == x and y == y
     assert not x < x and not y < y
     assert x > y
     assert hash(x) == hash(x)
     assert hash(x) != hash(y)
     assert not x.isdisjoint(y)
     assert x.get(1) == 1
     assert x.get(5) is None
     assert x.get(5, 100) == 100
     assert x.req(1) == 1
     assert x.req(5, 100) == 100
     with pytest.raises(KeyError):
         x.req(5)
Example #12
 def to_rst(self,
            path_or_none: Optional[PathLike] = None,
            style: str = "simple",
            mode: str = "w") -> Optional[str]:
     """
      Writes a reStructuredText table.

      Args:
         path_or_none: Either a file path or ``None`` to return the string
         style: The type of table; currently only "simple" is supported
         mode: Write mode
     """
     txt = self._tabulate(fmt="rst") + "\n"
     return Utils.write(path_or_none, txt, mode=mode)
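Usage sketch (hypothetical frame; per ``Utils.write``'s apparent contract, passing no path presumably returns the text instead of writing):

    rst = df.to_rst()          # returns the reStructuredText table as a string
    df.to_rst("table.rst")     # writes the file instead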
Example #13
 def _read_properties_like(
     cls,
     unescape_keys,
     unescape_values,
     comment_chars: Set[str],
     strip_quotes: bool,
     path_or_buff,
     **kwargs,
 ) -> __qualname__:
     r"""
     Reads a .properties-like file.
     """
     cls._assert_can_write_properties_class()
     if len(cls.get_typing().required_names) == 2:
         key_col, val_col = cls.get_typing().required_names
     else:
         key_col, val_col = "key", "value"
     txt = Utils.read(path_or_buff, **kwargs)
     keys = []
     values = []
     section = ""
     for i, line in enumerate(txt.splitlines()):
         try:
             line = line.strip()
             if any((line.startswith(c)
                     for c in comment_chars)) or len(line.strip()) == 0:
                 continue
             if line.startswith("["):
                 # treat [ ] (with spaces) as the global key
                 section = line.lstrip("[").rstrip("]").strip()
                 continue
              key, value = line.split("=", 1)  # split on the first "=" only, so values may contain "="
             key, value = key.strip(), value.strip()
             if unescape_keys is not None:
                 key = unescape_keys(key)
             if value.endswith("\\"):
                 raise ValueError(
                     "Ends with \\; continued lines are not yet supported")
             if unescape_values is not None:
                 value = unescape_values(value)
             if strip_quotes:
                 value = value.strip('"')
             if section != "":
                 key = section + "." + key
             keys.append(key)
             values.append(value)
         except ValueError:
             raise ValueError(f"Malformed line {i}: '{line}'")
     df = pd.DataFrame({key_col: keys, val_col: values})
     return cls.convert(df)
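For reference, a minimal input this parser accepts (a sketch; ``[section]`` headers become dotted key prefixes, per the code above):

    text = (
        "# a comment\n"
        "top = 1\n"
        "[server]\n"
        "host = example.com\n"
        "port = 8080\n"
    )
    # parsed keys: "top", "server.host", "server.port"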
Example #14
    def sort_natural_index(
        self, *, alg: Union[None, int, Set[str]] = None, reverse: bool = False
    ) -> __qualname__:
        """
        Calls natsorted on this index. Works for multi-index too.

        Args:
            alg: Input as the ``alg`` argument to ``natsorted``
                 If ``None``, the "best" algorithm is chosen from the dtype of the index
                 via :meth:`typeddfs.utils.Utils.guess_natsort_alg`.
                 Otherwise, :meth:`typeddfs.utils.Utils.exact_natsort_alg`
                 is called with ``Utils.exact_natsort_alg(alg)``.
            reverse: Reverse the sort order (e.g. 'z' before 'a')
        """
        df = self.copy()
        if alg is None:
            # TODO: Does this work for multi-index?
            _, alg = Utils.guess_natsort_alg(self.index.dtype)
        else:
            _, alg = Utils.exact_natsort_alg(alg)
        zzz = natsorted([s for s in df.index], alg=alg, reverse=reverse)  # honor the documented reverse flag
        df["__sort"] = df.index.map(lambda s: zzz.index(s))
        df.__class__ = self.__class__
        df = df.sort_values("__sort").drop_cols(["__sort"])
        return self.__class__._change(df)
Example #15
 def _get_write_kwargs(cls, fmt: Optional[FileFormat],
                       path: Path) -> Mapping[str, Any]:
     t = cls.get_typing().io
     real_suffix = CompressionFormat.strip_suffix(path).suffix
      kwargs = dict(t.write_kwargs.get(fmt, {}))  # copy so the stored defaults are not mutated in place
     kwargs.update(t.write_suffix_kwargs.get(real_suffix, {}))
     if fmt is FileFormat.json:
         # not perfect, but much better than the alternative of failing
         # I don't see a better solution anyway
         kwargs["force_ascii"] = False
      elif fmt is not None and fmt.supports_encoding:
          # and is NOT JSON -- it doesn't use "encoding="
         encoding = kwargs.get("encoding", t.text_encoding)
         kwargs["encoding"] = Utils.get_encoding(encoding)
     return kwargs
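Note the precedence here: per-suffix kwargs override per-format kwargs because ``dict.update`` overwrites existing keys. A tiny plain-Python illustration:

    kwargs = {"sep": ",", "encoding": "utf8"}   # per-format defaults
    kwargs.update({"sep": "\t"})                # per-suffix values win
    assert kwargs == {"sep": "\t", "encoding": "utf8"}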
Example #16
 def test_dict(self):
     x: FrozeDict = Utils.freeze({1: "cat", 2: "dog"})
     assert isinstance(x, FrozeDict)
     assert str(x) == str(x.to_dict())
     assert repr(x) == repr(x.to_dict())
     y: FrozeDict = Utils.freeze({1: "cat", 2: "zebra"})
     z: FrozeDict = Utils.freeze({2: "cat", 3: "aardvark"})
     assert x == x and y == y and z == z
     assert x != y and x != z and y != z
     assert x < z
     assert x < y
     assert y < z
     assert not x < x
     assert not y < y
     assert not z < z
     assert hash(x) == hash(x) and hash(y) == hash(y) and hash(z) == hash(z)
     assert hash(x) != hash(y)
     assert x.get(1) == "cat"
     assert x.get(5) is None
     assert x.get(5, "elephant") == "elephant"
     assert x.req(1) == "cat"
     assert x.req(5, "elephant") == "elephant"
     with pytest.raises(KeyError):
         x.req(5)
Example #17
 def _to_properties_like(
     self,
     escape_keys,
     escape_values,
     sep: str,
     comment_char: str,
     path_or_buff=None,
     mode: str = "w",
     comment: Union[None, str, Sequence[str]] = None,
     **kwargs,
 ) -> Optional[str]:
     r"""
     Writes a .properties-like file.
     """
     comment = [] if comment is None else (
         [comment] if isinstance(comment, str) else comment)
     self.__class__._assert_can_write_properties_class()
     self._assert_can_write_properties_instance()
     df = self.vanilla_reset()
     if len(self.__class__.get_typing().required_names) == 2:
         key_col, val_col = self.__class__.get_typing().required_names
     else:
         key_col, val_col = "key", "value"
     df.columns = [key_col, val_col]
     df = df.sort_values(key_col)  # essential
      lines = [
          # prepend the comment character; stripping it from c first avoids doubling it
          comment_char + " " + c.lstrip(comment_char).lstrip()
          for c in comment
      ]
     section = ""
     for k, v in zip(df[key_col], df[val_col]):
         if "." in k:
              s, k = str(k).split(".", 1)  # the section is the part before the first dot
              s, k = s.strip(), k.strip()
              if s != section:
                  lines.append(f"[{s}]")
                  section = s  # track the current section so its header is written only once
         if escape_keys:
             k = escape_keys(k)
         if escape_values:
             v = escape_values(v)
         lines.append(k + " " + sep + " " + v.strip('"'))
     return Utils.write(path_or_buff,
                        os.linesep.join(lines),
                        mode=mode,
                        **kwargs)
Example #18
 def _get_read_kwargs(cls, fmt: Optional[FileFormat],
                      path: Path) -> Mapping[str, Any]:
     t = cls.get_typing().io
     real_suffix = CompressionFormat.strip_suffix(path).suffix
      kwargs = dict(t.read_kwargs.get(fmt, {}))  # copy so the stored defaults are not mutated in place
     kwargs.update(t.read_suffix_kwargs.get(real_suffix, {}))
     if fmt in [
             FileFormat.csv,
             FileFormat.tsv,
             FileFormat.properties,
             FileFormat.lines,
             FileFormat.flexwf,
             FileFormat.fwf,
             FileFormat.json,
     ]:
         encoding = kwargs.get("encoding", t.text_encoding)
         kwargs["encoding"] = Utils.get_encoding(encoding)
     return kwargs
Example #19
 def get_short_typing_text(self) -> str:
     """
     Returns a condensed text description of the required and optional columns.
     """
     t = self.typing
     req = self.get_required_cols(short=True)
     res = self.get_reserved_cols(short=True)
     s = ""
     if len(req) > 0:
         s += f"Requires columns {Utils.join_to_str(*req, last='and')}."
     if len(res) > 0:
          s += ((" " if len(s) > 0 else "") + "Columns " +
                Utils.join_to_str(*res, last="and") + " are optional.")
     s += " "
      if t.is_strict:
          s += "No extra columns are allowed."
      else:
          s += "More columns are ok."
     return s
Example #20
    def get_long_text(
        self,
        *,
        recommended_only: bool = False,
        nl: str = "\n",
        bullet: str = "- ",
        indent: str = "  ",
    ) -> str:
        r"""
        Returns a multi-line text listing of allowed file formats.

        Args:
            recommended_only: Skip non-recommended file formats
            nl: Newline characters; use "\n", "\\n", or " "
            bullet: Prepended to each item
            indent: Spaces for nested indent

        Returns:
            Something like::

                [[ Supported formats ]]:

                .csv[.bz2/.gz/.xz/.zip]: comma-delimited

                .parquet/.snappy: Parquet

                .h5/.hdf/.hdf5: HDF5 (key 'df') [discouraged]

                .pickle/.pkl: Python Pickle [discouraged]
        """
        bullet = nl + indent + bullet
        fmts = [
            f for f in self if not recommended_only or f.fmt.is_recommended
        ]
        formats = [
            f.get_text() + ("" if f.fmt.is_recommended else " [avoid]")
            for f in fmts
        ]
        formats = Utils.natsort(formats, str)
        txt = bullet + bullet.join(formats)
        return f"[[ Supported formats ]]: {txt}"
Example #21
    def read_file(
        cls,
        path: Union[Path, str],
        *,
        file_hash: Optional[bool] = None,
        dir_hash: Optional[bool] = None,
        hex_hash: Optional[str] = None,
        attrs: Optional[bool] = None,
    ) -> __qualname__:
        """
        Reads from a file (or possibly URL), guessing the format from the filename extension.
        Delegates to the ``read_*`` functions of this class.

        You can always write and then read back to get the same dataframe.

        .. code-block::

            # df is any DataFrame from typeddfs
            # path can use any suffix
            df.write_file(path)
            df.read_file(path)

        Text files always allow compression with .gz, .zip, .bz2, or .xz.

        Supports:
            - .csv, .tsv, or .tab
            - .json
            - .xml
            - .feather
            - .parquet or .snappy
            - .h5 or .hdf
            - .xlsx, .xls, .odf, etc.
            - .toml
            - .properties
            - .ini
            - .fwf (fixed-width)
            - .flexwf (fixed-but-unspecified-width with an optional delimiter)
            - .txt, .lines, or .list

        See Also:
            :meth:`read_url`
            :meth:`write_file`


        Args:
            path: Only path-like strings or pathlib objects are supported, not buffers
                  (because we need a filename).
            file_hash: Check against a hash file specific to this file (e.g. <path>.sha1)
            dir_hash: Check against a per-directory hash file
            hex_hash: Check against this hex-encoded hash
            attrs: Set dataset attributes/metadata (``pd.DataFrame.attrs``) from a JSON file.
                   If True, uses :attr:`typeddfs.df_typing.DfTyping.attrs_suffix`.
                   If a str or Path, uses that file.
                   If None or False, does not set.

        Returns:
            An instance of this class
        """
        if any((str(path).startswith(x + "://")
                for x in ["http", "https", "ftp"])):
            # just save some pain -- better than a weird error in .resolve()
            raise ValueError(
                f"Cannot read from URL {path}; use read_url instead")
        path = Path(path).resolve()
        t: DfTyping = cls.get_typing()
        if attrs is None:
            attrs = t.io.use_attrs
        cs = Checksums(alg=t.io.hash_algorithm)
        cs.verify_any(path,
                      file_hash=file_hash,
                      dir_hash=dir_hash,
                      computed=hex_hash)
        df = cls._call_read(cls, path)
        if attrs:
            attrs_path = path.parent / (path.name + t.io.attrs_suffix)
            json_data = Utils.json_decoder().from_str(
                attrs_path.read_text(encoding="utf-8"))
            df.attrs.update(json_data)
        return cls._convert_typed(df)
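Usage sketch (hypothetical paths and class; the digest string is a placeholder):

    df = MyDf.read_file("data.csv.gz")                   # format and compression from the suffixes
    df = MyDf.read_file("data.feather", file_hash=True)  # also verify the per-file hash
    df = MyDf.read_file("data.tsv", hex_hash="ab12...")  # verify against an explicit hex digest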
Example #22
 def unhashable_list(self):
     x: FrozeList = Utils.freeze([[1]])
     y: FrozeList = Utils.freeze([[1]])
     assert hash(x) == 1
     assert {x} != {y}
Example #23
 def test_encoding(self):
     assert Utils.get_encoding("platform") == sys.getdefaultencoding()
     assert "bom" not in Utils.get_encoding("utf8(bom)")
     assert "bom" not in Utils.get_encoding("utf16(bom)")
     assert Utils.get_encoding("UTF-8") == "utf8"
     assert Utils.get_encoding("utf-16") == "utf16"
Example #24
 def test_strip_control_chars(self):
     assert Utils.strip_control_chars("ab\ncd") == "abcd"
     assert Utils.strip_control_chars("ab\0\0cℶd") == "abcℶd"
     assert Utils.strip_control_chars("ℶℶ\u202Cℶℶ") == "ℶℶℶℶ"
     assert Utils.strip_control_chars("\u202C") == ""
Example #25
 def test_basic(self):
     assert "sha1" in Utils.insecure_hash_functions()
     assert "__xml_index_" in Utils.banned_names()
Example #26
 def test_table_formats(self):
     formats = list(Utils.table_formats())
     assert len(formats) > 10
     assert "simple" in formats
     x = Utils.table_format("simple")
     assert isinstance(x, TableFormat)
Example #27
    def write_file(
        self,
        path: Union[Path, str],
        *,
        overwrite: bool = True,
        mkdirs: bool = False,
        file_hash: Optional[bool] = None,
        dir_hash: Optional[bool] = None,
        attrs: Optional[bool] = None,
    ) -> Optional[str]:
        """
        Writes to a file, guessing the format from the filename extension.
        Delegates to the ``to_*`` functions of this class (e.g. ``to_csv``).
        Only includes file formats that can be read back in with corresponding ``to`` methods.

        Supports, where text formats permit optional .gz, .zip, .bz2, or .xz:
            - .csv, .tsv, or .tab
            - .json
            - .feather
            - .fwf (fixed-width)
            - .flexwf (columns aligned but using a delimiter)
            - .parquet or .snappy
            - .h5, .hdf, or .hdf5
            - .xlsx, .xls, and other variants for Excel
            - .odt and .ods (OpenOffice)
            - .xml
            - .toml
            - .ini
            - .properties
            - .pkl and .pickle
            - .txt, .lines, or .list; see :meth:`to_lines` and :meth:`read_lines`

        See Also:
            :meth:`read_file`

        Args:
            path: Only path-like strings or pathlib objects are supported, not buffers
                  (because we need a filename).
            overwrite: If False, complain if the file already exists
            mkdirs: Make the directory and parents if they do not exist
            file_hash: Write a hash for this file.
                       The filename will be path+"."+algorithm.
                       If None, chooses according to ``self.get_typing().io.hash_file``.
            dir_hash: Append a hash for this file into a list.
                      The filename will be the directory name suffixed by the algorithm;
                      (i.e. path.parent/(path.parent.name+"."+algorithm) ).
                      If None, chooses according to ``self.get_typing().io.hash_dir``.
            attrs: Write dataset attributes/metadata (``pd.DataFrame.attrs``) to a JSON file.
                   If True, uses :attr:`typeddfs.df_typing.DfTyping.attrs_suffix`.
                   If None, chooses according to ``self.get_typing().io.use_attrs``.

        Returns:
            Whatever the corresponding method on ``pd.to_*`` returns.
            This is usually either str or None

        Raises:
            InvalidDfError: If the DataFrame is not valid for this type
            ValueError: If the type of a column or index name is non-str
        """
        if any((str(path).startswith(x + "://")
                for x in ["http", "https", "ftp"])):
            # just save some pain -- better than a weird error in .resolve()
            raise ValueError(f"Cannot write to URL {path}")
        path = Path(path).resolve()
        t = self.__class__.get_typing()
        file_hash = file_hash is True or (file_hash is None and t.io.file_hash)
        dir_hash = dir_hash is True or (dir_hash is None and t.io.dir_hash)
        attrs = attrs is True or (attrs is None and t.io.use_attrs)
        attrs_path = path.parent / (path.name + t.io.attrs_suffix)
        attrs_data = Utils.json_encoder().as_str(self.attrs)
        cs = Checksums(alg=t.io.hash_algorithm)
        file_hash_path = cs.get_filesum_of_file(path)
        dir_hash_path = cs.get_dirsum_of_file(path)
        # check for overwrite errors now to preserve atomicity
        if not overwrite:
            if path.exists():
                raise FileExistsError(f"File {path} already exists")
            if file_hash and file_hash_path.exists():
                raise HashFileExistsError(f"{file_hash_path} already exists")
            if dir_hash_path.exists():
                dir_sums = Checksums(
                    alg=t.io.hash_algorithm).load_dirsum_exact(dir_hash_path)
                if path in dir_sums:
                    raise HashEntryExistsError(
                        f"Path {path} listed in {dir_hash_path}")
            if attrs and attrs_path.exists():
                raise FileExistsError(f"{attrs_path} already exists")
        self._check(self)
        types = set(self.column_names()).union(self.index_names())
        if any((not isinstance(c, str) for c in types)):
            raise NonStrColumnError(
                f"Columns must be of str type to serialize, not {types}")
        # now we're ready to write
        if mkdirs:
            path.parent.mkdir(exist_ok=True, parents=True)
        # to get a FileNotFoundError instead of a WritePermissionsError:
        if not mkdirs and not path.parent.exists():
            raise FileNotFoundError(f"Directory {path.parent} not found")
        # check for lack of write-ability to any of the files
        # we had to do this after creating the dirs unfortunately
        _all_files = [(attrs, attrs_path), (file_hash, file_hash_path),
                      (dir_hash, dir_hash_path)]
        all_files = [f for a, f in _all_files if a]
        all_dirs = [f.parent for (a, f) in _all_files]
        # we need to check both the dirs and the files
        Utils.verify_can_write_dirs(*all_dirs, missing_ok=False)
        Utils.verify_can_write_files(*all_files, missing_ok=True)
        # we verified as much as we can -- finally we can write!!
        # this writes the main file:
        z = self._call_write(path)
        # write the hashes
        # this shouldn't fail
        cs = Checksums(alg=t.io.hash_algorithm)
        cs.write_any(
            path,
            to_file=file_hash,
            to_dir=dir_hash,
            overwrite=overwrite,
        )
        # write dataset attributes
        # this also shouldn't fail
        if attrs:
            attrs_path.write_text(attrs_data, encoding="utf8")
        return z
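A matching write-side sketch (hypothetical paths; ``df`` is an instance of a typeddfs class):

    df.write_file("out.csv.gz", mkdirs=True)        # format and compression inferred
    df.write_file("out.parquet", file_hash=True)    # also writes a per-file hash
    df.write_file("out.tsv", overwrite=False)       # raises FileExistsError if present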
Example #28
 def all_suffixes(self) -> Sequence[str]:
     """
     Returns all suffixes, naturally sorted.
     """
     return Utils.natsort(self.fmt.suffixes, str)
Example #29
  def is_text_encoding_utf(self) -> bool:
      # Utils.get_encoding normalizes names without hyphens (e.g. "utf8"; see test_encoding in Example #23)
      return Utils.get_encoding(self._text_encoding) in ["utf8", "utf16", "utf32"]
Example #30
    def to_fwf(
        self,
        path_or_buff=None,
        mode: str = "w",
        colspecs: Optional[Sequence[Tuple[int, int]]] = None,
        widths: Optional[Sequence[int]] = None,
        na_rep: Optional[str] = None,
        float_format: Optional[str] = None,
        date_format: Optional[str] = None,
        decimal: str = ".",
        **kwargs,
    ) -> Optional[str]:
        """
        Writes a fixed-width text format.
        See ``read_fwf`` and ``to_flexwf`` for more info.

        .. warning::

            This method is a preview. Not all options are complete, and
            behavior is subject to change in a future (major) version.
            Notably, Pandas may eventually introduce a method with the same name.

        Args:
            path_or_buff: Path or buffer
            mode: write or append (w/a)
            colspecs: A list of tuples giving the extents of the fixed-width fields of each line
                      as half-open intervals (i.e., [from, to[ )
            widths: A list of field widths which can be used instead of ``colspecs``
                   if the intervals are contiguous
            na_rep: Missing data representation
            float_format: Format string for floating point numbers
            date_format: Format string for datetime objects
            decimal: Character recognized as decimal separator. E.g. use ‘,’ for European data.
            kwargs: Passed to :meth:`typeddfs.utils.Utils.write`

        Returns:
            The string data if ``path_or_buff`` is a buffer; None if it is a file
        """
        if colspecs is not None and widths is not None:
            raise ValueError("Both widths and colspecs passed")
        if widths is not None:
            colspecs = []
            at = 0
            for w in widths:
                colspecs.append((at, at + w))
                at += w
        # The colspec-aware branch below is unfinished, so the plain tabulate path
        # is always taken for now (hence "if True" rather than "if colspecs is None").
        if True:
            # TODO: use format, etc.
            content = self._tabulate(Utils.plain_table_format(sep=" "),
                                     disable_numparse=True)
        else:
            df = self.vanilla_reset()
            if len(df.columns) != len(colspecs):
                raise ValueError(
                    f"{colspecs} column intervals for {len(df.columns)} columns"
                )
            for col, (start, end) in zip(df.columns, colspecs):
                width = end - start
                mx = df[col].map(str).map(len).max()
                if mx > width:
                    raise ValueError(
                        f"Column {col} has max length {mx} > {end-start}")
            _number_format = {
                "na_rep": na_rep,
                "float_format": float_format,
                "date_format": date_format,
                "quoting": csv.QUOTE_NONE,
                "decimal": decimal,
            }
            res = df._mgr.to_native_types(**_number_format)
            data: Sequence[Sequence[Any]] = [
                res.iget_values(i) for i in range(len(res.items))
            ]
            content = None  # TODO
        if path_or_buff is None:
            return content
        _encoding = dict(
            encoding=kwargs.get("encoding")) if "encoding" in kwargs else {}
        _compression = (dict(compression=kwargs.get("compression"))
                        if "compression" in kwargs else {})
        Utils.write(path_or_buff,
                    content,
                    mode=mode,
                    **_encoding,
                    **_compression)
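Usage sketch (hypothetical frame). Because the colspec-aware branch above is unfinished, output always comes from the plain tabulate path for now:

    text = df.to_fwf()                       # returns the fixed-width text
    df.to_fwf("table.txt", widths=[8, 12])   # widths are accepted but currently unused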