Example #1
    def save(self):
        """
        Create the writer & save
        """
        # GH21227 internal compression is not used when file-like passed.
        if self.compression and hasattr(self.path_or_buf, 'write'):
            msg = ("compression has no effect when passing file-like "
                   "object as input.")
            warnings.warn(msg, RuntimeWarning, stacklevel=2)

        # when zip compression is called.
        is_zip = isinstance(self.path_or_buf, ZipFile) or (
            not hasattr(self.path_or_buf, 'write')
            and self.compression == 'zip')

        if is_zip:
            # zipfile doesn't support writing strings straight into an
            # archive, so collect the csv output in a string buffer and dump
            # it into the zip file handle afterwards. GH21241, GH21118
            f = StringIO()
            close = False
        elif hasattr(self.path_or_buf, 'write'):
            f = self.path_or_buf
            close = False
        else:
            f, handles = _get_handle(self.path_or_buf, self.mode,
                                     encoding=self.encoding,
                                     compression=self.compression)
            close = True

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if self.encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = self.encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            if is_zip:
                # GH17778 handles zip compression separately.
                buf = f.getvalue()
                if hasattr(self.path_or_buf, 'write'):
                    self.path_or_buf.write(buf)
                else:
                    f, handles = _get_handle(self.path_or_buf, self.mode,
                                             encoding=self.encoding,
                                             compression=self.compression)
                    f.write(buf)
                    close = True
            if close:
                f.close()
                for _fh in handles:
                    _fh.close()
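
In the branch above, _get_handle returns a pair: the handle to write to and a list of any extra wrapper handles that also need closing. A minimal sketch of that contract, assuming the private pandas.io.common._get_handle of the pandas versions these snippets come from and an illustrative file name:

# Sketch only: _get_handle is a private helper, and 'frame.csv.gz' is a
# made-up path used purely for illustration.
from pandas.io.common import _get_handle

f, handles = _get_handle('frame.csv.gz', 'w',
                         encoding='utf-8', compression='gzip')
try:
    f.write('a,b\n1,2\n')
finally:
    f.close()
    for _fh in handles:
        # close any wrapper handles _get_handle opened internally
        _fh.close()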
Example #2
    def test_to_csv_compression(self, s, encoding, compression):

        with ensure_clean() as filename:

            s.to_csv(filename, compression=compression, encoding=encoding,
                     header=True)
            # test the round trip - to_csv -> read_csv
            result = pd.read_csv(filename, compression=compression,
                                 encoding=encoding, index_col=0, squeeze=True)
            assert_series_equal(s, result)

            # test the round trip using file handle - to_csv -> read_csv
            f, _handles = _get_handle(filename, 'w', compression=compression,
                                      encoding=encoding)
            with f:
                s.to_csv(f, encoding=encoding, header=True)
            result = pd.read_csv(filename, compression=compression,
                                 encoding=encoding, index_col=0, squeeze=True)
            assert_series_equal(s, result)

            # explicitly ensure file was compressed
            with tm.decompress_file(filename, compression) as fh:
                text = fh.read().decode(encoding or 'utf8')
                assert s.name in text

            with tm.decompress_file(filename, compression) as fh:
                assert_series_equal(s, pd.read_csv(fh,
                                                   index_col=0,
                                                   squeeze=True,
                                                   encoding=encoding))
Example #3
    def test_to_csv_compression(self, df, encoding, compression):

        with ensure_clean() as filename:

            df.to_csv(filename, compression=compression, encoding=encoding)
            # test the round trip - to_csv -> read_csv
            result = read_csv(filename, compression=compression,
                              index_col=0, encoding=encoding)
            assert_frame_equal(df, result)

            # test the round trip using file handle - to_csv -> read_csv
            f, _handles = _get_handle(filename, 'w', compression=compression,
                                      encoding=encoding)
            with f:
                df.to_csv(f, encoding=encoding)
            result = pd.read_csv(filename, compression=compression,
                                 encoding=encoding, index_col=0, squeeze=True)
            assert_frame_equal(df, result)

            # explicitly make sure file is compressed
            with tm.decompress_file(filename, compression) as fh:
                text = fh.read().decode(encoding or 'utf8')
                for col in df.columns:
                    assert col in text

            with tm.decompress_file(filename, compression) as fh:
                assert_frame_equal(df, read_csv(fh,
                                                index_col=0,
                                                encoding=encoding))
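
For comparison, the same round trip through the public API only; the file name is made up and no temporary-file helper is used here:

# Hedged sketch: to_csv/read_csv round trip with gzip compression.
import pandas as pd

df = pd.DataFrame({'x': [1.0, 2.0, 3.0]})
df.to_csv('roundtrip.csv.gz', compression='gzip', encoding='utf-8')
result = pd.read_csv('roundtrip.csv.gz', compression='gzip',
                     encoding='utf-8', index_col=0)
assert result.equals(df)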
Example #4
    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.
        """

        data = filepath_or_buffer

        exists = False
        if isinstance(data, compat.string_types):
            try:
                exists = os.path.exists(filepath_or_buffer)
            # gh-5874: if the filepath is too long will raise here
            except (TypeError, ValueError):
                pass

        if exists or self.compression is not None:
            data, _ = _get_handle(filepath_or_buffer, 'r',
                                  encoding=self.encoding,
                                  compression=self.compression)
            self.should_close = True
            self.open_stream = data

        return data
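
The three input types listed in the docstring can also be exercised through the public read_json; a hedged sketch (the path in the commented line is made up):

# Sketch of the three read_json input kinds described above.
import pandas as pd
from io import StringIO

payload = '{"a": {"0": 1, "1": 2}}'

df_from_string = pd.read_json(payload)            # (3) JSON string
df_from_buffer = pd.read_json(StringIO(payload))  # (2) file-like object
# df_from_path = pd.read_json('data.json')        # (1) filepath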
Example #5
    def save(self):
        # create the writer & save
        if self.encoding is None:
            if compat.PY2:
                encoding = 'ascii'
            else:
                encoding = 'utf-8'
        else:
            encoding = self.encoding

        if hasattr(self.path_or_buf, 'write'):
            f = self.path_or_buf
            close = False
        else:
            f, handles = _get_handle(self.path_or_buf, self.mode,
                                     encoding=encoding,
                                     compression=None)
            close = True if self.compression is None else False

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            # GH 17778 handles compression for byte strings.
            if not close and self.compression:
                f.close()
                with open(self.path_or_buf, 'r') as f:
                    data = f.read()
                f, handles = _get_handle(self.path_or_buf, self.mode,
                                         encoding=encoding,
                                         compression=self.compression)
                f.write(data)
                close = True
            if close:
                f.close()
Example #6
def test_compression_size_fh(obj, method, compression_only):

    with tm.ensure_clean() as filename:
        f, _handles = _get_handle(filename, 'w', compression=compression_only)
        with f:
            getattr(obj, method)(f)
            assert not f.closed
        assert f.closed
        compressed = os.path.getsize(filename)
    with tm.ensure_clean() as filename:
        f, _handles = _get_handle(filename, 'w', compression=None)
        with f:
            getattr(obj, method)(f)
            assert not f.closed
        assert f.closed
        uncompressed = os.path.getsize(filename)
        assert uncompressed > compressed
Example #7
def test_compression_size_fh(obj, method, compression_only):
    with tm.ensure_clean() as path:
        f, handles = icom._get_handle(path, 'w', compression=compression_only)
        with catch_to_csv_depr():
            with f:
                getattr(obj, method)(f)
                assert not f.closed
            assert f.closed
            compressed_size = os.path.getsize(path)
    with tm.ensure_clean() as path:
        f, handles = icom._get_handle(path, 'w', compression=None)
        with catch_to_csv_depr():
            with f:
                getattr(obj, method)(f)
                assert not f.closed
        assert f.closed
        uncompressed_size = os.path.getsize(path)
        assert uncompressed_size > compressed_size
Example #8
def test_compression_warning(compression_only):
    df = DataFrame(100 * [[0.123456, 0.234567, 0.567567],
                          [12.32112, 123123.2, 321321.2]],
                   columns=['X', 'Y', 'Z'])
    with tm.ensure_clean() as filename:
        f, _handles = _get_handle(filename, 'w', compression=compression_only)
        with tm.assert_produces_warning(RuntimeWarning,
                                        check_stacklevel=False):
            with f:
                df.to_csv(f, compression=compression_only)
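
Outside the test harness, the behaviour being asserted looks roughly like this; the file name is illustrative:

# Hedged sketch of GH21227: compression is ignored when to_csv receives an
# already-open handle, and a RuntimeWarning is emitted instead.
import warnings
import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3]})
with open('plain.csv', 'w') as fh:
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        df.to_csv(fh, compression='gzip')
assert any(issubclass(w.category, RuntimeWarning) for w in caught)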
Example #9
def read_wrapper(func):
    # wrapper file handle open/close operation
    f, fh = _get_handle(path, 'rb',
                        compression=compression,
                        is_text=False)
    try:
        return func(f)
    finally:
        for _f in fh:
            _f.close()
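
A reading counterpart to the wrapper above, under the same assumption about the private helper; the path and compression are illustrative:

# Sketch only: open a compressed pickle for reading through _get_handle;
# is_text=False keeps the handle binary, as in read_wrapper above.
import pickle
from pandas.io.common import _get_handle

f, fh = _get_handle('obj.pkl.gz', 'rb', compression='gzip', is_text=False)
try:
    obj = pickle.load(f)
finally:
    f.close()
    for _f in fh:
        _f.close()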
Example #10
def test_compression_warning(compression_only):
    # Assert that passing a file object to to_csv while explicitly specifying a
    # compression protocol triggers a RuntimeWarning, as per GH21227.
    df = pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567],
                             [12.32112, 123123.2, 321321.2]],
                      columns=['X', 'Y', 'Z'])
    with tm.ensure_clean() as path:
        f, handles = icom._get_handle(path, 'w', compression=compression_only)
        with tm.assert_produces_warning(RuntimeWarning,
                                        check_stacklevel=False):
            with f:
                df.to_csv(f, compression=compression_only)
Example #11
    def save(self):
        # create the writer & save
        if self.encoding is None:
            if compat.PY2:
                encoding = 'ascii'
            else:
                encoding = 'utf-8'
        else:
            encoding = self.encoding

        # PR 21300: collect the csv output in a string buffer, then dump it
        # into file-like output, with compression as an option. GH 21241, 21118
        f = StringIO()
        if not is_file_like(self.path_or_buf):
            # path_or_buf is path
            path_or_buf = self.path_or_buf
        elif hasattr(self.path_or_buf, 'name'):
            # path_or_buf is file handle
            path_or_buf = self.path_or_buf.name
        else:
            # path_or_buf is file-like IO objects.
            f = self.path_or_buf
            path_or_buf = None

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            # GH 17778 handles zip compression for byte strings separately.
            buf = f.getvalue()
            if path_or_buf:
                f, handles = _get_handle(path_or_buf, self.mode,
                                         encoding=encoding,
                                         compression=self.compression)
                f.write(buf)
                f.close()
                for _fh in handles:
                    _fh.close()
Example #12
def test_compression_warning(compression_only):
    # Assert that passing a file object to to_csv while explicitly specifying a
    # compression protocol triggers a RuntimeWarning, as per GH21227.
    # Note that pytest has an issue that causes assert_produces_warning to fail
    # in Python 2 if the warning has occurred in previous tests
    # (see https://git.io/fNEBm & https://git.io/fNEBC). Hence, should this
    # test fail in just Python 2 builds, it likely indicates that other tests
    # are producing RuntimeWarnings, thereby triggering the pytest bug.
    df = pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567],
                             [12.32112, 123123.2, 321321.2]],
                      columns=['X', 'Y', 'Z'])
    with tm.ensure_clean() as path:
        f, handles = icom._get_handle(path, 'w', compression=compression_only)
        with tm.assert_produces_warning(RuntimeWarning,
                                        check_stacklevel=False):
            with f:
                df.to_csv(f, compression=compression_only)
Example #13
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
            double_precision=10, force_ascii=True, date_unit='ms',
            default_handler=None, lines=False, compression='infer',
            index=True):

    if not index and orient not in ['split', 'table']:
        raise ValueError("'index=False' is only valid when 'orient' is "
                         "'split' or 'table'")

    path_or_buf = _stringify_path(path_or_buf)
    if lines and orient != 'records':
        raise ValueError(
            "'lines' keyword only valid when 'orient' is records")

    if orient == 'table' and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or 'values')
    if orient == 'table' and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj, orient=orient, date_format=date_format,
        double_precision=double_precision, ensure_ascii=force_ascii,
        date_unit=date_unit, default_handler=default_handler,
        index=index).write()

    if lines:
        s = _convert_to_line_delimits(s)

    if isinstance(path_or_buf, compat.string_types):
        fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
        try:
            fh.write(s)
        finally:
            fh.close()
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
Example #14
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to input file path

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0


    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        pkl.dump(obj, f, protocol=protocol)
    finally:
        for _f in fh:
            _f.close()
Example #15
def to_pickle(obj, path, compression='infer'):
    """
    Pickle (serialize) object to input file path

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    """
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    try:
        pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
    finally:
        for _f in fh:
            _f.close()
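
Through the public API the same machinery reduces to a round trip in which compression is inferred from the (made-up) file extension:

# Hedged sketch: compression='infer' picks gzip from the '.gz' suffix.
import pandas as pd

df = pd.DataFrame({'foo': range(5), 'bar': range(5, 10)})
pd.to_pickle(df, 'frame.pkl.gz')
restored = pd.read_pickle('frame.pkl.gz')
assert restored.equals(df)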
Example #16
    def save(self):
        # create the writer & save
        if self.encoding is None:
            if compat.PY2:
                encoding = 'ascii'
            else:
                encoding = 'utf-8'
        else:
            encoding = self.encoding

        if hasattr(self.path_or_buf, 'write'):
            f = self.path_or_buf
            close = False
        else:
            f, handles = _get_handle(self.path_or_buf, self.mode,
                                     encoding=encoding,
                                     compression=self.compression)
            close = True

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            if close:
                f.close()
Example #17
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    path : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, 'wb',
                        compression=compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        f.write(pkl.dumps(obj, protocol=protocol))
    finally:
        for _f in fh:
            _f.close()
Example #18
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, 'rb', compression=compression, is_text=False)

    # 1) try with cPickle
    # 2) try with the compat pickle to handle subclass changes
    # 3) pass encoding only if its not None as py2 doesn't handle the param

    try:
        with warnings.catch_warnings(record=True):
            # We want to silence any warnings about, e.g. moved modules.
            warnings.simplefilter("ignore", Warning)
            return pkl.load(f)
    except Exception:  # noqa: E722
        try:
            return pc.load(f, encoding=None)
        except Exception:  # noqa: E722
            if PY3:
                return pc.load(f, encoding='latin1')
            raise
    finally:
        f.close()
        for _f in fh:
            _f.close()
Example #19
def read_json(path_or_buf=None,
              orient=None,
              typ='frame',
              dtype=True,
              convert_axes=True,
              convert_dates=True,
              keep_default_dates=True,
              numpy=False,
              precise_float=False,
              date_unit=None,
              encoding=None,
              lines=False):
    """
    Convert a JSON string to pandas object

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3, and
        file. For file URLs, a host is expected. For instance, a local file
        could be ``file://localhost/path/to/table.json``

    orient : string,
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``, or

        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.
    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    Returns
    -------
    result : Series or DataFrame, depending on the value of `typ`.

    See Also
    --------
    DataFrame.to_json

    Examples
    --------

    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a Dataframe using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema

    >>> df.to_json(orient='table')
    '{"schema": {"fields": [{"name": "index", "type": "string"},
                            {"name": "col 1", "type": "string"},
                            {"name": "col 2", "type": "string"}],
                    "primaryKey": "index",
                    "pandas_version": "0.20.0"},
        "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
                {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
    """

    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
                                                      encoding=encoding)
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)

        # if the filepath is too long will raise here
        # 5874
        except (TypeError, ValueError):
            exists = False

        if exists:
            fh, handles = _get_handle(filepath_or_buffer,
                                      'r',
                                      encoding=encoding)
            json = fh.read()
            fh.close()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    if lines:
        # If given a json lines file, we break the string into lines, add
        # commas and put it in a json list to make a valid json object.
        lines = list(StringIO(json.strip()))
        json = '[' + ','.join(lines) + ']'

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
                          keep_default_dates, numpy, precise_float,
                          date_unit).parse()

    if typ == 'series' or obj is None:
        if not isinstance(dtype, bool):
            dtype = dict(data=dtype)
        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
                           keep_default_dates, numpy, precise_float,
                           date_unit).parse()

    return obj
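
The lines handling above (joining one JSON object per line into a single array) corresponds to this public-API usage; the records are illustrative:

# Hedged sketch of lines=True: each input line is one JSON record.
import pandas as pd
from io import StringIO

jsonl = '{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n'
df = pd.read_json(StringIO(jsonl), orient='records', lines=True)
# df has two rows with columns 'a' and 'b'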
Example #20
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False):
    """
    Convert a JSON string to pandas object

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3, and
        file. For file URLs, a host is expected. For instance, a local file
        could be ``file://localhost/path/to/table.json``

    orient : string,
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``, or

        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.
    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    Returns
    -------
    result : Series or DataFrame, depending on the value of `typ`.

    See Also
    --------
    DataFrame.to_json

    Examples
    --------

    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a Dataframe using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d
    """

    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
                                                      encoding=encoding)
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)

        # if the filepath is too long will raise here
        # 5874
        except (TypeError, ValueError):
            exists = False

        if exists:
            fh, handles = _get_handle(filepath_or_buffer, 'r',
                                      encoding=encoding)
            json = fh.read()
            fh.close()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    if lines:
        # If given a json lines file, we break the string into lines, add
        # commas and put it in a json list to make a valid json object.
        lines = list(StringIO(json.strip()))
        json = u'[' + u','.join(lines) + u']'

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
                          keep_default_dates, numpy, precise_float,
                          date_unit).parse()

    if typ == 'series' or obj is None:
        if not isinstance(dtype, bool):
            dtype = dict(data=dtype)
        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
                           keep_default_dates, numpy, precise_float,
                           date_unit).parse()

    return obj
Example #21
def to_pickle(obj,
              path,
              compression="infer",
              protocol=pickle.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    path : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, "wb", compression=compression, is_text=False)
    if protocol < 0:
        protocol = pickle.HIGHEST_PROTOCOL
    try:
        f.write(pickle.dumps(obj, protocol=protocol))
    finally:
        f.close()
        for _f in fh:
            _f.close()
Example #22
def read_pickle(path, compression="infer"):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Notes
    -----
    read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, "rb", compression=compression, is_text=False)

    # 1) try standard library Pickle
    # 2) try pickle_compat (older pandas version) to handle subclass changes
    # 3) try pickle_compat with latin1 encoding

    try:
        with warnings.catch_warnings(record=True):
            # We want to silence any warnings about, e.g. moved modules.
            warnings.simplefilter("ignore", Warning)
            return pickle.load(f)
    except Exception:
        try:
            return pc.load(f, encoding=None)
        except Exception:
            return pc.load(f, encoding="latin1")
    finally:
        f.close()
        for _f in fh:
            _f.close()
Example #23
def to_json(
    path_or_buf,
    obj,
    orient: Optional[str] = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
    lines: bool = False,
    compression: Optional[str] = "infer",
    index: bool = True,
    indent: int = 0,
):

    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is " "'split' or 'table'"
        )

    path_or_buf = _stringify_path(path_or_buf)
    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")

    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: Type["Writer"]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if isinstance(path_or_buf, str):
        fh, handles = _get_handle(path_or_buf, "w", compression=compression)
        try:
            fh.write(s)
        finally:
            fh.close()
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
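
On the path branch above, to_json writes through _get_handle, so compressed output only needs a compression argument; a hedged sketch with a made-up file name (the read-back goes through gzip.open because the read_json versions shown here accept a file-like object):

# Hedged sketch: write gzip-compressed JSON via the path branch, read it
# back through an explicitly opened handle.
import gzip
import pandas as pd

df = pd.DataFrame({'col 1': ['a', 'c'], 'col 2': ['b', 'd']})
df.to_json('frame.json.gz', orient='records', compression='gzip')
with gzip.open('frame.json.gz', 'rt') as fh:
    back = pd.read_json(fh, orient='records')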
Example #24
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False):
    """
    Convert a JSON string to pandas object

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3, and
        file. For file URLs, a host is expected. For instance, a local file
        could be ``file://localhost/path/to/table.json``

    orient

        * `Series`

          - default is ``'index'``
          - allowed values are: ``{'split','records','index'}``
          - The Series index must be unique for orient ``'index'``.

        * `DataFrame`

          - default is ``'columns'``
          - allowed values are: {'split','records','index','columns','values'}
          - The DataFrame index must be unique for orients 'index' and
            'columns'.
          - The DataFrame columns must be unique for orients 'index',
            'columns', and 'records'.

        * The format of the JSON string

          - split : dict like
            ``{index -> [index], columns -> [columns], data -> [values]}``
          - records : list like
            ``[{column -> value}, ... , {column -> value}]``
          - index : dict like ``{index -> {column -> value}}``
          - columns : dict like ``{column -> {index -> value}}``
          - values : just the values array

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``, or

        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.
    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    Returns
    -------
    result : Series or DataFrame
    """

    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
                                                      encoding=encoding)
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)

        # if the filepath is too long will raise here
        # 5874
        except (TypeError, ValueError):
            exists = False

        if exists:
            with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh:
                json = fh.read()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    if lines:
        # If given a json lines file, we break the string into lines, add
        # commas and put it in a json list to make a valid json object.
        lines = list(StringIO(json.strip()))
        json = u'[' + u','.join(lines) + u']'

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
                          keep_default_dates, numpy, precise_float,
                          date_unit).parse()

    if typ == 'series' or obj is None:
        if not isinstance(dtype, bool):
            dtype = dict(data=dtype)
        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
                           keep_default_dates, numpy, precise_float,
                           date_unit).parse()

    return obj
Example #25
    def save(self):
        """
        Create the writer & save
        """
        # GH21227 internal compression is not used when file-like passed.
        if self.compression and hasattr(self.path_or_buf, "write"):
            msg = "compression has no effect when passing file-like " "object as input."
            warnings.warn(msg, RuntimeWarning, stacklevel=2)

        # when zip compression is called.
        is_zip = isinstance(self.path_or_buf,
                            ZipFile) or (not hasattr(self.path_or_buf, "write")
                                         and self.compression == "zip")

        if is_zip:
            # zipfile doesn't support writing strings straight into an
            # archive, so collect the csv output in a string buffer and dump
            # it into the zip file handle afterwards. GH21241, GH21118
            f = StringIO()
            close = False
        elif hasattr(self.path_or_buf, "write"):
            f = self.path_or_buf
            close = False
        else:
            f, handles = _get_handle(
                self.path_or_buf,
                self.mode,
                encoding=self.encoding,
                compression=self.compression,
            )
            close = True

        try:
            writer_kwargs = dict(
                lineterminator=self.line_terminator,
                delimiter=self.sep,
                quoting=self.quoting,
                doublequote=self.doublequote,
                escapechar=self.escapechar,
                quotechar=self.quotechar,
            )
            if self.encoding == "ascii":
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs["encoding"] = self.encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            if is_zip:
                # GH17778 handles zip compression separately.
                buf = f.getvalue()
                if hasattr(self.path_or_buf, "write"):
                    self.path_or_buf.write(buf)
                else:
                    f, handles = _get_handle(
                        self.path_or_buf,
                        self.mode,
                        encoding=self.encoding,
                        compression=self.compression,
                    )
                    f.write(buf)
                    close = True
            if close:
                f.close()
                for _fh in handles:
                    _fh.close()