Пример #1
0
    def test_info_max_cols(self):
        df = DataFrame(np.random.randn(10, 5))
        for len_, verbose in [(5, None), (5, False), (10, True)]:
            # For verbose always      ^ setting  ^ summarize ^ full output
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

        for len_, verbose in [(10, None), (5, False), (10, True)]:

            # max_cols no exceeded
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

        for len_, max_cols in [(10, 5), (5, 4)]:
            # setting truncates
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

            # setting wouldn't truncate
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)
Пример #2
0
    def test_info_memory_usage_qualified(self):

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=[1, 2, 3])
        df.info(buf=buf)
        assert '+' not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=list('ABC'))
        df.info(buf=buf)
        assert '+' in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=pd.MultiIndex.from_product(
                           [range(3), range(3)]))
        df.info(buf=buf)
        assert '+' not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=pd.MultiIndex.from_product(
                           [range(3), ['foo', 'bar']]))
        df.info(buf=buf)
        assert '+' in buf.getvalue()
Пример #3
0
    def test_to_csv_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC)

        result = buf.getvalue()
        expected = ('"A","B"\n'
                    '1,"foo"\n'
                    '2,"bar"\n'
                    '3,"baz"\n')

        self.assertEqual(result, expected)

        # quoting windows line terminators, presents with encoding?
        # #3503
        text = 'a,b,c\n1,"test \r\n",3\n'
        df = pd.read_csv(StringIO(text))
        buf = StringIO()
        df.to_csv(buf, encoding='utf-8', index=False)
        self.assertEqual(buf.getvalue(), text)

        # testing if quoting parameter is passed through with multi-indexes
        # related to issue #7791
        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
        df = df.set_index(['a', 'b'])
        expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
        self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected)
Пример #4
0
    class UnicodeWriter(object):

        """
        A CSV writer which will write rows to CSV file "f",
        which is encoded in the given encoding.
        """

        def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
            # Redirect output to a queue
            self.queue = StringIO()
            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
            self.stream = f
            self.encoder = codecs.getincrementalencoder(encoding)()
            self.quoting = kwds.get("quoting", None)

        def writerow(self, row):
            def _check_as_is(x):
                return (self.quoting == csv.QUOTE_NONNUMERIC and
                        is_number(x)) or isinstance(x, str)

            row = [x if _check_as_is(x)
                   else pprint_thing(x).encode("utf-8") for x in row]

            self.writer.writerow([s for s in row])
            # Fetch UTF-8 output from the queue ...
            data = self.queue.getvalue()
            data = data.decode("utf-8")
            # ... and re-encode it into the target encoding
            data = self.encoder.encode(data)
            # write to the target stream
            self.stream.write(data)
            # empty queue
            self.queue.truncate(0)

        def writerows(self, rows):
            def _check_as_is(x):
                return (self.quoting == csv.QUOTE_NONNUMERIC and
                        is_number(x)) or isinstance(x, str)

            for i, row in enumerate(rows):
                rows[i] = [x if _check_as_is(x)
                           else pprint_thing(x).encode("utf-8") for x in row]

            self.writer.writerows([[s for s in row] for row in rows])
            # Fetch UTF-8 output from the queue ...
            data = self.queue.getvalue()
            data = data.decode("utf-8")
            # ... and re-encode it into the target encoding
            data = self.encoder.encode(data)
            # write to the target stream
            self.stream.write(data)
            # empty queue
            self.queue.truncate(0)
Пример #5
0
    def test_to_html(self):
        # big mixed
        biggie = DataFrame({'A': np.random.randn(200),
                            'B': tm.makeStringIndex(200)},
                           index=lrange(200))

        biggie.loc[:20, 'A'] = np.nan
        biggie.loc[:20, 'B'] = np.nan
        s = biggie.to_html()

        buf = StringIO()
        retval = biggie.to_html(buf=buf)
        self.assertIsNone(retval)
        self.assertEqual(buf.getvalue(), s)

        tm.assertIsInstance(s, compat.string_types)

        biggie.to_html(columns=['B', 'A'], col_space=17)
        biggie.to_html(columns=['B', 'A'],
                       formatters={'A': lambda x: '%.1f' % x})

        biggie.to_html(columns=['B', 'A'], float_format=str)
        biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

        frame = DataFrame(index=np.arange(200))
        frame.to_html()
Пример #6
0
    def _coef_table(self):
        buf = StringIO()

        buf.write('%14s %10s %10s %10s %10s %10s %10s\n' %
                  ('Variable', 'Coef', 'Std Err', 't-stat',
                   'p-value', 'CI 2.5%', 'CI 97.5%'))
        buf.write(scom.banner(''))
        coef_template = '\n%14s %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f'

        results = self._results

        beta = results['beta']

        for i, name in enumerate(beta.index):
            if i and not (i % 5):
                buf.write('\n' + scom.banner(''))

            std_err = results['std_err'][name]
            CI1 = beta[name] - 1.96 * std_err
            CI2 = beta[name] + 1.96 * std_err

            t_stat = results['t_stat'][name]
            p_value = results['p_value'][name]

            line = coef_template % (name,
                                    beta[name], std_err, t_stat, p_value, CI1, CI2)

            buf.write(line)

        if self.nw_lags is not None:
            buf.write('\n')
            buf.write('*** The calculations are Newey-West '
                      'adjusted with lags %5d\n' % self.nw_lags)

        return buf.getvalue()
Пример #7
0
    def _coef_table(self):
        buffer = StringIO()
        buffer.write(
            "%13s %13s %13s %13s %13s %13s\n" % ("Variable", "Beta", "Std Err", "t-stat", "CI 2.5%", "CI 97.5%")
        )
        template = "%13s %13.4f %13.4f %13.2f %13.4f %13.4f\n"

        for i, name in enumerate(self._cols):
            if i and not (i % 5):
                buffer.write("\n" + common.banner(""))

            mean_beta = self._results["mean_beta"][i]
            std_beta = self._results["std_beta"][i]
            t_stat = self._results["t_stat"][i]
            ci1 = mean_beta - 1.96 * std_beta
            ci2 = mean_beta + 1.96 * std_beta

            values = "(%s)" % name, mean_beta, std_beta, t_stat, ci1, ci2

            buffer.write(template % values)

        if self._nw_lags_beta is not None:
            buffer.write("\n")
            buffer.write("*** The Std Err, t-stat are Newey-West " "adjusted with Lags %5d\n" % self._nw_lags_beta)

        return buffer.getvalue()
Пример #8
0
    def save(self):
        """
        Create the writer & save
        """
        # GH21227 internal compression is not used when file-like passed.
        if self.compression and hasattr(self.path_or_buf, 'write'):
            msg = ("compression has no effect when passing file-like "
                   "object as input.")
            warnings.warn(msg, RuntimeWarning, stacklevel=2)

        # when zip compression is called.
        is_zip = isinstance(self.path_or_buf, ZipFile) or (
            not hasattr(self.path_or_buf, 'write')
            and self.compression == 'zip')

        if is_zip:
            # zipfile doesn't support writing string to archive. uses string
            # buffer to receive csv writing and dump into zip compression
            # file handle. GH21241, GH21118
            f = StringIO()
            close = False
        elif hasattr(self.path_or_buf, 'write'):
            f = self.path_or_buf
            close = False
        else:
            f, handles = _get_handle(self.path_or_buf, self.mode,
                                     encoding=self.encoding,
                                     compression=self.compression)
            close = True

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if self.encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = self.encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            if is_zip:
                # GH17778 handles zip compression separately.
                buf = f.getvalue()
                if hasattr(self.path_or_buf, 'write'):
                    self.path_or_buf.write(buf)
                else:
                    f, handles = _get_handle(self.path_or_buf, self.mode,
                                             encoding=self.encoding,
                                             compression=self.compression)
                    f.write(buf)
                    close = True
            if close:
                f.close()
                for _fh in handles:
                    _fh.close()
Пример #9
0
    def test_to_html(self):
        # big mixed
        biggie = DataFrame({'A': np.random.randn(200),
                            'B': tm.makeStringIndex(200)},
                           index=lrange(200))

        biggie.loc[:20, 'A'] = np.nan
        biggie.loc[:20, 'B'] = np.nan
        s = biggie.to_html()

        buf = StringIO()
        retval = biggie.to_html(buf=buf)
        assert retval is None
        assert buf.getvalue() == s

        assert isinstance(s, compat.string_types)

        biggie.to_html(columns=['B', 'A'], col_space=17)
        biggie.to_html(columns=['B', 'A'],
                       formatters={'A': lambda x: '{x:.1f}'.format(x=x)})

        biggie.to_html(columns=['B', 'A'], float_format=str)
        biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

        frame = DataFrame(index=np.arange(200))
        frame.to_html()
Пример #10
0
    def _coef_table(self):
        buf = StringIO()

        buf.write(
            "%14s %10s %10s %10s %10s %10s %10s\n"
            % ("Variable", "Coef", "Std Err", "t-stat", "p-value", "CI 2.5%", "CI 97.5%")
        )
        buf.write(scom.banner(""))
        coef_template = "\n%14s %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f"

        results = self._results

        beta = results["beta"]

        for i, name in enumerate(beta.index):
            if i and not (i % 5):
                buf.write("\n" + scom.banner(""))

            std_err = results["std_err"][name]
            CI1 = beta[name] - 1.96 * std_err
            CI2 = beta[name] + 1.96 * std_err

            t_stat = results["t_stat"][name]
            p_value = results["p_value"][name]

            line = coef_template % (name, beta[name], std_err, t_stat, p_value, CI1, CI2)

            buf.write(line)

        if self.nw_lags is not None:
            buf.write("\n")
            buf.write("*** The calculations are Newey-West " "adjusted with lags %5d\n" % self.nw_lags)

        return buf.getvalue()
Пример #11
0
    def _coef_table(self):
        buffer = StringIO()
        buffer.write('%13s %13s %13s %13s %13s %13s\n' %
                     ('Variable', 'Beta', 'Std Err', 't-stat', 'CI 2.5%', 'CI 97.5%'))
        template = '%13s %13.4f %13.4f %13.2f %13.4f %13.4f\n'

        for i, name in enumerate(self._cols):
            if i and not (i % 5):
                buffer.write('\n' + common.banner(''))

            mean_beta = self._results['mean_beta'][i]
            std_beta = self._results['std_beta'][i]
            t_stat = self._results['t_stat'][i]
            ci1 = mean_beta - 1.96 * std_beta
            ci2 = mean_beta + 1.96 * std_beta

            values = '(%s)' % name, mean_beta, std_beta, t_stat, ci1, ci2

            buffer.write(template % values)

        if self._nw_lags_beta is not None:
            buffer.write('\n')
            buffer.write('*** The Std Err, t-stat are Newey-West '
                         'adjusted with Lags %5d\n' % self._nw_lags_beta)

        return buffer.getvalue()
Пример #12
0
    def test_to_csv_numpy_16_bug(self):
        frame = DataFrame({'a': date_range('1/1/2000', periods=10)})

        buf = StringIO()
        frame.to_csv(buf)

        result = buf.getvalue()
        self.assertIn('2000-01-01', result)
Пример #13
0
def to_clipboard(obj, excel=True, sep=None, **kwargs):  # pragma: no cover
    """
    Attempt to write text representation of object to the system clipboard
    The clipboard can be then pasted into Excel for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : boolean, defaults to True
            if True, use the provided separator, writing in a csv
            format for allowing easy pasting into excel.
            if False, write a string representation of the object
            to the clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with gtk or PyQt4 modules)
      - Windows:
      - OS X:
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # testing if an invalid encoding is passed to clipboard
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise ValueError('clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_set
    if excel is None:
        excel = True

    if excel:
        try:
            if sep is None:
                sep = '\t'
            buf = StringIO()
            # clipboard_set (pyperclip) expects unicode
            obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs)
            text = buf.getvalue()
            if PY2:
                text = text.decode('utf-8')
            clipboard_set(text)
            return
        except TypeError:
            warnings.warn('to_clipboard in excel mode requires a single '
                          'character separator.')
    elif sep is not None:
        warnings.warn('to_clipboard with excel=False ignores the sep argument')

    if isinstance(obj, ABCDataFrame):
        # str(df) has various unhelpful defaults, like truncation
        with option_context('display.max_colwidth', 999999):
            objstr = obj.to_string(**kwargs)
    else:
        objstr = str(obj)
    clipboard_set(objstr)
Пример #14
0
 def test_to_csv_quote_none(self):
     # GH4328
     df = DataFrame({'A': ['hello', '{"hello"}']})
     for encoding in (None, 'utf-8'):
         buf = StringIO()
         df.to_csv(buf, quoting=csv.QUOTE_NONE,
                   encoding=encoding, index=False)
         result = buf.getvalue()
         expected = 'A\nhello\n{"hello"}\n'
         self.assertEqual(result, expected)
Пример #15
0
    def test_to_csv_line_terminators(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                       index=['one', 'two', 'three'])

        buf = StringIO()
        df.to_csv(buf, line_terminator='\r\n')
        expected = (',A,B\r\n'
                    'one,1,4\r\n'
                    'two,2,5\r\n'
                    'three,3,6\r\n')
        self.assertEqual(buf.getvalue(), expected)

        buf = StringIO()
        df.to_csv(buf)  # The default line terminator remains \n
        expected = (',A,B\n'
                    'one,1,4\n'
                    'two,2,5\n'
                    'three,3,6\n')
        self.assertEqual(buf.getvalue(), expected)
Пример #16
0
    def test_to_csv_from_csv_categorical(self):

        # CSV with categoricals should result in the same output as when one
        # would add a "normal" Series/DataFrame.
        s = Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        s2 = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        res = StringIO()
        s.to_csv(res)
        exp = StringIO()
        s2.to_csv(exp)
        self.assertEqual(res.getvalue(), exp.getvalue())

        df = DataFrame({"s": s})
        df2 = DataFrame({"s": s2})
        res = StringIO()
        df.to_csv(res)
        exp = StringIO()
        df2.to_csv(exp)
        self.assertEqual(res.getvalue(), exp.getvalue())
Пример #17
0
    def test_to_csv_index_no_leading_comma(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                       index=['one', 'two', 'three'])

        buf = StringIO()
        df.to_csv(buf, index_label=False)
        expected = ('A,B\n'
                    'one,1,4\n'
                    'two,2,5\n'
                    'three,3,6\n')
        self.assertEqual(buf.getvalue(), expected)
Пример #18
0
def profiled():
    pr = cProfile.Profile()
    pr.enable()
    yield
    pr.disable()
    s = StringIO()
    ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
    ps.print_stats()
    # uncomment this to see who's calling what
    # ps.print_callers()
    print(s.getvalue())
Пример #19
0
    def test_to_csv_index_no_leading_comma(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                       index=['one', 'two', 'three'])

        buf = StringIO()
        df.to_csv(buf, index_label=False)
        expected = ('A,B\n'
                    'one,1,4\n'
                    'two,2,5\n'
                    'three,3,6\n')
        assert buf.getvalue() == expected
Пример #20
0
 def test_to_csv_quote_none(self):
     # GH4328
     df = DataFrame({'A': ['hello', '{"hello"}']})
     for encoding in (None, 'utf-8'):
         buf = StringIO()
         df.to_csv(buf,
                   quoting=csv.QUOTE_NONE,
                   encoding=encoding,
                   index=False)
         result = buf.getvalue()
         expected = 'A\nhello\n{"hello"}\n'
         self.assertEqual(result, expected)
Пример #21
0
def test_to_csv_gcs(mock):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})
    with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem:
        s = StringIO()
        instance = MockFileSystem.return_value
        instance.open.return_value = s

        df1.to_csv('gs://test/test.csv', index=True)
        df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)
Пример #22
0
    def test_to_csv_index_no_leading_comma(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                       index=['one', 'two', 'three'])

        buf = StringIO()
        df.to_csv(buf, index_label=False)

        expected_rows = ['A,B',
                         'one,1,4',
                         'two,2,5',
                         'three,3,6']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert buf.getvalue() == expected
Пример #23
0
    def test_to_csv_index_no_leading_comma(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                       index=['one', 'two', 'three'])

        buf = StringIO()
        df.to_csv(buf, index_label=False)

        expected_rows = ['A,B',
                         'one,1,4',
                         'two,2,5',
                         'three,3,6']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert buf.getvalue() == expected
    def test_to_csv_unicodewriter_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf,
                  index=False,
                  quoting=csv.QUOTE_NONNUMERIC,
                  encoding='utf-8')

        result = buf.getvalue()
        expected_rows = ['"A","B"', '1,"foo"', '2,"bar"', '3,"baz"']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert result == expected
Пример #25
0
    def test_to_csv_unicodewriter_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf,
                  index=False,
                  quoting=csv.QUOTE_NONNUMERIC,
                  encoding='utf-8')

        result = buf.getvalue()
        expected = ('"A","B"\n' '1,"foo"\n' '2,"bar"\n' '3,"baz"\n')

        self.assertEqual(result, expected)
Пример #26
0
 def test_info_shows_column_dtypes(self):
     dtypes = ["int64", "float64", "datetime64[ns]", "timedelta64[ns]", "complex128", "object", "bool"]
     data = {}
     n = 10
     for i, dtype in enumerate(dtypes):
         data[i] = np.random.randint(2, size=n).astype(dtype)
     df = DataFrame(data)
     buf = StringIO()
     df.info(buf=buf)
     res = buf.getvalue()
     for i, dtype in enumerate(dtypes):
         name = "%d    %d non-null %s" % (i, n, dtype)
         assert name in res
Пример #27
0
    def save(self):
        # create the writer & save
        if self.encoding is None:
            if compat.PY2:
                encoding = 'ascii'
            else:
                encoding = 'utf-8'
        else:
            encoding = self.encoding

        # PR 21300 uses string buffer to receive csv writing and dump into
        # file-like output with compression as option. GH 21241, 21118
        f = StringIO()
        if not is_file_like(self.path_or_buf):
            # path_or_buf is path
            path_or_buf = self.path_or_buf
        elif hasattr(self.path_or_buf, 'name'):
            # path_or_buf is file handle
            path_or_buf = self.path_or_buf.name
        else:
            # path_or_buf is file-like IO objects.
            f = self.path_or_buf
            path_or_buf = None

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep,
                                 quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            # GH 17778 handles zip compression for byte strings separately.
            buf = f.getvalue()
            if path_or_buf:
                f, handles = _get_handle(path_or_buf,
                                         self.mode,
                                         encoding=encoding,
                                         compression=self.compression)
                f.write(buf)
                f.close()
                for _fh in handles:
                    _fh.close()
Пример #28
0
    def test_to_csv_unicodewriter_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC,
                  encoding='utf-8')

        result = buf.getvalue()
        expected_rows = ['"A","B"',
                         '1,"foo"',
                         '2,"bar"',
                         '3,"baz"']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert result == expected
Пример #29
0
    def test_to_csv_quote_none(self):
        # GH4328
        df = DataFrame({'A': ['hello', '{"hello"}']})
        for encoding in (None, 'utf-8'):
            buf = StringIO()
            df.to_csv(buf, quoting=csv.QUOTE_NONE,
                      encoding=encoding, index=False)

            result = buf.getvalue()
            expected_rows = ['A',
                             'hello',
                             '{"hello"}']
            expected = tm.convert_rows_list_to_csv_str(expected_rows)
            assert result == expected
Пример #30
0
    def test_to_csv_quote_none(self):
        # GH4328
        df = DataFrame({'A': ['hello', '{"hello"}']})
        for encoding in (None, 'utf-8'):
            buf = StringIO()
            df.to_csv(buf, quoting=csv.QUOTE_NONE,
                      encoding=encoding, index=False)

            result = buf.getvalue()
            expected_rows = ['A',
                             'hello',
                             '{"hello"}']
            expected = tm.convert_rows_list_to_csv_str(expected_rows)
            assert result == expected
Пример #31
0
def test_to_csv_gcs(monkeypatch):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})
    s = StringIO()

    class MockGCSFileSystem():
        def open(*args):
            return s

    monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
    df1.to_csv('gs://test/example.csv', index=True)
    df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)
Пример #32
0
    def test_to_csv_unicodewriter_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC,
                  encoding='utf-8')

        result = buf.getvalue()
        expected = ('"A","B"\n'
                    '1,"foo"\n'
                    '2,"bar"\n'
                    '3,"baz"\n')

        self.assertEqual(result, expected)
Пример #33
0
 def test_info_shows_column_dtypes(self):
     dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
               'complex128', 'object', 'bool']
     data = {}
     n = 10
     for i, dtype in enumerate(dtypes):
         data[i] = np.random.randint(2, size=n).astype(dtype)
     df = DataFrame(data)
     buf = StringIO()
     df.info(buf=buf)
     res = buf.getvalue()
     for i, dtype in enumerate(dtypes):
         name = '%d    %d non-null %s' % (i, n, dtype)
         assert name in res
Пример #34
0
def test_to_csv_gcs(monkeypatch):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})
    s = StringIO()

    class MockGCSFileSystem():
        def open(*args):
            return s

    monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
    df1.to_csv('gs://test/test.csv', index=True)
    df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)
Пример #35
0
    def test_repr_bool_fails(self):
        s = Series([DataFrame(np.random.randn(2, 2)) for i in range(5)])

        import sys

        buf = StringIO()
        tmp = sys.stderr
        sys.stderr = buf
        try:
            # it works (with no Cython exception barf)!
            repr(s)
        finally:
            sys.stderr = tmp
        self.assertEqual(buf.getvalue(), '')
Пример #36
0
    def save(self):
        # create the writer & save
        if self.encoding is None:
            if compat.PY2:
                encoding = 'ascii'
            else:
                encoding = 'utf-8'
        else:
            encoding = self.encoding

        # PR 21300 uses string buffer to receive csv writing and dump into
        # file-like output with compression as option. GH 21241, 21118
        f = StringIO()
        if not is_file_like(self.path_or_buf):
            # path_or_buf is path
            path_or_buf = self.path_or_buf
        elif hasattr(self.path_or_buf, 'name'):
            # path_or_buf is file handle
            path_or_buf = self.path_or_buf.name
        else:
            # path_or_buf is file-like IO objects.
            f = self.path_or_buf
            path_or_buf = None

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            # GH 17778 handles zip compression for byte strings separately.
            buf = f.getvalue()
            if path_or_buf:
                f, handles = _get_handle(path_or_buf, self.mode,
                                         encoding=encoding,
                                         compression=self.compression)
                f.write(buf)
                f.close()
                for _fh in handles:
                    _fh.close()
Пример #37
0
    def test_to_csv_from_csv_categorical(self):

        # CSV with categoricals should result in the same output
        # as when one would add a "normal" Series/DataFrame.
        s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
        s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
        res = StringIO()

        s.to_csv(res, header=False)
        exp = StringIO()

        s2.to_csv(exp, header=False)
        assert res.getvalue() == exp.getvalue()

        df = DataFrame({"s": s})
        df2 = DataFrame({"s": s2})

        res = StringIO()
        df.to_csv(res)

        exp = StringIO()
        df2.to_csv(exp)

        assert res.getvalue() == exp.getvalue()
Пример #38
0
    def test_to_csv_from_csv_categorical(self):

        # CSV with categoricals should result in the same output
        # as when one would add a "normal" Series/DataFrame.
        s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
        s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
        res = StringIO()

        s.to_csv(res, header=False)
        exp = StringIO()

        s2.to_csv(exp, header=False)
        assert res.getvalue() == exp.getvalue()

        df = DataFrame({"s": s})
        df2 = DataFrame({"s": s2})

        res = StringIO()
        df.to_csv(res)

        exp = StringIO()
        df2.to_csv(exp)

        assert res.getvalue() == exp.getvalue()
Пример #39
0
def _get_pretty_string(obj):
    """Return a prettier version of obj

    Parameters
    ----------
    obj : object
        Object to pretty print

    Returns
    -------
    s : str
        Pretty print object repr
    """
    sio = StringIO()
    pprint.pprint(obj, stream=sio)
    return sio.getvalue()
Пример #40
0
def _get_pretty_string(obj):
    """Return a prettier version of obj

    Parameters
    ----------
    obj : object
        Object to pretty print

    Returns
    -------
    s : str
        Pretty print object repr
    """
    sio = StringIO()
    pprint.pprint(obj, stream=sio)
    return sio.getvalue()
Пример #41
0
def test_to_csv_gcs(mock):
    df1 = DataFrame({
        'int': [1, 3],
        'float': [2.0, np.nan],
        'str': ['t', 's'],
        'dt': date_range('2018-06-18', periods=2)
    })
    with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem:
        s = StringIO()
        instance = MockFileSystem.return_value
        instance.open.return_value = s

        df1.to_csv('gs://test/test.csv', index=True)
        df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)
Пример #42
0
    def test_info_wide(self):
        from pandas import set_option, reset_option
        io = StringIO()
        df = DataFrame(np.random.randn(5, 101))
        df.info(buf=io)

        io = StringIO()
        df.info(buf=io, max_cols=101)
        rs = io.getvalue()
        self.assertTrue(len(rs.splitlines()) > 100)
        xp = rs

        set_option('display.max_info_columns', 101)
        io = StringIO()
        df.info(buf=io)
        self.assertEqual(rs, xp)
        reset_option('display.max_info_columns')
Пример #43
0
    def test_info_wide(self):
        from pandas import set_option, reset_option
        io = StringIO()
        df = DataFrame(np.random.randn(5, 101))
        df.info(buf=io)

        io = StringIO()
        df.info(buf=io, max_cols=101)
        rs = io.getvalue()
        self.assertTrue(len(rs.splitlines()) > 100)
        xp = rs

        set_option('display.max_info_columns', 101)
        io = StringIO()
        df.info(buf=io)
        self.assertEqual(rs, xp)
        reset_option('display.max_info_columns')
Пример #44
0
    def test_read_csv_chunked_download(self, s3_resource, caplog):
        # 8 MB, S3FS usees 5MB chunks
        df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
        buf = BytesIO()
        str_buf = StringIO()

        df.to_csv(str_buf)

        buf = BytesIO(str_buf.getvalue().encode('utf-8'))

        s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv",
                                                     Body=buf)

        with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
            read_csv("s3://pandas-test/large-file.csv", nrows=5)
            # log of fetch_range (start, stop)
            assert ((0, 5505024) in set(x.args[-2:] for x in caplog.records))
Пример #45
0
def to_clipboard(obj, excel=None, sep=None, **kwargs):  # pragma: no cover
    """
    Attempt to write text representation of object to the system clipboard
    The clipboard can be then pasted into Excel for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : boolean, defaults to True
            if True, use the provided separator, writing in a csv
            format for allowing easy pasting into excel.
            if False, write a string representation of the object
            to the clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with gtk or PyQt4 modules)
      - Windows:
      - OS X:
    """
    from pandas.util.clipboard import clipboard_set
    if excel is None:
        excel = True

    if excel:
        try:
            if sep is None:
                sep = '\t'
            buf = StringIO()
            obj.to_csv(buf, sep=sep, **kwargs)
            clipboard_set(buf.getvalue())
            return
        except:
            pass

    if isinstance(obj, DataFrame):
        # str(df) has various unhelpful defaults, like truncation
        with option_context('display.max_colwidth', 999999):
            objstr = obj.to_string(**kwargs)
    else:
        objstr = str(obj)
    clipboard_set(objstr)
    def test_info_memory(self):
        # https://github.com/pandas-dev/pandas/issues/21056
        df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')})
        buf = StringIO()
        df.info(buf=buf)
        result = buf.getvalue()
        bytes = float(df.memory_usage().sum())

        expected = textwrap.dedent("""\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 2 entries, 0 to 1
        Data columns (total 1 columns):
        a    2 non-null int64
        dtypes: int64(1)
        memory usage: {} bytes
        """.format(bytes))

        assert result == expected
Пример #47
0
def test_to_html(biggie_df_fixture):
    # TODO: split this test
    df = biggie_df_fixture
    s = df.to_html()

    buf = StringIO()
    retval = df.to_html(buf=buf)
    assert retval is None
    assert buf.getvalue() == s

    assert isinstance(s, str)

    df.to_html(columns=['B', 'A'], col_space=17)
    df.to_html(columns=['B', 'A'],
               formatters={'A': lambda x: '{x:.1f}'.format(x=x)})

    df.to_html(columns=['B', 'A'], float_format=str)
    df.to_html(columns=['B', 'A'], col_space=12, float_format=str)
Пример #48
0
def predict(args):
    predict_curl = pycurl.Curl()
    storage = StringIO()
    values = [(args.input_node_name, (pycurl.FORM_FILE, args.file_path))]
    predict_curl.setopt(predict_curl.URL, args.server_url)
    predict_curl.setopt(predict_curl.WRITEFUNCTION, storage.write)
    predict_curl.setopt(predict_curl.HTTPPOST, values)
    predict_curl.perform()
    predict_curl.close()
    content = storage.getvalue()
    content = content.replace('\n', '')
    res = json.loads(content)

    if os.path.isfile(args.label_file_path):
        labels = [
            line.strip() for line in open(args.label_file_path).readlines()
        ]
    if labels is not None:
        for item in res["prediction"]:
            item['class'] = labels[int(item['class'])]
    print res
Пример #49
0
    def test_verbose_import(self):
        text = """a,b,c,d
one,1,2,3
one,1,2,3
,1,2,3
one,1,2,3
,1,2,3
,1,2,3
one,1,2,3
two,1,2,3"""

        buf = StringIO()
        sys.stdout = buf

        try:  # engines are verbose in different ways
            self.read_csv(StringIO(text), verbose=True)
            if self.engine == 'c':
                self.assertIn('Tokenization took:', buf.getvalue())
                self.assertIn('Parser memory cleanup took:', buf.getvalue())
            else:  # Python engine
                self.assertEqual(buf.getvalue(),
                                 'Filled 3 NA values in column a\n')
        finally:
            sys.stdout = sys.__stdout__

        buf = StringIO()
        sys.stdout = buf

        text = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""

        try:  # engines are verbose in different ways
            self.read_csv(StringIO(text), verbose=True, index_col=0)
            if self.engine == 'c':
                self.assertIn('Tokenization took:', buf.getvalue())
                self.assertIn('Parser memory cleanup took:', buf.getvalue())
            else:  # Python engine
                self.assertEqual(buf.getvalue(),
                                 'Filled 1 NA values in column a\n')
        finally:
            sys.stdout = sys.__stdout__
Пример #50
0
def to_clipboard(obj, excel=None, sep=None, **kwargs):  # pragma: no cover
    """
    Attempt to write text representation of object to the system clipboard
    The clipboard can be then pasted into Excel for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : boolean, defaults to True
            if True, use the provided separator, writing in a csv
            format for allowing easy pasting into excel.
            if False, write a string representation of the object
            to the clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with gtk or PyQt4 modules)
      - Windows:
      - OS X:
    """
    from pandas.util.clipboard import clipboard_set
    if excel is None:
        excel = True

    if excel:
        try:
            if sep is None:
                sep = '\t'
            buf = StringIO()
            obj.to_csv(buf, sep=sep, **kwargs)
            clipboard_set(buf.getvalue())
            return
        except:
            pass

    clipboard_set(str(obj))
Пример #51
0
    def test_info_memory_usage(self):
        # Ensure memory usage is displayed, when asserted, on the last line
        dtypes = [
            'int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
            'complex128', 'object', 'bool'
        ]
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()

        # display memory usage case
        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert "memory usage: " in res[-1]

        # do not display memory usage case
        df.info(buf=buf, memory_usage=False)
        res = buf.getvalue().splitlines()
        assert "memory usage: " not in res[-1]

        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # memory usage is a lower bound, so print it as XYZ+ MB
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df.iloc[:, :5].info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # excluded column with object dtype, so estimate is accurate
        assert not re.match(r"memory usage: [^+]+\+", res[-1])

        # POJO.Test a DataFrame with duplicate columns
        dtypes = ['int64', 'int64', 'int64', 'float64']
        data = {}
        n = 100
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        df.columns = dtypes

        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        df_with_object_index.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df_with_object_index.info(buf=buf, memory_usage='deep')
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+$", res[-1])

        # Ensure df size is as expected
        # (cols * rows * bytes) + index size
        df_size = df.memory_usage().sum()
        exp_size = len(dtypes) * n * 8 + df.index.nbytes
        assert df_size == exp_size

        # Ensure number of cols in memory_usage is the same as df
        size_df = np.size(df.columns.values) + 1  # index=True; default
        assert size_df == np.size(df.memory_usage())

        # assert deep works only on object
        assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

        # test for validity
        DataFrame(1, index=['a'], columns=['A']).memory_usage(index=True)
        DataFrame(1, index=['a'], columns=['A']).index.nbytes
        df = DataFrame(data=1,
                       index=pd.MultiIndex.from_product([['a'],
                                                         range(1000)]),
                       columns=['A'])
        df.index.nbytes
        df.memory_usage(index=True)
        df.index.values.nbytes

        mem = df.memory_usage(deep=True).sum()
        assert mem > 0
Пример #52
0
    def test_to_csv_quoting(self):
        df = DataFrame({
            'c_bool': [True, False],
            'c_float': [1.0, 3.2],
            'c_int': [42, np.nan],
            'c_string': ['a', 'b,c'],
        })

        expected = """\
,c_bool,c_float,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,"b,c"
"""
        result = df.to_csv()
        assert result == expected

        result = df.to_csv(quoting=None)
        assert result == expected

        result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
        assert result == expected

        expected = """\
"","c_bool","c_float","c_int","c_string"
"0","True","1.0","42.0","a"
"1","False","3.2","","b,c"
"""
        result = df.to_csv(quoting=csv.QUOTE_ALL)
        assert result == expected

        # see gh-12922, gh-13259: make sure changes to
        # the formatters do not break this behaviour
        expected = """\
"","c_bool","c_float","c_int","c_string"
0,True,1.0,42.0,"a"
1,False,3.2,"","b,c"
"""
        result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
        assert result == expected

        msg = "need to escape, but no escapechar set"
        tm.assert_raises_regex(csv.Error, msg, df.to_csv,
                               quoting=csv.QUOTE_NONE)
        tm.assert_raises_regex(csv.Error, msg, df.to_csv,
                               quoting=csv.QUOTE_NONE,
                               escapechar=None)

        expected = """\
,c_bool,c_float,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,b!,c
"""
        result = df.to_csv(quoting=csv.QUOTE_NONE,
                           escapechar='!')
        assert result == expected

        expected = """\
,c_bool,c_ffloat,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,bf,c
"""
        result = df.to_csv(quoting=csv.QUOTE_NONE,
                           escapechar='f')
        assert result == expected

        # see gh-3503: quoting Windows line terminators
        # presents with encoding?
        text = 'a,b,c\n1,"test \r\n",3\n'
        df = pd.read_csv(StringIO(text))
        buf = StringIO()
        df.to_csv(buf, encoding='utf-8', index=False)
        assert buf.getvalue() == text

        # xref gh-7791: make sure the quoting parameter is passed through
        # with multi-indexes
        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
        df = df.set_index(['a', 'b'])
        expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
        assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
Пример #53
0
    def summary(self):
        """
        This returns the formatted result of the OLS computation
        """
        template = """
%(bannerTop)s

Formula: Y ~ %(formula)s

Number of Observations:         %(nobs)d
Number of Degrees of Freedom:   %(df)d

R-squared:     %(r2)10.4f
Adj R-squared: %(r2_adj)10.4f

Rmse:          %(rmse)10.4f

F-stat %(f_stat_shape)s: %(f_stat)10.4f, p-value: %(f_stat_p_value)10.4f

Degrees of Freedom: model %(df_model)d, resid %(df_resid)d

%(bannerCoef)s
%(coef_table)s
%(bannerEnd)s
"""
        coef_table = self._coef_table

        results = self._results

        f_stat = results['f_stat']

        bracketed = ['<%s>' % str(c) for c in results['beta'].index]

        formula = StringIO()
        formula.write(bracketed[0])
        tot = len(bracketed[0])
        line = 1
        for coef in bracketed[1:]:
            tot = tot + len(coef) + 3

            if tot // (68 * line):
                formula.write('\n' + ' ' * 12)
                line += 1

            formula.write(' + ' + coef)

        params = {
            'bannerTop': scom.banner('Summary of Regression Analysis'),
            'bannerCoef': scom.banner('Summary of Estimated Coefficients'),
            'bannerEnd': scom.banner('End of Summary'),
            'formula': formula.getvalue(),
            'r2': results['r2'],
            'r2_adj': results['r2_adj'],
            'nobs': results['nobs'],
            'df': results['df'],
            'df_model': results['df_model'],
            'df_resid': results['df_resid'],
            'coef_table': coef_table,
            'rmse': results['rmse'],
            'f_stat': f_stat['f-stat'],
            'f_stat_shape': '(%d, %d)' % (f_stat['DF X'], f_stat['DF Resid']),
            'f_stat_p_value': f_stat['p-value'],
        }

        return template % params
visualizer = PredictionError(RandomForestRegressor())
# Fit
visualizer.fit(X_train, y_train)
# Score and visualize
visualizer.score(X_test, y_test)
visualizer.poof()

from sklearn import tree
from IPython.core.display import Image
from pandas.compat import StringIO

import pydotplus
# Visualize tree
dot_data = StringIO()
tree.export_graphviz(model.estimators_[0], out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
image = graph.write("random_network")

from sklearn.ensemble import AdaBoostRegressor

model = AdaBoostRegressor()
model.fit(X_train, y_train)
yhat = model.predict(X_test)
r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)
print("r2={:0.3f} MSE={:0.3f}".format(r2, me))

from yellowbrick.regressor import PredictionError
# Instantiate the visualizer
visualizer = PredictionError(AdaBoostRegressor())
# Fit
Пример #55
0
 def test_dtype_name_in_info(self, data):
     buf = StringIO()
     pd.DataFrame({"A": data}).info(buf=buf)
     result = buf.getvalue()
     assert data.dtype.name in result
Пример #56
0
    def save(self):
        """
        Create the writer & save
        """
        # GH21227 internal compression is not used when file-like passed.
        if self.compression and hasattr(self.path_or_buf, 'write'):
            msg = ("compression has no effect when passing file-like "
                   "object as input.")
            warnings.warn(msg, RuntimeWarning, stacklevel=2)

        # when zip compression is called.
        is_zip = isinstance(self.path_or_buf,
                            ZipFile) or (not hasattr(self.path_or_buf, 'write')
                                         and self.compression == 'zip')

        if is_zip:
            # zipfile doesn't support writing string to archive. uses string
            # buffer to receive csv writing and dump into zip compression
            # file handle. GH21241, GH21118
            f = StringIO()
            close = False
        elif hasattr(self.path_or_buf, 'write'):
            f = self.path_or_buf
            close = False
        else:
            f, handles = _get_handle(self.path_or_buf,
                                     self.mode,
                                     encoding=self.encoding,
                                     compression=self.compression)
            close = True

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep,
                                 quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if self.encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = self.encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            if is_zip:
                # GH17778 handles zip compression separately.
                buf = f.getvalue()
                if hasattr(self.path_or_buf, 'write'):
                    self.path_or_buf.write(buf)
                else:
                    f, handles = _get_handle(self.path_or_buf,
                                             self.mode,
                                             encoding=self.encoding,
                                             compression=self.compression)
                    f.write(buf)
                    close = True
            if close:
                f.close()
                for _fh in handles:
                    _fh.close()
Пример #57
0
    def test_info_memory_usage(self):
        # Ensure memory usage is displayed, when asserted, on the last line
        dtypes = [
            'int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
            'complex128', 'object', 'bool'
        ]
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()

        # display memory usage case
        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        self.assertTrue("memory usage: " in res[-1])

        # do not display memory usage cas
        df.info(buf=buf, memory_usage=False)
        res = buf.getvalue().splitlines()
        self.assertTrue("memory usage: " not in res[-1])

        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # memory usage is a lower bound, so print it as XYZ+ MB
        self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

        df.iloc[:, :5].info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # excluded column with object dtype, so estimate is accurate
        self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1]))

        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        df_with_object_index.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

        df_with_object_index.info(buf=buf, memory_usage='deep')
        res = buf.getvalue().splitlines()
        self.assertTrue(re.match(r"memory usage: [^+]+$", res[-1]))

        self.assertGreater(
            df_with_object_index.memory_usage(index=True, deep=True).sum(),
            df_with_object_index.memory_usage(index=True).sum())

        df_object = pd.DataFrame({'a': ['a']})
        self.assertGreater(
            df_object.memory_usage(deep=True).sum(),
            df_object.memory_usage().sum())

        # Test a DataFrame with duplicate columns
        dtypes = ['int64', 'int64', 'int64', 'float64']
        data = {}
        n = 100
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        df.columns = dtypes

        # Ensure df size is as expected
        # (cols * rows * bytes) + index size
        df_size = df.memory_usage().sum()
        exp_size = len(dtypes) * n * 8 + df.index.nbytes
        self.assertEqual(df_size, exp_size)

        # Ensure number of cols in memory_usage is the same as df
        size_df = np.size(df.columns.values) + 1  # index=True; default
        self.assertEqual(size_df, np.size(df.memory_usage()))

        # assert deep works only on object
        self.assertEqual(df.memory_usage().sum(),
                         df.memory_usage(deep=True).sum())

        # test for validity
        DataFrame(1, index=['a'], columns=['A']).memory_usage(index=True)
        DataFrame(1, index=['a'], columns=['A']).index.nbytes
        df = DataFrame(data=1,
                       index=pd.MultiIndex.from_product([['a'],
                                                         range(1000)]),
                       columns=['A'])
        df.index.nbytes
        df.memory_usage(index=True)
        df.index.values.nbytes

        # sys.getsizeof will call the .memory_usage with
        # deep=True, and add on some GC overhead
        diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
        self.assertTrue(abs(diff) < 100)
Пример #58
0
 def test_dump_to_file(self):
     f = StringIO()
     ujson.dump([1, 2, 3], f)
     assert "[1,2,3]" == f.getvalue()