def test_to_latex_filename(self, frame):
    """Check that ``to_latex`` written to a path equals the returned string.

    Exercises a plain frame, a non-ASCII frame written with an explicit
    ``utf-8`` encoding (GH 7061), and the same frame written without an
    encoding argument.
    """
    with tm.ensure_clean('test.tex') as path:
        frame.to_latex(path)

        with open(path, 'r') as handle:
            assert frame.to_latex() == handle.read()

    # test with utf-8 and encoding option (GH 7061)
    df = DataFrame([[u'au\xdfgangen']])
    with tm.ensure_clean('test.tex') as path:
        df.to_latex(path, encoding='utf-8')
        with codecs.open(path, 'r', encoding='utf-8') as handle:
            assert df.to_latex() == handle.read()

    # test with utf-8 without encoding option
    if not compat.PY3:
        # python2 default encoding is ascii, so an error should be raised
        with tm.ensure_clean('test.tex') as path:
            with pytest.raises(UnicodeEncodeError):
                df.to_latex(path)
    else:
        # python3: pandas default encoding is utf-8
        with tm.ensure_clean('test.tex') as path:
            df.to_latex(path)
            with codecs.open(path, 'r', encoding='utf-8') as handle:
                assert df.to_latex() == handle.read()
def test_xz(self):
    """Read xz-compressed CSV data by path, by open handle and by inference.

    The ``lzma`` module is whatever ``tm._skip_if_no_lzma()`` returns.
    """
    lzma = tm._skip_if_no_lzma()

    with open(self.csv1, "rb") as data_file:
        data = data_file.read()
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # Use a context manager so the archive is closed even if the
        # write raises.
        with lzma.LZMAFile(path, mode="wb") as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression="xz")
        tm.assert_frame_equal(result, expected)

        with open(path, "rb") as f:
            result = self.read_csv(f, compression="xz")
            tm.assert_frame_equal(result, expected)

    # The compression type is inferred from the ".xz" file name here.
    with tm.ensure_clean("test.xz") as path:
        with lzma.LZMAFile(path, mode="wb") as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression="infer")
        tm.assert_frame_equal(result, expected)
def test_to_csv_quotechar(self):
    """Check ``to_csv`` output with QUOTE_ALL for default and custom quotechar."""
    df = DataFrame({'col': [1, 2]})

    default_quoted = """\
"","col"
"0","1"
"1","2"
"""
    dollar_quoted = """\
$$,$col$
$0$,$1$
$1$,$2$
"""

    # quoting=1 is QUOTE_ALL; each case adds extra keyword arguments on top.
    cases = [
        ({}, default_quoted),
        ({'quotechar': "$"}, dollar_quoted),
    ]
    for extra_kwargs, expected in cases:
        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, **extra_kwargs)
            with open(path, 'r') as handle:
                assert handle.read() == expected

    # A missing quote character must be rejected when quoting is requested.
    with tm.ensure_clean('test.csv') as path:
        with pytest.raises(TypeError, match='quotechar'):
            df.to_csv(path, quoting=1, quotechar=None)
def test_gzip(self):
    """Read gzip-compressed CSV data by path, by open handle and by inference.

    Raises ``nose.SkipTest`` when ``gzip`` cannot be imported.
    """
    try:
        import gzip
    except ImportError:
        raise nose.SkipTest("need gzip to run")

    with open(self.csv1, "rb") as data_file:
        data = data_file.read()
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # Use a context manager so the archive is closed even if the
        # write raises.
        with gzip.GzipFile(path, mode="wb") as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression="gzip")
        tm.assert_frame_equal(result, expected)

        with open(path, "rb") as f:
            result = self.read_csv(f, compression="gzip")
            tm.assert_frame_equal(result, expected)

    # The compression type is inferred from the ".gz" file name here.
    with tm.ensure_clean("test.gz") as path:
        with gzip.GzipFile(path, mode="wb") as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression="infer")
        tm.assert_frame_equal(result, expected)
def test_to_csv_line_terminators(self):
    """Check the raw bytes ``to_csv`` writes for CRLF, LF and default endings."""
    # see gh-20353
    df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                   index=['one', 'two', 'three'])

    def written_bytes(**to_csv_kwargs):
        # Write the frame to a fresh temp file and return its binary content.
        with ensure_clean() as path:
            df.to_csv(path, **to_csv_kwargs)
            with open(path, mode='rb') as handle:
                return handle.read()

    # case 1: CRLF as line terminator
    crlf_expected = b',A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n'
    assert written_bytes(line_terminator='\r\n') == crlf_expected

    # case 2: LF as line terminator
    lf_expected = b',A,B\none,1,4\ntwo,2,5\nthree,3,6\n'
    assert written_bytes(line_terminator='\n') == lf_expected

    # case 3: The default line terminator(=os.linesep)(gh-21406)
    os_linesep = os.linesep.encode('utf-8')
    rows = [b',A,B', b'one,1,4', b'two,2,5', b'three,3,6']
    default_expected = b''.join(row + os_linesep for row in rows)
    assert written_bytes() == default_expected
def test_bz2(self):
    """Read bz2-compressed CSV data by path, by open handle and by inference.

    Also checks that the unknown compression name ``"bz3"`` raises
    ``ValueError``. Raises ``nose.SkipTest`` when ``bz2`` cannot be imported.
    """
    try:
        import bz2
    except ImportError:
        raise nose.SkipTest("need bz2 to run")

    with open(self.csv1, "rb") as data_file:
        data = data_file.read()
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # Use a context manager so the archive is closed even if the
        # write raises.
        with bz2.BZ2File(path, mode="wb") as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression="bz2")
        tm.assert_frame_equal(result, expected)

        self.assertRaises(ValueError, self.read_csv,
                          path, compression="bz3")

        with open(path, "rb") as fin:
            result = self.read_csv(fin, compression="bz2")
            tm.assert_frame_equal(result, expected)

    # The compression type is inferred from the ".bz2" file name here.
    with tm.ensure_clean("test.bz2") as path:
        with bz2.BZ2File(path, mode="wb") as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression="infer")
        tm.assert_frame_equal(result, expected)
def test_read_infer(self, ext, get_random_path):
    """Check ``read_pickle`` infers the compression from the file extension.

    A frame is pickled uncompressed, compressed with the method mapped to
    ``ext`` (``None`` when no mapping matches), and read back without an
    explicit ``compression`` argument.
    """
    if ext == '.xz':
        tm._skip_if_no_lzma()

    raw_name = get_random_path + ".raw"
    compressed_name = get_random_path + ext

    # First compression name whose registered extension equals ``ext``.
    ext_map = self._compression_to_extension
    compression = next(
        (name for name in ext_map if ext_map[name] == ext), None)

    with tm.ensure_clean(raw_name) as p1, \
            tm.ensure_clean(compressed_name) as p2:
        df = tm.makeDataFrame()

        # write to uncompressed file
        df.to_pickle(p1, compression=None)

        # compress
        self.compress_file(p1, p2, compression=compression)

        # read compressed file by inferred compression method
        roundtripped = pd.read_pickle(p2)

        tm.assert_frame_equal(df, roundtripped)
def test_write_infer(self, ext, get_random_path):
    """Check ``to_pickle`` infers the compression from the file extension.

    A frame is pickled to ``<base><ext>`` with no explicit compression,
    decompressed with the method mapped to ``ext`` (``None`` when no mapping
    matches), and the decompressed bytes are read back as a plain pickle.
    """
    compressed_name = get_random_path + ext
    raw_name = get_random_path + ".raw"

    # First compression name whose registered extension equals ``ext``.
    ext_map = self._compression_to_extension
    compression = next(
        (name for name in ext_map if ext_map[name] == ext), None)

    with tm.ensure_clean(compressed_name) as p1, \
            tm.ensure_clean(raw_name) as p2:
        df = tm.makeDataFrame()

        # write to compressed file by inferred compression method
        df.to_pickle(p1)

        # decompress
        with tm.decompress_file(p1, compression=compression) as source:
            with open(p2, "wb") as sink:
                sink.write(source.read())

        # read decompressed file
        roundtripped = pd.read_pickle(p2, compression=None)

        tm.assert_frame_equal(df, roundtripped)
def test_to_csv_quotechar(self):
    """Check ``to_csv`` output with QUOTE_ALL for default and custom quotechar."""
    df = DataFrame({'col': [1, 2]})

    def written_text(**extra_kwargs):
        # Write with quoting=1 (QUOTE_ALL) and return the file's text.
        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, **extra_kwargs)
            with open(path, 'r') as handle:
                return handle.read()

    expected = """\
"","col"
"0","1"
"1","2"
"""
    self.assertEqual(written_text(), expected)

    expected = """\
$$,$col$
$0$,$1$
$1$,$2$
"""
    self.assertEqual(written_text(quotechar="$"), expected)

    # A missing quote character must be rejected when quoting is requested.
    with tm.ensure_clean('test.csv') as path:
        with tm.assertRaisesRegexp(TypeError, 'quotechar'):
            df.to_csv(path, quoting=1, quotechar=None)
def test_to_csv_with_single_column(self):
    """Check single-column ``to_csv`` output when one value is missing."""
    # see gh-18676, https://bugs.python.org/issue32255
    #
    # Python's CSV library adds an extraneous '""'
    # before the newline when the NaN-value is in
    # the first row. Otherwise, only the newline
    # character is added. This behavior is inconsistent
    # and was patched in https://bugs.python.org/pull_request4672.
    missing_first = """\
""
1.0
"""
    missing_last = """\
1.0
""
"""
    cases = [
        (DataFrame([None, 1]), missing_first),
        (DataFrame([1, None]), missing_last),
    ]

    for df, expected in cases:
        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, header=None, index=None)
            with open(path, 'r') as handle:
                assert handle.read() == expected
def test_other_compression(self, compress_type, compress_method, ext):
    """Read CSV data compressed by ``compress_method`` in three ways.

    The file is read by path with an explicit ``compress_type``, through an
    open binary handle, and by path with ``compression='infer'`` using a
    ``test.<ext>`` file name. For ``bz2`` the unknown name ``'bz3'`` must
    raise ``ValueError``.
    """
    with open(self.csv1, 'rb') as data_file:
        data = data_file.read()
    expected = self.read_csv(self.csv1)

    def write_compressed(target):
        # Dump the raw CSV bytes through the compressor under test.
        with compress_method(target, mode='wb') as archive:
            archive.write(data)

    with tm.ensure_clean() as path:
        write_compressed(path)

        by_path = self.read_csv(path, compression=compress_type)
        tm.assert_frame_equal(by_path, expected)

        if compress_type == 'bz2':
            with pytest.raises(ValueError):
                self.read_csv(path, compression='bz3')

        with open(path, 'rb') as fin:
            by_handle = self.read_csv(fin, compression=compress_type)
            tm.assert_frame_equal(by_handle, expected)

    with tm.ensure_clean('test.{}'.format(ext)) as path:
        write_compressed(path)

        inferred = self.read_csv(path, compression='infer')
        tm.assert_frame_equal(inferred, expected)
def test_decompression_regex_sep(self):
    """Read compressed CSV data whose separator is the multi-char ``'::'``.

    The source CSV bytes have every ``,`` replaced by ``::`` before being
    gzip- and bz2-compressed. The parsed result must equal the plain CSV.
    Skips when ``gzip`` or ``bz2`` cannot be imported.
    """
    # see gh-6607

    try:
        import gzip
        import bz2
    except ImportError:
        pytest.skip('need gzip and bz2 to run')

    with open(self.csv1, 'rb') as f:
        data = f.read()
    data = data.replace(b',', b'::')
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # Use a context manager so the archive is closed even if the
        # write raises.
        with gzip.GzipFile(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, sep='::', compression='gzip')
        tm.assert_frame_equal(result, expected)

    with tm.ensure_clean() as path:
        with bz2.BZ2File(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, sep='::', compression='bz2')
        tm.assert_frame_equal(result, expected)

        # An unknown compression name must be rejected.
        pytest.raises(ValueError, self.read_csv,
                      path, compression='bz3')
def test_bz2(self):
    """Read bz2-compressed CSV data by path, by open handle and by inference.

    Reading from an open handle is expected to work on Python 3. On
    Python 2 it must raise ``ValueError`` for every engine other than
    ``'python'``. Raises ``nose.SkipTest`` when ``bz2`` cannot be imported.
    """
    try:
        import bz2
    except ImportError:
        raise nose.SkipTest('need bz2 to run')

    with open(self.csv1, 'rb') as data_file:
        data = data_file.read()
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # Use a context manager so the archive is closed even if the
        # write raises.
        with bz2.BZ2File(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression='bz2')
        tm.assert_frame_equal(result, expected)

        self.assertRaises(ValueError, self.read_csv,
                          path, compression='bz3')

        with open(path, 'rb') as fin:
            if compat.PY3:
                result = self.read_csv(fin, compression='bz2')
                tm.assert_frame_equal(result, expected)
            # Compare by value: ``is not`` against a string literal tests
            # object identity, which is not guaranteed for equal strings.
            elif self.engine != 'python':
                self.assertRaises(ValueError, self.read_csv,
                                  fin, compression='bz2')

    # The compression type is inferred from the ".bz2" file name here.
    with tm.ensure_clean('test.bz2') as path:
        with bz2.BZ2File(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression='infer')
        tm.assert_frame_equal(result, expected)
def test_decompression_regex_sep(self):
    """Read compressed CSV data whose separator is the multi-char ``"::"``.

    The source CSV bytes have every ``,`` replaced by ``::`` before being
    gzip- and bz2-compressed. The parsed result must equal the plain CSV.
    Raises ``nose.SkipTest`` when ``gzip`` or ``bz2`` cannot be imported.
    """
    # see gh-6607

    try:
        import gzip
        import bz2
    except ImportError:
        raise nose.SkipTest("need gzip and bz2 to run")

    # Read through a context manager so the source file handle is closed
    # deterministically instead of being left to garbage collection.
    with open(self.csv1, "rb") as data_file:
        data = data_file.read()
    data = data.replace(b",", b"::")
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # Use a context manager so the archive is closed even if the
        # write raises.
        with gzip.GzipFile(path, mode="wb") as tmp:
            tmp.write(data)

        result = self.read_csv(path, sep="::", compression="gzip")
        tm.assert_frame_equal(result, expected)

    with tm.ensure_clean() as path:
        with bz2.BZ2File(path, mode="wb") as tmp:
            tmp.write(data)

        result = self.read_csv(path, sep="::", compression="bz2")
        tm.assert_frame_equal(result, expected)

        # An unknown compression name must be rejected.
        self.assertRaises(ValueError, self.read_csv,
                          path, compression="bz3")
def test_xz(self):
    """Read xz-compressed CSV data by path, by open handle and by inference.

    The ``lzma`` module is obtained from ``compat.import_lzma()``.
    """
    lzma = compat.import_lzma()

    with open(self.csv1, 'rb') as data_file:
        data = data_file.read()
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # Use a context manager so the archive is closed even if the
        # write raises.
        with lzma.LZMAFile(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression='xz')
        tm.assert_frame_equal(result, expected)

        with open(path, 'rb') as f:
            result = self.read_csv(f, compression='xz')
            tm.assert_frame_equal(result, expected)

    # The compression type is inferred from the ".xz" file name here.
    with tm.ensure_clean('test.xz') as path:
        with lzma.LZMAFile(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression='infer')
        tm.assert_frame_equal(result, expected)
def test_gzip(self):
    """Read gzip-compressed CSV data by path, by open handle and by inference."""
    import gzip

    with open(self.csv1, 'rb') as data_file:
        data = data_file.read()
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # Use a context manager so the archive is closed even if the
        # write raises.
        with gzip.GzipFile(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression='gzip')
        tm.assert_frame_equal(result, expected)

        with open(path, 'rb') as f:
            result = self.read_csv(f, compression='gzip')
            tm.assert_frame_equal(result, expected)

    # The compression type is inferred from the ".gz" file name here.
    with tm.ensure_clean('test.gz') as path:
        with gzip.GzipFile(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression='infer')
        tm.assert_frame_equal(result, expected)
def test_bz2(self):
    """Read bz2-compressed CSV data by path, by open handle and by inference.

    Also checks that the unknown compression name ``'bz3'`` raises
    ``ValueError``.
    """
    import bz2

    with open(self.csv1, 'rb') as data_file:
        data = data_file.read()
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # Use a context manager so the archive is closed even if the
        # write raises.
        with bz2.BZ2File(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression='bz2')
        tm.assert_frame_equal(result, expected)

        pytest.raises(ValueError, self.read_csv,
                      path, compression='bz3')

        with open(path, 'rb') as fin:
            result = self.read_csv(fin, compression='bz2')
            tm.assert_frame_equal(result, expected)

    # The compression type is inferred from the ".bz2" file name here.
    with tm.ensure_clean('test.bz2') as path:
        with bz2.BZ2File(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression='infer')
        tm.assert_frame_equal(result, expected)
def test_roundtrip_indexlabels(self):
    """Round-trip frames through Excel and check the recovered index names.

    Covers ``index_label`` given as a one-item list, an over-long list and a
    plain string, then index labels that share a row with the column names.
    """
    _skip_if_no_xlrd()
    ext = self.ext
    path = '__tmp_to_excel_from_excel_indexlabels__.' + ext

    with ensure_clean(path) as path:
        self.frame['A'][:5] = nan

        for extra in ({}, {'cols': ['A', 'B']},
                      {'header': False}, {'index': False}):
            self.frame.to_excel(path, 'test1', **extra)

        # test index_label
        labels = [['test'], ['test', 'dummy', 'dummy2'], 'test']
        for index_label in labels:
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label=index_label)
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

    # test index_labels in same row as column names
    path = '%s.%s' % (tm.rands(10), ext)

    with ensure_clean(path) as path:
        self.frame.to_excel(path, 'test1',
                            cols=['A', 'B', 'C', 'D'], index=False)

        # take 'A' and 'B' as indexes (they are in same row as cols 'C',
        # 'D')
        expected = self.frame.copy().set_index(['A', 'B'])

        reader = ExcelFile(path)
        recons = reader.parse('test1', index_col=[0, 1])
        tm.assert_frame_equal(expected, recons)
def test_to_csv_dups_cols(self):
    """Round-trip frames with duplicate column labels through CSV.

    Covers a single-dtype frame, a mixed-dtype frame, and (GH3457) a frame
    whose duplicate labels are renamed by ``read_csv``.
    """
    from pandas import to_datetime
    from pandas.util.testing import makeCustomDataframe as mkdf

    df = DataFrame(np.random.randn(1000, 30),
                   columns=lrange(15) + lrange(15), dtype='float64')

    with ensure_clean() as filename:
        # single dtype, fine
        df.to_csv(filename)
        result = read_csv(filename, index_col=0)
        result.columns = df.columns
        assert_frame_equal(result, df)

    df_float = DataFrame(np.random.randn(1000, 3), dtype='float64')
    df_int = DataFrame(np.random.randn(1000, 3), dtype='int64')
    shared_index = df_float.index
    df_bool = DataFrame(True, index=shared_index, columns=lrange(3))
    df_object = DataFrame('foo', index=shared_index, columns=lrange(3))
    df_dt = DataFrame(Timestamp('20010101'),
                      index=shared_index, columns=lrange(3))
    pieces = [df_float, df_int, df_bool, df_object, df_dt]
    df = pd.concat(pieces, axis=1, ignore_index=True)

    # Five blocks of three columns, each labelled 0, 1, 2.
    df.columns = [0, 1, 2] * 5

    with ensure_clean() as filename:
        df.to_csv(filename)
        result = read_csv(filename, index_col=0)

        # date cols
        for label in ['0.4', '1.4', '2.4']:
            result[label] = to_datetime(result[label])

        result.columns = df.columns
        assert_frame_equal(result, df)

    # GH3457
    N = 10
    df = mkdf(N, 3)
    df.columns = ['a', 'a', 'b']

    with ensure_clean() as filename:
        df.to_csv(filename)

        # read_csv will rename the dups columns
        result = read_csv(filename, index_col=0).rename(
            columns={'a.1': 'a'})
        assert_frame_equal(result, df)
def test_zip(self):
    """Read zip-compressed CSV data and check the error cases.

    Covers a single-member archive (by path, by inference, and for
    engines other than ``'python'`` by open handle), a two-member archive
    (``'Multiple files'`` error), an empty archive (``'Zero files'`` error)
    and a non-zip handle (``zipfile.BadZipfile``). Raises
    ``nose.SkipTest`` when ``zipfile`` cannot be imported.
    """
    try:
        import zipfile
    except ImportError:
        raise nose.SkipTest('need zipfile to run')

    with open(self.csv1, 'rb') as data_file:
        data = data_file.read()
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean('test_file.zip') as path:
        # Use a context manager so the archive is finalized and closed
        # even if the write raises.
        with zipfile.ZipFile(path, mode='w') as tmp:
            tmp.writestr('test_file', data)

        result = self.read_csv(path, compression='zip')
        tm.assert_frame_equal(result, expected)

        result = self.read_csv(path, compression='infer')
        tm.assert_frame_equal(result, expected)

        # Compare by value: ``is not`` against a string literal tests
        # object identity, which is not guaranteed for equal strings.
        if self.engine != 'python':
            with open(path, 'rb') as f:
                result = self.read_csv(f, compression='zip')
                tm.assert_frame_equal(result, expected)

    with tm.ensure_clean('combined_zip.zip') as path:
        inner_file_names = ['test_file', 'second_file']
        with zipfile.ZipFile(path, mode='w') as tmp:
            for file_name in inner_file_names:
                tmp.writestr(file_name, data)

        self.assertRaisesRegexp(ValueError, 'Multiple files',
                                self.read_csv, path, compression='zip')

        self.assertRaisesRegexp(ValueError, 'Multiple files',
                                self.read_csv, path,
                                compression='infer')

    with tm.ensure_clean() as path:
        # Create an archive that contains no members at all.
        with zipfile.ZipFile(path, mode='w'):
            pass

        self.assertRaisesRegexp(ValueError, 'Zero files',
                                self.read_csv, path, compression='zip')

    with tm.ensure_clean() as path:
        # A freshly truncated file is not a zip archive.
        with open(path, 'wb') as f:
            self.assertRaises(zipfile.BadZipfile, self.read_csv,
                              f, compression='zip')
def test_compression_size_fh(obj, method, compression_only):
    """Check writing to an open handle compresses and leaves it open.

    ``getattr(obj, method)`` is called with a text-mode file handle, once
    with ``compression_only`` and once with ``None``. The handle must stay
    open until its ``with`` block exits, and the compressed file must be
    smaller than the uncompressed one.
    """

    def size_written(compression):
        # Write through a caller-owned handle and report the file size.
        with tm.ensure_clean() as filename:
            with open(filename, 'w') as fh:
                writer = getattr(obj, method)
                writer(fh, compression=compression)
                assert not fh.closed
            assert fh.closed
            return os.path.getsize(filename)

    compressed = size_written(compression_only)
    uncompressed = size_written(None)
    assert uncompressed > compressed
def test_warnings_errors(self):
    """Check ``stash`` warns on a 5-d array and rejects an explicit frame list.

    The array is bound to the module-level name ``e`` while the test runs.
    NOTE(review): presumably ``stash`` discovers variables from the
    caller's globals, which is why a global is used -- confirm against
    ``stash``.
    """
    global e
    e = np.zeros((2, 2, 2, 2, 2))
    try:
        with ensure_clean() as path:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter('always')
                stash(path, verbose=False)
                vault = unstash(path, verbose=False)
                self.assertEqual(len(w), 1)
                self.assertTrue(issubclass(w[-1].category,
                                           UnsupportedDimensionWarning))

        e = np.zeros((2,))
        with ensure_clean() as path:
            self.assertRaises(TypeError, stash, path, frame=[e],
                              verbose=False)
    finally:
        # Always drop the temporary global, even when an assertion above
        # fails, so it cannot leak into later tests.
        del e
def _check_extension_indexlabels(self, ext):
    """Round-trip frames through Excel and check the recovered index names.

    Parameters
    ----------
    ext : str
        File extension (without the dot) used for every temporary workbook.
    """
    path = "__tmp_to_excel_from_excel_indexlabels__." + ext

    with ensure_clean(path) as path:
        self.frame["A"][:5] = nan

        self.frame.to_excel(path, "test1")
        self.frame.to_excel(path, "test1", cols=["A", "B"])
        self.frame.to_excel(path, "test1", header=False)
        self.frame.to_excel(path, "test1", index=False)

        # test index_label
        frame = DataFrame(np.random.randn(10, 2)) >= 0
        frame.to_excel(path, "test1", index_label=["test"])
        reader = ExcelFile(path)
        recons = reader.parse("test1", index_col=0).astype(np.int64)
        frame.index.names = ["test"]
        self.assertEqual(frame.index.names, recons.index.names)

        frame = DataFrame(np.random.randn(10, 2)) >= 0
        frame.to_excel(path, "test1",
                       index_label=["test", "dummy", "dummy2"])
        reader = ExcelFile(path)
        recons = reader.parse("test1", index_col=0).astype(np.int64)
        frame.index.names = ["test"]
        self.assertEqual(frame.index.names, recons.index.names)

        frame = DataFrame(np.random.randn(10, 2)) >= 0
        frame.to_excel(path, "test1", index_label="test")
        reader = ExcelFile(path)
        recons = reader.parse("test1", index_col=0).astype(np.int64)
        frame.index.names = ["test"]
        self.assertEqual(frame.index.names, recons.index.names)

    # test index_labels in same row as column names
    # Honour ``ext`` here as well; a hard-coded ".xls" would make this
    # part exercise the wrong format for any other extension.
    path = "%s.%s" % (tm.rands(10), ext)

    with ensure_clean(path) as path:
        self.frame.to_excel(path, "test1",
                            cols=["A", "B", "C", "D"], index=False)

        # take 'A' and 'B' as indexes (they are in same row as cols 'C',
        # 'D')
        df = self.frame.copy()
        df = df.set_index(["A", "B"])

        reader = ExcelFile(path)
        recons = reader.parse("test1", index_col=[0, 1])
        tm.assert_frame_equal(df, recons)
def test_categorical_warnings_and_errors(self):
    """Check ``to_stata`` on categoricals with over-long or mixed labels.

    Labels of 10000 characters must raise ``ValueError``. Categories that
    mix strings with an integer must produce exactly one warning.
    """
    # Warning for non-string labels
    # Error for labels too long
    original = pd.DataFrame.from_records(
        [['a' * 10000],
         ['b' * 10000],
         ['c' * 10000],
         ['d' * 10000]],
        columns=['Too_long'])

    original = pd.concat([original[col].astype('category')
                          for col in original], axis=1)
    with tm.ensure_clean() as path:
        tm.assertRaises(ValueError, original.to_stata, path)

    original = pd.DataFrame.from_records(
        [['a'],
         ['b'],
         ['c'],
         ['d'],
         [1]],
        columns=['Too_long'])
    original = pd.concat([original[col].astype('category')
                          for col in original], axis=1)

    # Give the second write its own temporary file so it does not depend
    # on the path from the first context, and so the file it creates is
    # managed by ``ensure_clean`` as well.
    with tm.ensure_clean() as path:
        with warnings.catch_warnings(record=True) as w:
            original.to_stata(path)
            # should get a warning for mixed content
            tm.assert_equal(len(w), 1)
def test_get_store(self):
    """Check ``pd.get_store`` emits a ``FutureWarning``; needs ``tables``."""
    pytest.importorskip('tables')

    with tm.ensure_clean() as path:
        expect_warning = tm.assert_produces_warning(
            FutureWarning, check_stacklevel=False)
        with expect_warning:
            store = pd.get_store(path)
            store.close()
def roundtrip(df, header=True, parser_hdr=0):
    """Write ``df`` to a temporary Excel file and parse its first sheet.

    Parameters
    ----------
    df : object with a ``to_excel`` method
    header : passed through to ``to_excel``
    parser_hdr : passed to ``ExcelFile.parse`` as ``header``

    Returns
    -------
    The parsed first sheet.
    """
    with ensure_clean(self.ext) as path:
        df.to_excel(path, header=header, merge_cells=self.merge_cells)
        workbook = pd.ExcelFile(path)
        first_sheet = workbook.sheet_names[0]
        return workbook.parse(first_sheet, header=parser_hdr)
def test_lines_with_compression(compression):
    """Round-trip a frame through line-delimited JSON with ``compression``."""
    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
        df.to_json(path, orient='records', lines=True,
                   compression=compression)

        # Read back with the same compression and lines settings.
        read_kwargs = dict(lines=True, compression=compression)
        roundtripped_df = pd.read_json(path, **read_kwargs)
        assert_frame_equal(df, roundtripped_df)
def _check_plot_works(f, freq=None, series=None, *args, **kwargs):
    """Call plotting function ``f`` on two subplots and save the figure.

    ``f`` is first called with ``args``/``kwargs`` (minus any ``ax``) and
    must return something other than ``None``. When the axes passed in had
    no ``freq`` attribute, the current axes' ``freq`` is compared against
    ``series.index.freq`` and/or ``freq``. ``f`` is then called again with
    an explicit ``ax``. Any exception from that second call is ignored.
    Finally the figure is saved to a temporary path.
    """
    import matplotlib.pyplot as plt

    figure = plt.gcf()
    plt.clf()
    figure.add_subplot(211)
    caller_ax = kwargs.pop('ax', plt.gca())
    caller_ax_freq = getattr(caller_ax, 'freq', None)

    ret = f(*args, **kwargs)
    assert(ret is not None)  # do something more intelligent

    # 'ax' was already popped above, so this is the current axes.
    ax = kwargs.pop('ax', plt.gca())
    if series is not None:
        expected_freq = series.index.freq
        if isinstance(expected_freq, DateOffset):
            expected_freq = expected_freq.rule_code
        if caller_ax_freq is None:
            assert(ax.freq == expected_freq)

    if freq is not None and caller_ax_freq is None:
        assert(ax.freq == freq)

    ax = figure.add_subplot(212)
    try:
        kwargs['ax'] = ax
        ret = f(*args, **kwargs)
        assert(ret is not None)  # do something more intelligent
    except Exception:
        # Best effort only: failures of the explicit-ax call are ignored.
        pass

    with ensure_clean() as path:
        plt.savefig(path)
def test_read_empty_dta(self):
    """Round-trip a frame with one column and no rows through Stata."""
    empty_ds = DataFrame(columns=['unit'])

    # GH 7369, make sure can read a 0-obs dta file
    with tm.ensure_clean() as path:
        empty_ds.to_stata(path, write_index=False)
        reread = read_stata(path)
        tm.assert_frame_equal(empty_ds, reread)
def test_excel_deprecated_options(self):
    """Check ``cols`` warns with ``FutureWarning`` and ``columns`` does not."""
    with ensure_clean(self.ext) as path:
        # (expected warning, keyword arguments for to_excel)
        cases = [
            (FutureWarning, {'cols': ['A', 'B']}),
            (False, {'columns': ['A', 'B']}),
        ]
        for expected_warning, kwargs in cases:
            with tm.assert_produces_warning(expected_warning):
                self.frame.to_excel(path, 'test1', **kwargs)
def _check_excel_multiindex_dates(self, ext):
    """Round-trip a frame with a (date, integer) MultiIndex through Excel.

    ``self.tsframe`` is temporarily given a two-level index built from its
    original index and an integer range. The original index is restored
    when the check finishes, whether it passes or fails.

    Parameters
    ----------
    ext : str
        File extension (without the dot) used for the temporary workbook.
    """
    path = '__tmp_to_excel_multiindex_dates__' + ext + '__.' + ext

    # try multiindex with dates
    tsframe = self.tsframe
    old_index = tsframe.index
    new_index = [old_index, np.arange(len(old_index))]
    tsframe.index = MultiIndex.from_arrays(new_index)

    try:
        with ensure_clean(path) as path:
            tsframe.to_excel(path, 'test1', index_label=['time', 'foo'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=[0, 1])

            tm.assert_frame_equal(tsframe, recons, check_names=False)
            # ``assertEquals`` is a deprecated alias of ``assertEqual``.
            self.assertEqual(recons.index.names, ['time', 'foo'])

            # infer index
            tsframe.to_excel(path, 'test1')
            reader = ExcelFile(path)
            recons = reader.parse('test1')
            tm.assert_frame_equal(tsframe, recons)
    finally:
        # needed if setUP becomes classmethod; done in ``finally`` so a
        # failing assertion cannot leave the shared frame modified
        self.tsframe.index = old_index
def test_to_csv_mixed(self):
    """Round-trip a mixed-dtype frame through CSV.

    The frame has float, int, bool, object and datetime columns. Explicit
    dtypes are passed to ``read_csv`` for the non-date columns, and the
    date columns are parsed via ``parse_dates``.
    """

    def create_cols(name):
        return ["%s%03d" % (name, i) for i in range(5)]

    df_float = DataFrame(np.random.randn(100, 5), dtype='float64',
                         columns=create_cols('float'))
    df_int = DataFrame(np.random.randn(100, 5), dtype='int64',
                       columns=create_cols('int'))
    df_bool = DataFrame(True, index=df_float.index,
                        columns=create_cols('bool'))
    df_object = DataFrame('foo', index=df_float.index,
                          columns=create_cols('object'))
    df_dt = DataFrame(Timestamp('20010101'), index=df_float.index,
                      columns=create_cols('date'))

    # add in some nans
    df_float.loc[30:50, 1:3] = np.nan

    # ## this is a bug in read_csv right now ####
    # df_dt.loc[30:50,1:3] = np.nan

    df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

    # dtype
    # ``np.bool`` and ``np.object`` were plain aliases of the builtins and
    # are deprecated/removed in newer NumPy, so use the builtins directly.
    dtypes = dict()
    for n, dtype in [('float', np.float64), ('int', np.int64),
                     ('bool', bool), ('object', object)]:
        for c in create_cols(n):
            dtypes[c] = dtype

    with ensure_clean() as filename:
        df.to_csv(filename)
        rs = read_csv(filename, index_col=0, dtype=dtypes,
                      parse_dates=create_cols('date'))
        assert_frame_equal(rs, df)
def test_to_csv_from_csv2(self):
    """Round-trip frames with duplicate indexes and aliased headers via CSV.

    Also checks that passing fewer header aliases than columns raises
    ``ValueError``.
    """
    column_names = ["x", "y", "z"]

    with ensure_clean("__tmp_to_csv_from_csv2__") as path:

        # duplicate index
        df = DataFrame(np.random.randn(3, 3), index=["a", "a", "b"],
                       columns=column_names)
        df.to_csv(path)
        assert_frame_equal(self.read_csv(path), df)

        # duplicate entries in a three-level MultiIndex
        midx = MultiIndex.from_tuples(
            [("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
        df = DataFrame(np.random.randn(3, 3), index=midx,
                       columns=column_names)
        df.to_csv(path)
        reread = self.read_csv(path, index_col=[0, 1, 2],
                               parse_dates=False)
        assert_frame_equal(reread, df, check_names=False)

        # column aliases
        col_aliases = Index(["AA", "X", "Y", "Z"])
        self.frame2.to_csv(path, header=col_aliases)

        aliased_read = self.read_csv(path)
        aliased_expected = self.frame2.copy()
        aliased_expected.columns = col_aliases
        assert_frame_equal(aliased_expected, aliased_read)

        msg = "Writing 4 cols but got 2 aliases"
        with pytest.raises(ValueError, match=msg):
            self.frame2.to_csv(path, header=["AA", "X"])
def test_read_excel_parse_dates(self, ext):
    """Check ``read_excel`` date parsing of a string-formatted date column.

    The dates are written as ``MM/DD/YYYY`` strings. They must come back as
    strings by default, and as datetimes with ``parse_dates``, both with
    and without an explicit ``date_parser``.
    """
    # see gh-11544, gh-12051
    # ``pd.datetime`` is a deprecated alias of the standard library class,
    # so import the class directly.
    from datetime import datetime

    df = DataFrame({
        "col": [1, 2, 3],
        "date_strings": pd.date_range("2012-01-01", periods=3)
    })
    df2 = df.copy()
    df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y")

    def date_parser(x):
        return datetime.strptime(x, "%m/%d/%Y")

    with tm.ensure_clean(ext) as pth:
        df2.to_excel(pth)

        res = pd.read_excel(pth, index_col=0)
        tm.assert_frame_equal(df2, res)

        res = pd.read_excel(pth, parse_dates=["date_strings"],
                            index_col=0)
        tm.assert_frame_equal(df, res)

        res = pd.read_excel(pth, parse_dates=["date_strings"],
                            date_parser=date_parser, index_col=0)
        tm.assert_frame_equal(df, res)
def test_read_write_dta11(self):
    """Check ``to_stata`` renames awkward column names and warns once.

    The frame read back must carry the column names listed in ``formatted``
    and ``int32`` values.
    """
    row = [(1, 2, 3, 4)]
    original = DataFrame(row,
                         columns=['good', compat.u('b\u00E4d'), '8number',
                                  'astringwithmorethan32characters______'])
    formatted = DataFrame(row,
                          columns=['good', 'b_d', '_8number',
                                   'astringwithmorethan32characters_'])
    formatted.index.name = 'index'
    formatted = formatted.astype(np.int32)

    with tm.ensure_clean() as path:
        with warnings.catch_warnings(record=True) as caught:
            original.to_stata(path, None)
            # should get a warning for that format.
        tm.assert_equal(len(caught), 1)

        reread = self.read_dta(path)
        tm.assert_frame_equal(reread.set_index('index'), formatted)
def _check_extension_sheets(self, ext):
    """Write two frames to separate sheets and read each one back.

    Parameters
    ----------
    ext : str
        File extension (without the dot) used for the temporary workbook.
    """
    path = '__tmp_to_excel_from_excel_sheets__.' + ext

    with ensure_clean(path) as path:
        self.frame['A'][:5] = nan

        for extra in ({}, {'cols': ['A', 'B']},
                      {'header': False}, {'index': False}):
            self.frame.to_excel(path, 'test1', **extra)

        # Test writing to separate sheets
        sheets = [('test1', self.frame), ('test2', self.tsframe)]
        writer = ExcelWriter(path)
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name)
        writer.save()

        reader = ExcelFile(path)
        for sheet_name, frame in sheets:
            recons = reader.parse(sheet_name, index_col=0)
            tm.assert_frame_equal(frame, recons)

        # The workbook holds exactly the two sheets, in writing order.
        np.testing.assert_equal(2, len(reader.sheet_names))
        for position, (sheet_name, _) in enumerate(sheets):
            np.testing.assert_equal(sheet_name,
                                    reader.sheet_names[position])
def test_categorical_writing(self):
    """Round-trip an all-categorical frame through Stata.

    Two columns are converted to strings in the expected frame before it
    is made categorical. Warnings raised while writing and reading are
    recorded and ignored.
    """
    column_names = ['fully_labeled', 'fully_labeled2',
                    'incompletely_labeled', 'labeled_with_missings',
                    'float_labelled', 'unlabeled']
    records = [
        ["one", "ten", "one", "one", "one", 1],
        ["two", "nine", "two", "two", "two", 2],
        ["three", "eight", "three", "three", "three", 3],
        ["four", "seven", 4, "four", "four", 4],
        ["five", "six", 5, np.nan, "five", 5],
        ["six", "five", 6, np.nan, "six", 6],
        ["seven", "four", 7, np.nan, "seven", 7],
        ["eight", "three", 8, np.nan, "eight", 8],
        ["nine", "two", 9, np.nan, "nine", 9],
        ["ten", "one", "ten", np.nan, "ten", 10],
    ]
    original = DataFrame.from_records(records, columns=column_names)
    expected = original.copy()

    # these are all categoricals
    original = pd.concat(
        [original[col].astype('category') for col in original], axis=1)

    for stringified in ('incompletely_labeled', 'unlabeled'):
        expected[stringified] = expected[stringified].apply(str)
    expected = pd.concat(
        [expected[col].astype('category') for col in expected], axis=1)
    expected.index.name = 'index'

    with tm.ensure_clean() as path:
        with warnings.catch_warnings(record=True) as w:
            # Silence warnings
            original.to_stata(path)
            reread = self.read_dta(path)
            tm.assert_frame_equal(reread.set_index('index'), expected)
def test_sheets(self):
    """Write two frames to separate sheets and read each one back."""
    _skip_if_no_xlrd()

    with ensure_clean(self.ext) as path:
        self.frame['A'][:5] = nan

        # Exercise the basic writer options before the multi-sheet write.
        write_variants = [
            dict(),
            dict(cols=['A', 'B']),
            dict(header=False),
            dict(index=False),
        ]
        for variant in write_variants:
            self.frame.to_excel(path, 'test1', **variant)

        # Test writing to separate sheets
        writer = ExcelWriter(path)
        self.frame.to_excel(writer, 'test1')
        self.tsframe.to_excel(writer, 'test2')
        writer.save()

        reader = ExcelFile(path)
        first = reader.parse('test1', index_col=0)
        tm.assert_frame_equal(self.frame, first)
        second = reader.parse('test2', index_col=0)
        tm.assert_frame_equal(self.tsframe, second)

        names = reader.sheet_names
        np.testing.assert_equal(2, len(names))
        np.testing.assert_equal('test1', names[0])
        np.testing.assert_equal('test2', names[1])
def test_to_csv_from_csv2(self):
    """Round-trip frames with duplicate indexes and aliased headers via CSV.

    Also checks that passing fewer header aliases than columns raises
    ``ValueError``.
    """
    with ensure_clean('__tmp_to_csv_from_csv2__') as path:

        def write_and_check(frame, check_kwargs=None, **read_kwargs):
            # Write ``frame`` to the shared path and compare what is read.
            frame.to_csv(path)
            reread = self.read_csv(path, **read_kwargs)
            assert_frame_equal(reread, frame, **(check_kwargs or {}))

        # duplicate index
        write_and_check(
            DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
                      columns=['x', 'y', 'z']))

        # duplicate entries in a three-level MultiIndex
        midx = MultiIndex.from_tuples(
            [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
        write_and_check(
            DataFrame(np.random.randn(3, 3), index=midx,
                      columns=['x', 'y', 'z']),
            check_kwargs={'check_names': False},
            index_col=[0, 1, 2], parse_dates=False)

        # column aliases
        col_aliases = Index(['AA', 'X', 'Y', 'Z'])
        self.frame2.to_csv(path, header=col_aliases)

        rs = self.read_csv(path)
        xp = self.frame2.copy()
        xp.columns = col_aliases
        assert_frame_equal(xp, rs)

        msg = "Writing 4 cols but got 2 aliases"
        with pytest.raises(ValueError, match=msg):
            self.frame2.to_csv(path, header=['AA', 'X'])
def test_unsupported_dtype(self):
    """Check ``read_csv`` raises ``TypeError`` for unsupported ``dtype`` values."""
    df = DataFrame(np.random.rand(5, 2), columns=list('AB'),
                   index=['1A', '1B', '1C', '1D', '1E'])

    rejected_reads = [
        # valid but we don't support it (date)
        dict(dtype={'A': 'datetime64', 'B': 'float64'}, index_col=0),
        dict(dtype={'A': 'datetime64', 'B': 'float64'}, index_col=0,
             parse_dates=['B']),
        # valid but we don't support it
        dict(dtype={'A': 'timedelta64', 'B': 'float64'}, index_col=0),
        # valid but unsupported - fixed width unicode string
        dict(dtype={'A': 'U8'}, index_col=0),
    ]

    with tm.ensure_clean('__unsupported_dtype__.csv') as path:
        df.to_csv(path)

        for read_kwargs in rejected_reads:
            self.assertRaises(TypeError, self.read_csv, path,
                              **read_kwargs)
def test_to_excel_multiindex_no_write_index(self):
    """Write a MultiIndex frame without its index and re-read it (GH 5616).

    The frame read back must equal the original frame that had no
    MultiIndex.
    """
    _skip_if_no_xlrd()

    # Plain frame, used as the expected result.
    plain = pd.DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]})

    # Same data, but carrying a two-level index.
    indexed = plain.copy()
    indexed.index = pd.MultiIndex.from_tuples([(70, 80), (90, 100)])

    with ensure_clean(self.ext) as path:
        # Drop the index on the way out to Excel.
        indexed.to_excel(path, 'test1', index=False)

        # Parse the sheet back in.
        reread = ExcelFile(path).parse('test1')

        # It must match the frame that never had the MultiIndex.
        tm.assert_frame_equal(plain, reread)
def set_engine_and_path(self, request, engine, ext):
    """Select the Excel writer engine and provide a temporary file path.

    Sets the ``io.excel.<ext>.writer`` option to ``engine``, exposes a
    temporary path for the extension as ``self.path``, yields control to
    the test, and afterwards restores the option's previous value.

    Notes
    -----
    NOTE(review): presumably registered as an autouse fixture so it wraps
    every test method in the class and its subclasses -- the decorator is
    not visible here, confirm.
    """
    writer_option = 'io.excel.{ext}.writer'.format(ext=ext.strip('.'))
    original_engine = get_option(writer_option)
    set_option(writer_option, engine)

    with ensure_clean(ext) as tmp_path:
        self.path = tmp_path
        yield

    # Roll back option change
    set_option(writer_option, original_engine)
def _check_extension(self, ext):
    """Round-trip ``self.frame`` through Excel with several writer options.

    Parameters
    ----------
    ext : str
        File extension (without the dot) used for the temporary workbook.
    """
    path = '__tmp_to_excel_from_excel__.' + ext

    with ensure_clean(path) as path:
        self.frame['A'][:5] = nan

        for extra in ({}, {'cols': ['A', 'B']},
                      {'header': False}, {'index': False}):
            self.frame.to_excel(path, 'test1', **extra)

        # test roundtrip: (to_excel kwargs, read_excel kwargs)
        roundtrips = [
            ({}, {'index_col': 0}),
            ({'index': False}, {'index_col': None}),
            ({'na_rep': 'NA'}, {'index_col': 0, 'na_values': ['NA']}),
            # GH 3611
            ({'na_rep': '88'}, {'index_col': 0, 'na_values': ['88']}),
            ({'na_rep': '88'}, {'index_col': 0, 'na_values': [88, 88.0]}),
        ]
        for write_kwargs, read_kwargs in roundtrips:
            self.frame.to_excel(path, 'test1', **write_kwargs)
            recons = read_excel(path, 'test1', **read_kwargs)
            if write_kwargs.get('index') is False:
                # The index was not written, so put it back to compare.
                recons.index = self.frame.index
            tm.assert_frame_equal(self.frame, recons)
def test_path(self):
    """Smoke test: every fixture frame can be written to and read from a JSON path."""
    with ensure_clean('test.json') as path:
        fixture_frames = (self.frame, self.frame2, self.intframe,
                          self.tsframe, self.mixed_frame)
        for fixture in fixture_frames:
            fixture.to_json(path)
            read_json(path)
def test_write_explicit_bad(self, compression, get_random_path):
    """Pickling with the given `compression` must raise a ValueError."""
    expected_msg = "Unrecognized compression type"
    # The raises-context stays outermost, exactly as in the nested form.
    with tm.assertRaisesRegexp(ValueError, expected_msg), \
            tm.ensure_clean(get_random_path) as path:
        frame = tm.makeDataFrame()
        frame.to_pickle(path, compression=compression)
def roundtrip(s, encoding='latin-1'):
    """
    Write `s` to a temporary JSON file using `encoding`, read it back with
    the same encoding and assert both series are equal (the categorical
    check is disabled).
    """
    with ensure_clean('test.json') as json_path:
        s.to_json(json_path, encoding=encoding)
        restored = read_json(json_path, encoding=encoding)
        assert_series_equal(s, restored, check_categorical=False)
def test_stata_doc_examples(self):
    """Smoke test: a random 10x2 frame can be written with ``to_stata``."""
    with tm.ensure_clean() as path:
        values = np.random.randn(10, 2)
        frame = DataFrame(values, columns=['A', 'B'])
        frame.to_stata(path)
def test_excelwriter_fspath(self):
    """``os.fspath`` on an ExcelWriter returns the path it was opened with."""
    with tm.ensure_clean("foo.xlsx") as path:
        # Use the writer as a context manager so it is closed before
        # ensure_clean tries to remove the temporary file; leaving it open
        # leaks the handle.
        with ExcelWriter(path) as writer:
            assert os.fspath(writer) == str(path)
def test_roundtrip_indexlabels(self):
    """
    Check that index labels written by ``to_excel`` are recovered when the
    sheet is parsed back, and that columns can be re-read as a two-level
    index.
    """
    _skip_if_no_xlrd()
    sheet = 'test1'

    with ensure_clean(self.ext) as path:
        self.frame['A'][:5] = nan

        for write_kwargs in ({},
                             {'cols': ['A', 'B']},
                             {'header': False},
                             {'index': False}):
            self.frame.to_excel(path, sheet, **write_kwargs)

        # test index_label
        # Each case pairs the index_label passed to to_excel with the
        # assertion method used to compare the resulting index names.
        label_cases = (
            (['test'], self.assertEqual),
            (['test', 'dummy', 'dummy2'], self.assertEqual),
            ('test', self.assertAlmostEqual),
        )
        for index_label, check_names in label_cases:
            bool_frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            bool_frame.to_excel(path, sheet,
                                index_label=index_label,
                                merge_cells=self.merge_cells)
            xl_file = ExcelFile(path)
            parsed = xl_file.parse(sheet, index_col=0,
                                   has_index_names=self.merge_cells)
            parsed = parsed.astype(np.int64)
            bool_frame.index.names = ['test']
            check_names(bool_frame.index.names, parsed.index.names)

    with ensure_clean(self.ext) as path:
        self.frame.to_excel(path, sheet,
                            cols=['A', 'B', 'C', 'D'],
                            index=False,
                            merge_cells=self.merge_cells)
        # take 'A' and 'B' as indexes (same row as cols 'C', 'D')
        expected = self.frame.copy().set_index(['A', 'B'])

        xl_file = ExcelFile(path)
        parsed = xl_file.parse(sheet, index_col=[0, 1])
        tm.assert_frame_equal(expected, parsed, check_less_precise=True)
def test_to_csv_multiindex(self):
    """
    Round-trip frames with MultiIndex rows and/or columns through
    ``to_csv`` and ``read_csv``.

    The test temporarily replaces the index of the shared ``self.frame``
    and ``self.tsframe`` fixtures and restores them afterwards.
    """
    frame = self.frame
    old_index = frame.index
    arrays = np.arange(len(old_index) * 2).reshape(2, -1)
    new_index = MultiIndex.from_arrays(arrays, names=["first", "second"])
    frame.index = new_index

    with ensure_clean("__tmp_to_csv_multiindex__") as path:

        # smoke-test writing without a header and with a column subset
        frame.to_csv(path, header=False)
        frame.to_csv(path, columns=["A", "B"])

        # round trip
        frame.to_csv(path)

        df = self.read_csv(path, index_col=[0, 1], parse_dates=False)

        # TODO to_csv drops column name
        assert_frame_equal(frame, df, check_names=False)
        assert frame.index.names == df.index.names

        # needed if setUp becomes a class method
        self.frame.index = old_index

        # try multiindex with dates
        tsframe = self.tsframe
        old_index = tsframe.index
        new_index = [old_index, np.arange(len(old_index))]
        tsframe.index = MultiIndex.from_arrays(new_index)

        tsframe.to_csv(path, index_label=["time", "foo"])
        recons = self.read_csv(path, index_col=[0, 1])

        # TODO to_csv drops column name
        assert_frame_equal(tsframe, recons, check_names=False)

        # do not load index
        # (the two index levels are then read back as ordinary columns)
        tsframe.to_csv(path)
        recons = self.read_csv(path, index_col=None)
        assert len(recons.columns) == len(tsframe.columns) + 2

        # no index
        tsframe.to_csv(path, index=False)
        recons = self.read_csv(path, index_col=None)
        assert_almost_equal(recons.values, self.tsframe.values)

        # needed if setUp becomes class method
        self.tsframe.index = old_index

    with ensure_clean("__tmp_to_csv_multiindex__") as path:
        # GH3571, GH1651, GH3141

        def _make_frame(names=None):
            # Build a random 3x3 int64 frame with two-level columns;
            # ``names=True`` selects the default level names.
            if names is True:
                names = ["first", "second"]
            return DataFrame(
                np.random.randint(0, 10, size=(3, 3)),
                columns=MultiIndex.from_tuples(
                    [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names
                ),
                dtype="int64",
            )

        # column & index are multi-index
        df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
        df.to_csv(path)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
        assert_frame_equal(df, result)

        # column is mi
        df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
        df.to_csv(path)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=0)
        assert_frame_equal(df, result)

        # dup column names?
        df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
        df.to_csv(path)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2])
        assert_frame_equal(df, result)

        # writing with no index
        df = _make_frame()
        df.to_csv(path, index=False)
        result = read_csv(path, header=[0, 1])
        assert_frame_equal(df, result)

        # we lose the names here
        df = _make_frame(True)
        df.to_csv(path, index=False)
        result = read_csv(path, header=[0, 1])
        assert com._all_none(*result.columns.names)
        result.columns.names = df.columns.names
        assert_frame_equal(df, result)

        # whatsnew example
        df = _make_frame()
        df.to_csv(path)
        result = read_csv(path, header=[0, 1], index_col=[0])
        assert_frame_equal(df, result)

        df = _make_frame(True)
        df.to_csv(path)
        result = read_csv(path, header=[0, 1], index_col=[0])
        assert_frame_equal(df, result)

        # invalid options
        df = _make_frame(True)
        df.to_csv(path)

        # asking for more header rows than the file has lines must fail
        for i in [6, 7]:
            msg = "len of {i}, but only 5 lines in file".format(i=i)
            with pytest.raises(ParserError, match=msg):
                read_csv(path, header=list(range(i)), index_col=0)

        # write with cols
        msg = "cannot specify cols with a MultiIndex"
        with pytest.raises(TypeError, match=msg):
            df.to_csv(path, columns=["foo", "bar"])

    with ensure_clean("__tmp_to_csv_multiindex__") as path:
        # empty
        tsframe[:0].to_csv(path)
        recons = self.read_csv(path)

        exp = tsframe[:0]
        exp.index = []

        tm.assert_index_equal(recons.columns, exp.columns)
        assert len(recons) == 0
def check_error_on_write(self, df, engine, exc):
    """Assert that writing `df` to parquet with `engine` raises `exc`."""
    with tm.ensure_clean() as path, pytest.raises(exc):
        to_parquet(df, path, engine, compression=None)
def _pickle_roundtrip(self, obj):
    """Pickle `obj` to a temporary file and return the object read back."""
    with ensure_clean() as tmp_path:
        obj.to_pickle(tmp_path)
        return pd.read_pickle(tmp_path)
def _do_test(
    df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False
):
    """
    Write `df` to CSV, read it back and compare with the original.

    `rnlvl` / `cnlvl` give the number of row / column index levels used to
    build the ``read_csv`` arguments; `r_dtype` / `c_dtype` are short type
    codes controlling how the row / column labels of both frames are
    coerced before the comparison. With `dupe_col` the columns of the read
    frame are overwritten by those of `df`.
    """
    read_kwargs = dict(parse_dates=False)
    if cnlvl:
        if rnlvl is not None:
            read_kwargs["index_col"] = list(range(rnlvl))
        read_kwargs["header"] = list(range(cnlvl))
    else:
        read_kwargs["header"] = 0

    # Both header configurations share the same write/read step.
    with ensure_clean("__tmp_to_csv_moar__") as path:
        df.to_csv(path, encoding="utf8", chunksize=chunksize)
        recons = self.read_csv(path, **read_kwargs)

    def _to_uni(x):
        return x if isinstance(x, str) else x.decode("utf8")

    def _object_array(labels, convert):
        # Convert every label and pack the results into an object array.
        return np.array([convert(label) for label in labels], dtype="O")

    type_map = {"i": "i", "f": "f", "s": "O", "u": "O", "dt": "O", "p": "O"}

    def _coerce_axis(read_frame, written_frame, axis, code):
        # Bring the labels on `axis` ("index" or "columns") of both frames
        # to a common representation; the read frame is always updated
        # before the written one.
        if code == "u":
            setattr(read_frame, axis,
                    _object_array(getattr(read_frame, axis), _to_uni))
            setattr(written_frame, axis,
                    _object_array(getattr(written_frame, axis), _to_uni))
        elif code == "dt":
            setattr(read_frame, axis,
                    _object_array(getattr(read_frame, axis), Timestamp))
            setattr(written_frame, axis,
                    _object_array(getattr(written_frame, axis), Timestamp))
        elif code == "p":
            parsed_labels = to_datetime(getattr(read_frame, axis))
            setattr(read_frame, axis,
                    _object_array(parsed_labels, Timestamp))
            stamped_labels = getattr(written_frame, axis).to_timestamp()
            setattr(written_frame, axis,
                    _object_array(stamped_labels, Timestamp))
        else:
            target = type_map.get(code)
            setattr(read_frame, axis,
                    np.array(getattr(read_frame, axis), dtype=target))
            setattr(written_frame, axis,
                    np.array(getattr(written_frame, axis), dtype=target))

    if dupe_col:
        # read_Csv disambiguates the columns by
        # labeling them dupe.1,dupe.2, etc'. monkey patch columns
        recons.columns = df.columns

    if rnlvl and not cnlvl:
        # Rebuild the row MultiIndex from the index plus the leading
        # columns, then drop those columns from the data.
        extra_levels = [recons.iloc[:, pos].values for pos in range(rnlvl - 1)]
        recons.index = MultiIndex.from_arrays(
            [list(recons.index)] + extra_levels
        )
        recons = recons.iloc[:, rnlvl - 1 :]

    if r_dtype:
        _coerce_axis(recons, df, "index", r_dtype)
    if c_dtype:
        _coerce_axis(recons, df, "columns", c_dtype)

    assert_frame_equal(df, recons, check_names=False, check_less_precise=True)
def test_read(self, protocol, get_random_path):
    """A frame pickled with the given `protocol` is read back unchanged."""
    with tm.ensure_clean(get_random_path) as path:
        expected = tm.makeDataFrame()
        expected.to_pickle(path, protocol=protocol)
        result = pd.read_pickle(path)
        tm.assert_frame_equal(expected, result)
def path(ext):
    """
    Fixture yielding a temporary file path created by
    ``tm.ensure_clean(ext)`` for use in each test case.
    """
    with tm.ensure_clean(ext) as tmp_file:
        yield tmp_file
def _do_test(df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None,
             dupe_col=False):
    """
    Write `df` to CSV, read it back and compare with the original.

    `rnlvl` / `cnlvl` give the number of row / column index levels used to
    build the ``read_csv`` arguments; `r_dtype` / `c_dtype` are short type
    codes controlling how the row / column labels of both frames are
    coerced before the comparison. With `dupe_col` the columns of the read
    frame are overwritten by those of `df`.
    """
    kwargs = dict(parse_dates=False)
    if cnlvl:
        if rnlvl is not None:
            kwargs['index_col'] = lrange(rnlvl)
        kwargs['header'] = lrange(cnlvl)
        with ensure_clean('__tmp_to_csv_moar__') as path:
            df.to_csv(path, encoding='utf8', chunksize=chunksize)
            recons = self.read_csv(path, **kwargs)
    else:
        kwargs['header'] = 0
        with ensure_clean('__tmp_to_csv_moar__') as path:
            df.to_csv(path, encoding='utf8', chunksize=chunksize)
            recons = self.read_csv(path, **kwargs)

    def _to_uni(x):
        # Decode anything that is not already text as UTF-8.
        if not isinstance(x, compat.text_type):
            return x.decode('utf8')
        return x
    if dupe_col:
        # read_Csv disambiguates the columns by
        # labeling them dupe.1,dupe.2, etc'. monkey patch columns
        recons.columns = df.columns
    if rnlvl and not cnlvl:
        # Rebuild the row MultiIndex from the index plus the leading
        # columns, then drop those columns from the data.
        delta_lvl = [recons.iloc[
            :, i].values for i in range(rnlvl - 1)]
        ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
        recons.index = ix
        recons = recons.iloc[:, rnlvl - 1:]

    # Short type code -> numpy dtype used in the fallback branches below.
    type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')
    if r_dtype:
        if r_dtype == 'u':  # unicode
            r_dtype = 'O'
            recons.index = np.array(lmap(_to_uni, recons.index),
                                    dtype=r_dtype)
            df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype)
        elif r_dtype == 'dt':  # datetime labels
            r_dtype = 'O'
            recons.index = np.array(lmap(Timestamp, recons.index),
                                    dtype=r_dtype)
            df.index = np.array(
                lmap(Timestamp, df.index), dtype=r_dtype)
        elif r_dtype == 'p':
            # 'p' labels: parse the read labels as datetimes and convert
            # the written ones with to_timestamp() before comparing.
            r_dtype = 'O'
            recons.index = np.array(
                list(map(Timestamp, to_datetime(recons.index))),
                dtype=r_dtype)
            df.index = np.array(
                list(map(Timestamp, df.index.to_timestamp())),
                dtype=r_dtype)
        else:
            r_dtype = type_map.get(r_dtype)
            recons.index = np.array(recons.index, dtype=r_dtype)
            df.index = np.array(df.index, dtype=r_dtype)
    if c_dtype:
        # Same coercions as above, applied to the column labels.
        if c_dtype == 'u':
            c_dtype = 'O'
            recons.columns = np.array(lmap(_to_uni, recons.columns),
                                      dtype=c_dtype)
            df.columns = np.array(
                lmap(_to_uni, df.columns), dtype=c_dtype)
        elif c_dtype == 'dt':
            c_dtype = 'O'
            recons.columns = np.array(lmap(Timestamp, recons.columns),
                                      dtype=c_dtype)
            df.columns = np.array(
                lmap(Timestamp, df.columns), dtype=c_dtype)
        elif c_dtype == 'p':
            c_dtype = 'O'
            recons.columns = np.array(
                lmap(Timestamp, to_datetime(recons.columns)),
                dtype=c_dtype)
            df.columns = np.array(
                lmap(Timestamp, df.columns.to_timestamp()),
                dtype=c_dtype)
        else:
            c_dtype = type_map.get(c_dtype)
            recons.columns = np.array(recons.columns, dtype=c_dtype)
            df.columns = np.array(df.columns, dtype=c_dtype)

    assert_frame_equal(df, recons, check_names=False,
                       check_less_precise=True)
def test_styler_to_excel(engine):
    """
    Write a frame and styled variants of it to one workbook with `engine`
    and, for the openpyxl and xlsxwriter engines, compare the resulting
    cells sheet by sheet. Other engines are only smoke tested.
    """
    def style(df):
        # Per-cell CSS declarations, one row per frame row.
        # XXX: RGB colors not supported in xlwt
        return DataFrame([['font-weight: bold', '', ''],
                          ['', 'color: blue', ''],
                          ['', '', 'text-decoration: underline'],
                          ['border-style: solid', '', ''],
                          ['', 'font-style: italic', ''],
                          ['', '', 'text-align: right'],
                          ['background-color: red', '', ''],
                          ['number-format: 0%', '', ''],
                          ['', '', ''],
                          ['', '', ''],
                          ['', '', '']],
                         index=df.index, columns=df.columns)

    def assert_equal_style(cell1, cell2, engine):
        # For the two engines below the comparison is marked as an
        # expected failure before any attribute is checked.
        if engine in ['xlsxwriter', 'openpyxl']:
            pytest.xfail(reason=("GH25351: failing on some attribute "
                                 "comparisons in {}".format(engine)))
        # XXX: should find a better way to check equality
        assert cell1.alignment.__dict__ == cell2.alignment.__dict__
        assert cell1.border.__dict__ == cell2.border.__dict__
        assert cell1.fill.__dict__ == cell2.fill.__dict__
        assert cell1.font.__dict__ == cell2.font.__dict__
        assert cell1.number_format == cell2.number_format
        assert cell1.protection.__dict__ == cell2.protection.__dict__

    def custom_converter(css):
        # use bold iff there is custom style attached to the cell
        if css.strip(' \n;'):
            return {'font': {'bold': True}}
        return {}

    pytest.importorskip('jinja2')
    pytest.importorskip(engine)

    # Prepare spreadsheets

    df = DataFrame(np.random.randn(11, 3))
    with ensure_clean('.xlsx' if engine != 'xlwt' else '.xls') as path:
        writer = ExcelWriter(path, engine=engine)
        df.to_excel(writer, sheet_name='frame')
        df.style.to_excel(writer, sheet_name='unstyled')
        styled = df.style.apply(style, axis=None)
        styled.to_excel(writer, sheet_name='styled')
        ExcelFormatter(styled, style_converter=custom_converter).write(
            writer, sheet_name='custom')
        writer.save()

        if engine not in ('openpyxl', 'xlsxwriter'):
            # For other engines, we only smoke test
            return
        openpyxl = pytest.importorskip('openpyxl')
        wb = openpyxl.load_workbook(path)

        # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled
        n_cells = 0
        for col1, col2 in zip(wb['frame'].columns,
                              wb['unstyled'].columns):
            assert len(col1) == len(col2)
            for cell1, cell2 in zip(col1, col2):
                assert cell1.value == cell2.value
                assert_equal_style(cell1, cell2, engine)
                n_cells += 1

        # ensure iteration actually happened:
        # (11 data rows + header row) x (3 data columns + index column)
        assert n_cells == (11 + 1) * (3 + 1)

        # (2) check styling with default converter

        # XXX: openpyxl (as at 2.4) prefixes colors with 00, xlsxwriter with FF
        alpha = '00' if engine == 'openpyxl' else 'FF'

        n_cells = 0
        for col1, col2 in zip(wb['frame'].columns,
                              wb['styled'].columns):
            assert len(col1) == len(col2)
            for cell1, cell2 in zip(col1, col2):
                ref = '%s%d' % (cell2.column, cell2.row)
                # XXX: this isn't as strong a test as ideal; we should
                #      confirm that differences are exclusive
                if ref == 'B2':
                    assert not cell1.font.bold
                    assert cell2.font.bold
                elif ref == 'C3':
                    assert cell1.font.color.rgb != cell2.font.color.rgb
                    assert cell2.font.color.rgb == alpha + '0000FF'
                elif ref == 'D4':
                    # This fails with engine=xlsxwriter due to
                    # https://bitbucket.org/openpyxl/openpyxl/issues/800
                    if engine == 'xlsxwriter' \
                       and (LooseVersion(openpyxl.__version__) <
                            LooseVersion('2.4.6')):
                        pass
                    else:
                        assert cell1.font.underline != cell2.font.underline
                        assert cell2.font.underline == 'single'
                elif ref == 'B5':
                    assert not cell1.border.left.style
                    assert (cell2.border.top.style ==
                            cell2.border.right.style ==
                            cell2.border.bottom.style ==
                            cell2.border.left.style ==
                            'medium')
                elif ref == 'C6':
                    assert not cell1.font.italic
                    assert cell2.font.italic
                elif ref == 'D7':
                    assert (cell1.alignment.horizontal !=
                            cell2.alignment.horizontal)
                    assert cell2.alignment.horizontal == 'right'
                elif ref == 'B8':
                    assert cell1.fill.fgColor.rgb != cell2.fill.fgColor.rgb
                    assert cell1.fill.patternType != cell2.fill.patternType
                    assert cell2.fill.fgColor.rgb == alpha + 'FF0000'
                    assert cell2.fill.patternType == 'solid'
                elif ref == 'B9':
                    assert cell1.number_format == 'General'
                    assert cell2.number_format == '0%'
                else:
                    assert_equal_style(cell1, cell2, engine)

                assert cell1.value == cell2.value
                n_cells += 1

        assert n_cells == (11 + 1) * (3 + 1)

        # (3) check styling with custom converter
        n_cells = 0
        for col1, col2 in zip(wb['frame'].columns,
                              wb['custom'].columns):
            assert len(col1) == len(col2)
            for cell1, cell2 in zip(col1, col2):
                ref = '%s%d' % (cell2.column, cell2.row)
                # the cells given a non-empty style in (2) must be bold here
                if ref in ('B2', 'C3', 'D4', 'B5', 'C6', 'D7', 'B8', 'B9'):
                    assert not cell1.font.bold
                    assert cell2.font.bold
                else:
                    assert_equal_style(cell1, cell2, engine)

                assert cell1.value == cell2.value
                n_cells += 1

        assert n_cells == (11 + 1) * (3 + 1)
def test_read_excel_multiindex_empty_level(self, ext):
    """
    An empty second-level column label is read back from Excel as an
    ``Unnamed: <n>_level_1`` placeholder (see gh-12453).
    """
    with tm.ensure_clean(ext) as path:

        def _write_read_compare(written, expected):
            # Round-trip `written` through the file and compare to `expected`.
            written.to_excel(path)
            actual = pd.read_excel(path, header=[0, 1], index_col=0)
            tm.assert_frame_equal(actual, expected)

        # Empty label on the last column.
        df = DataFrame({
            ("One", "x"): {0: 1},
            ("Two", "X"): {0: 3},
            ("Two", "Y"): {0: 7},
            ("Zero", ""): {0: 0},
        })
        expected = DataFrame({
            ("One", "x"): {0: 1},
            ("Two", "X"): {0: 3},
            ("Two", "Y"): {0: 7},
            ("Zero", "Unnamed: 4_level_1"): {0: 0},
        })
        _write_read_compare(df, expected)

        # Empty label on the first column.
        df = pd.DataFrame({
            ("Beg", ""): {0: 0},
            ("Middle", "x"): {0: 1},
            ("Tail", "X"): {0: 3},
            ("Tail", "Y"): {0: 7},
        })
        expected = pd.DataFrame({
            ("Beg", "Unnamed: 1_level_1"): {0: 0},
            ("Middle", "x"): {0: 1},
            ("Tail", "X"): {0: 3},
            ("Tail", "Y"): {0: 7},
        })
        _write_read_compare(df, expected)
def test_to_csv_multiindex(self):
    """
    Round-trip frames with MultiIndex rows and/or columns through
    ``to_csv`` and ``DataFrame.from_csv`` / ``read_csv``, with and without
    ``tupleize_cols``.

    The test temporarily replaces the index of the shared ``self.frame``
    and ``self.tsframe`` fixtures and restores them afterwards.
    """
    frame = self.frame
    old_index = frame.index
    arrays = np.arange(len(old_index) * 2).reshape(2, -1)
    new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
    frame.index = new_index

    with ensure_clean('__tmp_to_csv_multiindex__') as path:

        # smoke-test writing without a header and with a column subset
        frame.to_csv(path, header=False)
        frame.to_csv(path, columns=['A', 'B'])

        # round trip
        frame.to_csv(path)
        df = DataFrame.from_csv(path, index_col=[0, 1], parse_dates=False)

        # TODO to_csv drops column name
        assert_frame_equal(frame, df, check_names=False)
        self.assertEqual(frame.index.names, df.index.names)

        # needed if setUP becomes a classmethod
        self.frame.index = old_index

        # try multiindex with dates
        tsframe = self.tsframe
        old_index = tsframe.index
        new_index = [old_index, np.arange(len(old_index))]
        tsframe.index = MultiIndex.from_arrays(new_index)

        tsframe.to_csv(path, index_label=['time', 'foo'])
        recons = DataFrame.from_csv(path, index_col=[0, 1])
        # TODO to_csv drops column name
        assert_frame_equal(tsframe, recons, check_names=False)

        # do not load index
        # (the two index levels are then read back as ordinary columns)
        tsframe.to_csv(path)
        recons = DataFrame.from_csv(path, index_col=None)
        self.assertEqual(len(recons.columns), len(tsframe.columns) + 2)

        # no index
        tsframe.to_csv(path, index=False)
        recons = DataFrame.from_csv(path, index_col=None)
        assert_almost_equal(recons.values, self.tsframe.values)

        # needed if setUP becomes classmethod
        self.tsframe.index = old_index

    with ensure_clean('__tmp_to_csv_multiindex__') as path:
        # GH3571, GH1651, GH3141

        def _make_frame(names=None):
            # Build a random 3x3 int64 frame with two-level columns;
            # ``names=True`` selects the default level names.
            if names is True:
                names = ['first', 'second']
            return DataFrame(np.random.randint(0, 10, size=(3, 3)),
                             columns=MultiIndex.from_tuples(
                                 [('bah', 'foo'),
                                  ('bah', 'bar'),
                                  ('ban', 'baz')], names=names),
                             dtype='int64')

        # column & index are multi-index
        df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # column is mi
        df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=0,
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # dup column names?
        df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # writing with no index
        df = _make_frame()
        df.to_csv(path, tupleize_cols=False, index=False)
        result = read_csv(path, header=[0, 1], tupleize_cols=False)
        assert_frame_equal(df, result)

        # we lose the names here
        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=False, index=False)
        result = read_csv(path, header=[0, 1], tupleize_cols=False)
        self.assertTrue(all([x is None for x in result.columns.names]))
        result.columns.names = df.columns.names
        assert_frame_equal(df, result)

        # tupleize_cols=True and index=False
        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=True, index=False)
        result = read_csv(path, header=0, tupleize_cols=True,
                          index_col=None)
        result.columns = df.columns
        assert_frame_equal(df, result)

        # whatsnew example
        df = _make_frame()
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1], index_col=[0],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1], index_col=[0],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # column & index are multi-index (compatibility)
        df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=True)
        result = read_csv(path, header=0, index_col=[0, 1],
                          tupleize_cols=True)
        result.columns = df.columns
        assert_frame_equal(df, result)

        # invalid options
        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=False)

        # catch invalid headers
        # The patterns are raw strings: ``\[`` is a regex escape, not a
        # valid Python string escape, and would otherwise trigger an
        # invalid-escape-sequence warning.
        with assertRaisesRegexp(
                CParserError,
                r'Passed header=\[0,1,2\] are too many '
                r'rows for this multi_index of columns'):
            read_csv(path, tupleize_cols=False,
                     header=lrange(3), index_col=0)

        with assertRaisesRegexp(
                CParserError,
                r'Passed header=\[0,1,2,3,4,5,6\], len of '
                r'7, but only 6 lines in file'):
            read_csv(path, tupleize_cols=False,
                     header=lrange(7), index_col=0)

        for i in [4, 5, 6]:
            with tm.assertRaises(CParserError):
                read_csv(path, tupleize_cols=False,
                         header=lrange(i), index_col=0)

        # write with cols
        with assertRaisesRegexp(TypeError, 'cannot specify cols with a '
                                'MultiIndex'):
            df.to_csv(path, tupleize_cols=False, columns=['foo', 'bar'])

    with ensure_clean('__tmp_to_csv_multiindex__') as path:
        # empty
        tsframe[:0].to_csv(path)
        recons = DataFrame.from_csv(path)
        exp = tsframe[:0]
        exp.index = []

        self.assert_index_equal(recons.columns, exp.columns)
        self.assertEqual(len(recons), 0)
def test_to_csv_multiindex(self):
    """
    Round-trip frames with MultiIndex rows and/or columns through
    ``to_csv`` and ``read_csv``; the ``tupleize_cols=True`` paths are
    additionally expected to emit a FutureWarning.

    The test temporarily replaces the index of the shared ``self.frame``
    and ``self.tsframe`` fixtures and restores them afterwards.
    """
    frame = self.frame
    old_index = frame.index
    arrays = np.arange(len(old_index) * 2).reshape(2, -1)
    new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
    frame.index = new_index

    with ensure_clean('__tmp_to_csv_multiindex__') as path:

        # smoke-test writing without a header and with a column subset
        frame.to_csv(path, header=False)
        frame.to_csv(path, columns=['A', 'B'])

        # round trip
        frame.to_csv(path)

        df = self.read_csv(path, index_col=[0, 1],
                           parse_dates=False)

        # TODO to_csv drops column name
        assert_frame_equal(frame, df, check_names=False)
        assert frame.index.names == df.index.names

        # needed if setUp becomes a class method
        self.frame.index = old_index

        # try multiindex with dates
        tsframe = self.tsframe
        old_index = tsframe.index
        new_index = [old_index, np.arange(len(old_index))]
        tsframe.index = MultiIndex.from_arrays(new_index)

        tsframe.to_csv(path, index_label=['time', 'foo'])
        recons = self.read_csv(path, index_col=[0, 1])

        # TODO to_csv drops column name
        assert_frame_equal(tsframe, recons, check_names=False)

        # do not load index
        # (the two index levels are then read back as ordinary columns)
        tsframe.to_csv(path)
        recons = self.read_csv(path, index_col=None)
        assert len(recons.columns) == len(tsframe.columns) + 2

        # no index
        tsframe.to_csv(path, index=False)
        recons = self.read_csv(path, index_col=None)
        assert_almost_equal(recons.values, self.tsframe.values)

        # needed if setUp becomes class method
        self.tsframe.index = old_index

    with ensure_clean('__tmp_to_csv_multiindex__') as path:
        # GH3571, GH1651, GH3141

        def _make_frame(names=None):
            # Build a random 3x3 int64 frame with two-level columns;
            # ``names=True`` selects the default level names.
            if names is True:
                names = ['first', 'second']
            return DataFrame(np.random.randint(0, 10, size=(3, 3)),
                             columns=MultiIndex.from_tuples(
                                 [('bah', 'foo'),
                                  ('bah', 'bar'),
                                  ('ban', 'baz')], names=names),
                             dtype='int64')

        # column & index are multi-index
        df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
        df.to_csv(path)
        result = read_csv(path, header=[0, 1, 2, 3],
                          index_col=[0, 1])
        assert_frame_equal(df, result)

        # column is mi
        df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
        df.to_csv(path)
        result = read_csv(
            path, header=[0, 1, 2, 3], index_col=0)
        assert_frame_equal(df, result)

        # dup column names?
        df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
        df.to_csv(path)
        result = read_csv(path, header=[0, 1, 2, 3],
                          index_col=[0, 1, 2])
        assert_frame_equal(df, result)

        # writing with no index
        df = _make_frame()
        df.to_csv(path, index=False)
        result = read_csv(path, header=[0, 1])
        assert_frame_equal(df, result)

        # we lose the names here
        df = _make_frame(True)
        df.to_csv(path, index=False)
        result = read_csv(path, header=[0, 1])
        assert com._all_none(*result.columns.names)
        result.columns.names = df.columns.names
        assert_frame_equal(df, result)

        # tupleize_cols=True and index=False
        df = _make_frame(True)
        with tm.assert_produces_warning(FutureWarning):
            df.to_csv(path, tupleize_cols=True, index=False)

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = read_csv(path, header=0,
                              tupleize_cols=True,
                              index_col=None)
        result.columns = df.columns
        assert_frame_equal(df, result)

        # whatsnew example
        df = _make_frame()
        df.to_csv(path)
        result = read_csv(path, header=[0, 1],
                          index_col=[0])
        assert_frame_equal(df, result)

        df = _make_frame(True)
        df.to_csv(path)
        result = read_csv(path, header=[0, 1],
                          index_col=[0])
        assert_frame_equal(df, result)

        # column & index are multi-index (compatibility)
        df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
        with tm.assert_produces_warning(FutureWarning):
            df.to_csv(path, tupleize_cols=True)

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = read_csv(path, header=0, index_col=[0, 1],
                              tupleize_cols=True)
        result.columns = df.columns
        assert_frame_equal(df, result)

        # invalid options
        df = _make_frame(True)
        df.to_csv(path)

        # asking for more header rows than the file has lines must fail
        for i in [6, 7]:
            msg = 'len of {i}, but only 5 lines in file'.format(i=i)
            with tm.assert_raises_regex(ParserError, msg):
                read_csv(path, header=lrange(i), index_col=0)

        # write with cols
        with tm.assert_raises_regex(TypeError, 'cannot specify cols '
                                    'with a MultiIndex'):
            df.to_csv(path, columns=['foo', 'bar'])

    with ensure_clean('__tmp_to_csv_multiindex__') as path:
        # empty
        tsframe[:0].to_csv(path)
        recons = self.read_csv(path)

        exp = tsframe[:0]
        exp.index = []

        tm.assert_index_equal(recons.columns, exp.columns)
        assert len(recons) == 0