def test_excel_cell_error_na(self):
    # Parse Sheet1 of test3.xls and expect a single NaN cell under 'Test'.
    _skip_if_no_xlrd()

    workbook = ExcelFile(os.path.join(self.dirpath, 'test3.xls'))
    result = workbook.parse('Sheet1')

    wanted = DataFrame([[np.nan]], columns=['Test'])
    tm.assert_frame_equal(result, wanted)
def test_sheets(self):
    # Write two frames to separate sheets of one workbook, then read both
    # back by sheet name and check the sheet names/order.
    _skip_if_no_xlrd()
    target = '__tmp_to_excel_from_excel_sheets__.' + self.ext

    with ensure_clean(target) as path:
        self.frame['A'][:5] = nan

        # Plain writes with several option combinations.
        for options in ({}, {'cols': ['A', 'B']}, {'header': False},
                        {'index': False}):
            self.frame.to_excel(path, 'test1', **options)

        # Test writing to separate sheets
        sheets = (('test1', self.frame), ('test2', self.tsframe))
        writer = ExcelWriter(path)
        for sheet_name, data in sheets:
            data.to_excel(writer, sheet_name)
        writer.save()

        reader = ExcelFile(path)
        for sheet_name, data in sheets:
            recons = reader.parse(sheet_name, index_col=0)
            tm.assert_frame_equal(data, recons)

        np.testing.assert_equal(2, len(reader.sheet_names))
        np.testing.assert_equal('test1', reader.sheet_names[0])
        np.testing.assert_equal('test2', reader.sheet_names[1])
def test_excel_stop_iterator(self):
    # Sheet1 of test2.xls is expected to hold one data row under the
    # headers "Test" and "Test1".
    _skip_if_no_xlrd()

    source = os.path.join(self.dirpath, "test2.xls")
    result = ExcelFile(source).parse("Sheet1")

    wanted = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"])
    tm.assert_frame_equal(result, wanted)
def test_to_excel_unicode_filename(self):
    # Write and re-read a workbook whose file name contains a non-ASCII
    # character; skipped when the file system rejects such names.
    _skip_if_no_xlrd()
    filename = u('\u0192u.') + self.ext

    # Probe: can this system create a file with that name at all?
    try:
        probe = open(filename, 'wb')
    except UnicodeEncodeError:
        raise nose.SkipTest('no unicode file names on this system')
    else:
        probe.close()

    row_labels = ['A', 'B']
    col_labels = ['X', 'Y', 'Z']
    df = DataFrame([[0.123456, 0.234567, 0.567567],
                    [12.32112, 123123.2, 321321.2]],
                   index=row_labels, columns=col_labels)

    with ensure_clean(filename) as filename:
        df.to_excel(filename, 'test1', float_format='%.2f')

        rs = ExcelFile(filename).parse('test1', index_col=None)
        xp = DataFrame([[0.12, 0.23, 0.57],
                        [12.32, 123123.20, 321321.20]],
                       index=row_labels, columns=col_labels)
        tm.assert_frame_equal(rs, xp)
def test_to_excel_multiindex_dates(self):
    # Round-trip self.tsframe with a (datetime, int) MultiIndex, first with
    # explicit index labels and then letting the reader infer the index.
    #
    # self.tsframe is mutated in place, so its original index is restored
    # in a ``finally`` clause: a failing assertion must not leak the
    # MultiIndex into other tests sharing the fixture.
    _skip_if_no_xlrd()
    ext = self.ext
    path = '__tmp_to_excel_multiindex_dates__' + ext + '__.' + ext

    # try multiindex with dates
    tsframe = self.tsframe
    old_index = tsframe.index
    new_index = [old_index, np.arange(len(old_index))]
    tsframe.index = MultiIndex.from_arrays(new_index)

    try:
        with ensure_clean(path) as path:
            tsframe.to_excel(path, 'test1', index_label=['time', 'foo'])
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=[0, 1])
            tm.assert_frame_equal(tsframe, recons, check_names=False)
            # assertEqual, not the deprecated assertEquals alias.
            self.assertEqual(recons.index.names, ('time', 'foo'))

            # infer index
            tsframe.to_excel(path, 'test1')
            reader = ExcelFile(path)
            recons = reader.parse('test1')
            tm.assert_frame_equal(tsframe, recons)
    finally:
        self.tsframe.index = old_index  # needed if setUP becomes classmethod
def test_to_excel_unicode_filename(self):
    # For both xls and xlsx: write and re-read a workbook whose file name
    # contains a non-ASCII character. Skipped when the file system rejects
    # such names.
    _skip_if_no_excelsuite()

    row_labels = ["A", "B"]
    col_labels = ["X", "Y", "Z"]

    for ext in ["xls", "xlsx"]:
        filename = u"\u0192u." + ext

        # Probe: can this system create a file with that name at all?
        try:
            probe = open(filename, "wb")
        except UnicodeEncodeError:
            raise nose.SkipTest("no unicode file names on this system")
        else:
            probe.close()

        df = DataFrame(
            [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
            index=row_labels,
            columns=col_labels,
        )

        with ensure_clean(filename) as filename:
            df.to_excel(filename, "test1", float_format="%.2f")

            rs = ExcelFile(filename).parse("test1", index_col=None)
            xp = DataFrame(
                [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]],
                index=row_labels,
                columns=col_labels,
            )
            tm.assert_frame_equal(rs, xp)
def test_excel_table_sheet_by_index(self, read_ext, df_ref):
    # Select sheets by integer position through both pd.read_excel and
    # ExcelFile.parse, check skipfooter (and the deprecated skip_footer
    # spelling), and check that an unknown sheet name raises.
    #
    # The workbook is opened in a ``with`` block so the underlying file
    # handle is closed even when an assertion fails.
    with ExcelFile('test1' + read_ext) as excel:
        df1 = pd.read_excel(excel, 0, index_col=0)
        df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0)
        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)

        df1 = excel.parse(0, index_col=0)
        df2 = excel.parse(1, skiprows=[1], index_col=0)
        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)

        df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1)
        tm.assert_frame_equal(df3, df1.iloc[:-1])

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1)
            tm.assert_frame_equal(df3, df4)

        df3 = excel.parse(0, index_col=0, skipfooter=1)
        tm.assert_frame_equal(df3, df1.iloc[:-1])

        import xlrd  # will move to engine-specific tests as new ones are added
        with pytest.raises(xlrd.XLRDError):
            pd.read_excel(excel, 'asdf')
def test_excel_stop_iterator(self):
    # Sheet1 of test2.xls is expected to hold one data row under the
    # headers 'Test' and 'Test1'.
    _skip_if_no_xlrd()

    workbook_path = os.path.join(self.dirpath, 'test2.xls')
    workbook = ExcelFile(workbook_path)
    result = workbook.parse('Sheet1')

    tm.assert_frame_equal(
        result,
        DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']))
def _check_extension_mixed(self, ext):
    # Round-trip self.mixed_frame through a temporary workbook with file
    # extension ``ext`` and check it is read back unchanged.
    target = "__tmp_to_excel_from_excel_mixed__." + ext

    with ensure_clean(target) as tmp_path:
        self.mixed_frame.to_excel(tmp_path, "test1")
        workbook = ExcelFile(tmp_path)
        roundtripped = workbook.parse("test1", index_col=0)
        tm.assert_frame_equal(self.mixed_frame, roundtripped)
def test_mixed(self):
    # Round-trip self.mixed_frame through a temporary workbook.
    _skip_if_no_xlrd()

    with ensure_clean(self.ext) as tmp_path:
        self.mixed_frame.to_excel(tmp_path, 'test1')
        workbook = ExcelFile(tmp_path)
        roundtripped = workbook.parse('test1', index_col=0)
        tm.assert_frame_equal(self.mixed_frame, roundtripped)
def test_inf_roundtrip(self):
    # A frame containing +inf and -inf must be read back unchanged.
    _skip_if_no_xlrd()

    rows = [(1, np.inf), (2, 3), (5, -np.inf)]
    written = DataFrame(rows)

    with ensure_clean(self.ext) as tmp_path:
        written.to_excel(tmp_path, 'test1')
        read_back = ExcelFile(tmp_path).parse('test1')
        tm.assert_frame_equal(written, read_back)
def _check_extension_tsframe(self, ext):
    # Round-trip the first five rows of a generated time-series frame
    # through a temporary workbook with file extension ``ext``.
    target = "__tmp_to_excel_from_excel_tsframe__." + ext
    written = tm.makeTimeDataFrame()[:5]

    with ensure_clean(target) as tmp_path:
        written.to_excel(tmp_path, "test1")
        read_back = ExcelFile(tmp_path).parse("test1")
        tm.assert_frame_equal(written, read_back)
def test_tsframe(self):
    # Round-trip the first five rows of a generated time-series frame.
    _skip_if_no_xlrd()

    written = tm.makeTimeDataFrame()[:5]

    with ensure_clean(self.ext) as tmp_path:
        written.to_excel(tmp_path, 'test1')
        workbook = ExcelFile(tmp_path)
        read_back = workbook.parse('test1')
        tm.assert_frame_equal(written, read_back)
class ExcelExtractor(Extractor):
    '''
    An extractor for excel files. One sheet only for now.
    Expects column names in first row, rest of rows mapped 1:1 to incoming
    table rows. Unique identifier (or unique for domain) in first column.
    '''

    def __init__(self, incoming_table_class, file_name):
        '''
        Open the workbook ``file_name`` (resolved against ``conf.INPUT_DIR``)
        and initialise the base extractor with ``incoming_table_class``.
        '''
        self._incoming_table_class = incoming_table_class
        self.file_name = file_name
        self.workbook = ExcelFile(os.path.join(conf.INPUT_DIR, file_name))
        super(ExcelExtractor, self).__init__(self._incoming_table_class)

    def _get_workbook_rowdicts(self):
        '''
        returns list of key-value dicts for all rows in sheet, with keys in
        first row. empty values (those failing ``notnull``) are removed.
        '''
        rows = self.workbook.parse().to_dict(outtype='records')
        return [dict((k, v) for k, v in row.iteritems() if notnull(v))
                for row in rows]

    def _get_workbook_keys(self):
        '''
        returns the column keys of the parsed sheet (taken from its first
        row)
        '''
        return self.workbook.parse().to_dict().keys()

    def do_extract(self):
        '''
        Create one incoming-table record per sheet row. Values whose key is
        in ``self._get_db_cols`` are passed as columns; every other value is
        converted with ``unicode`` and collected into a dict stored under
        the ``self._get_hstore_db_col`` key.
        '''
        # Each _get_workbook_keys() call re-parses the whole sheet, so fetch
        # the header once and partition it, instead of parsing it twice.
        workbook_keys = list(self._get_workbook_keys())
        db_cols = self._get_db_cols
        # Sets give constant-time membership tests in the per-row loop.
        db_col_keys = set(k for k in workbook_keys if k in db_cols)
        hstore_keys = set(k for k in workbook_keys if k not in db_cols)

        for row in self._get_workbook_rowdicts():
            insert_dict = dict((k, v) for k, v in row.iteritems()
                               if k in db_col_keys)
            hstore_col_dict = dict((k, unicode(v))
                                   for k, v in row.iteritems()
                                   if k in hstore_keys)
            insert_dict[self._get_hstore_db_col] = hstore_col_dict
            self._incoming_table_class.create(**insert_dict)

    def do_cleanup(self):
        '''
        Delete all records from the incoming data table and log how many
        were removed.
        '''
        delete_q = self._incoming_table_class.delete()
        rows = delete_q.execute()
        logger.info('Deleted %d records in incoming data table %s' %
                    (rows, self._incoming_table_class._meta.db_table))
def test_excel_passes_na(self):
    # keep_default_na / na_values must be forwarded by ExcelFile.parse:
    # with keep_default_na=False the literal "NA" cells survive, with
    # keep_default_na=True they become NaN. "apple" is always treated as NA.
    _skip_if_no_xlrd()

    workbook = ExcelFile(os.path.join(self.dirpath, "test2.xlsx"))

    # (keep_default_na, expected value of the first and third cell)
    cases = ((False, "NA"), (True, np.nan))
    for keep_default, na_cell in cases:
        result = workbook.parse(
            "Sheet1", keep_default_na=keep_default, na_values=["apple"]
        )
        wanted = DataFrame(
            [[na_cell], [1], [na_cell], [np.nan], ["rabbit"]], columns=["Test"]
        )
        tm.assert_frame_equal(result, wanted)
def test_sheet_name(self, read_ext, df_ref):
    # ExcelFile.parse must accept sheet_name as a keyword, in either
    # keyword order. The workbook is opened in a ``with`` block so its file
    # handle is closed before the comparisons run.
    filename = "test1"
    sheet_name = "Sheet1"

    with ExcelFile(filename + read_ext) as excel:
        df1_parse = excel.parse(sheet_name=sheet_name, index_col=0)  # doc
        df2_parse = excel.parse(index_col=0, sheet_name=sheet_name)

    tm.assert_frame_equal(df1_parse, df_ref, check_names=False)
    tm.assert_frame_equal(df2_parse, df_ref, check_names=False)
def test_float_types(self):
    # Round-trip a random column for each NumPy float width.
    _skip_if_no_xlrd()

    float_types = (np.float16, np.float32, np.float64)
    for np_type in float_types:
        with ensure_clean(self.ext) as tmp_path:
            # Test np.float values read come back as float.
            written = DataFrame(np.random.random_sample(10), dtype=np_type)
            written.to_excel(tmp_path, 'test1')

            workbook = ExcelFile(tmp_path)
            read_back = workbook.parse('test1').astype(np_type)
            tm.assert_frame_equal(written, read_back, check_dtype=False)
def test_bool_types(self):
    # Round-trip a boolean column for each NumPy bool type.
    _skip_if_no_xlrd()

    bool_types = (np.bool8, np.bool_)
    for np_type in bool_types:
        with ensure_clean(self.ext) as tmp_path:
            # Test np.bool values read come back as float.
            written = DataFrame([1, 0, True, False], dtype=np_type)
            written.to_excel(tmp_path, 'test1')

            workbook = ExcelFile(tmp_path)
            read_back = workbook.parse('test1').astype(np_type)
            tm.assert_frame_equal(written, read_back)
def test_to_excel_periodindex(self):
    # Write a monthly period-resampled frame and compare it with the parsed
    # sheet converted back to monthly periods.
    _skip_if_no_xlrd()

    monthly = self.tsframe.resample('M', kind='period')

    with ensure_clean(self.ext) as tmp_path:
        monthly.to_excel(tmp_path, 'sht1')

        workbook = ExcelFile(tmp_path)
        parsed = workbook.parse('sht1', index_col=0, parse_dates=True)
        tm.assert_frame_equal(monthly, parsed.to_period('M'))
def check_excel_sheet_by_name_raise(self, ext):
    # Sheet 0 (by position) round-trips, while the *name* '0' does not
    # exist and must raise xlrd.XLRDError.
    import xlrd

    target = os.path.join(self.dirpath, 'testit.{0}'.format(ext))

    with ensure_clean(target) as tmp_path:
        written = DataFrame(np.random.randn(10, 2))
        written.to_excel(tmp_path)

        workbook = ExcelFile(tmp_path)
        by_position = workbook.parse(0)
        tm.assert_frame_equal(written, by_position)

        self.assertRaises(xlrd.XLRDError, workbook.parse, '0')
def test_excel_sheet_by_name_raise(self):
    # Sheet 0 (by position) round-trips, while the *name* '0' does not
    # exist and must raise xlrd.XLRDError.
    _skip_if_no_xlrd()
    import xlrd

    with ensure_clean(self.ext) as tmp_path:
        written = DataFrame(np.random.randn(10, 2))
        written.to_excel(tmp_path)

        workbook = ExcelFile(tmp_path)
        by_position = workbook.parse(0)
        tm.assert_frame_equal(written, by_position)

        self.assertRaises(xlrd.XLRDError, workbook.parse, '0')
def test_excel_roundtrip_datetime(self):
    # Write a copy of self.tsframe indexed by datetime.date objects and
    # expect the parsed sheet to equal the original self.tsframe.
    _skip_if_no_xlrd()

    # datetime.date, not sure what to test here exactly
    date_indexed = self.tsframe.copy()

    with ensure_clean(self.ext) as tmp_path:
        date_indexed.index = [stamp.date() for stamp in self.tsframe.index]
        date_indexed.to_excel(tmp_path, 'test1',
                              merge_cells=self.merge_cells)

        workbook = ExcelFile(tmp_path)
        read_back = workbook.parse('test1')
        tm.assert_frame_equal(self.tsframe, read_back)
def test_stringio_writer(self):
    # Write self.frame into an in-memory BytesIO buffer with the xlsxwriter
    # engine and read it back from the same buffer.
    _skip_if_no_xlsxwriter()
    _skip_if_no_xlrd()

    buf = BytesIO()
    with ExcelWriter(buf, engine='xlsxwriter',
                     options={'in-memory': True}) as ew:
        self.frame.to_excel(ew, 'test1', engine='xlsxwriter')
        ew.save()

        # Rewind so the reader sees the workbook from its first byte.
        buf.seek(0)
        workbook = ExcelFile(buf)
        read_back = workbook.parse('test1')
        tm.assert_frame_equal(self.frame, read_back)
    buf.close()
def test_excel_read_buffer(self):
    # ExcelFile must accept an already-open binary file object for both
    # the .xls and the .xlsx sample workbook.
    #
    # The files are opened with ``with`` so the handles are closed again;
    # parsing happens inside the block, while the buffer is still open.
    _skip_if_no_xlrd()
    _skip_if_no_openpyxl()

    for name in ("test.xls", "test.xlsx"):
        pth = os.path.join(self.dirpath, name)
        with open(pth, "rb") as f:
            xls = ExcelFile(f)
            # it works
            xls.parse("Sheet1", index_col=0, parse_dates=True)
def test_to_excel_periodindex(self):
    # For both xls and xlsx: write a monthly period-resampled frame and
    # compare it with the parsed sheet converted back to monthly periods.
    _skip_if_no_excelsuite()

    for ext in ["xls", "xlsx"]:
        target = "__tmp_to_excel_periodindex__." + ext
        monthly = self.tsframe.resample("M", kind="period")

        with ensure_clean(target) as tmp_path:
            monthly.to_excel(tmp_path, "sht1")

            workbook = ExcelFile(tmp_path)
            parsed = workbook.parse("sht1", index_col=0, parse_dates=True)
            tm.assert_frame_equal(monthly, parsed.to_period("M"))
def test_excel_roundtrip_indexname(self):
    # The index name 'foo' must survive a write/read round trip.
    _skip_if_no_xlrd()

    written = DataFrame(np.random.randn(10, 4))
    written.index.name = 'foo'

    with ensure_clean(self.ext) as tmp_path:
        written.to_excel(tmp_path)

        workbook = ExcelFile(tmp_path)
        first_sheet = workbook.sheet_names[0]
        result = workbook.parse(first_sheet, index_col=0)

        tm.assert_frame_equal(result, written)
        self.assertEqual(result.index.name, 'foo')
def test_to_excel_periodindex(self):
    # For both xls and xlsx: write a monthly period-resampled frame and
    # compare it with the parsed sheet converted back to monthly periods.
    _skip_if_no_excelsuite()

    extensions = ['xls', 'xlsx']
    for ext in extensions:
        target = '__tmp_to_excel_periodindex__.' + ext
        source = self.tsframe
        expected_periods = source.resample('M', kind='period')

        with ensure_clean(target) as tmp_path:
            expected_periods.to_excel(tmp_path, 'sht1')
            parsed = ExcelFile(tmp_path).parse('sht1', index_col=0,
                                               parse_dates=True)
            tm.assert_frame_equal(expected_periods, parsed.to_period('M'))
def test_excel_roundtrip_datetime(self):
    # Write a copy of self.tsframe indexed by datetime.date objects to an
    # .xls file and expect the parsed sheet to equal the original frame.
    _skip_if_no_xlrd()
    _skip_if_no_xlwt()

    # datetime.date, not sure what to test here exactly
    target = "__tmp_excel_roundtrip_datetime__.xls"
    date_indexed = self.tsframe.copy()

    with ensure_clean(target) as tmp_path:
        date_indexed.index = [stamp.date() for stamp in self.tsframe.index]
        date_indexed.to_excel(tmp_path, "test1")

        workbook = ExcelFile(tmp_path)
        read_back = workbook.parse("test1")
        tm.assert_frame_equal(self.tsframe, read_back)
def test_parse_cols_int(self):
    # parse_cols=3 on both test.xls and test.xlsx must match the CSV
    # reference restricted to columns A, B and C.
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    shared = dict(index_col=0, parse_dates=True, parse_cols=3)

    for s in ["", "x"]:
        pth = os.path.join(self.dirpath, "test.xls%s" % s)
        workbook = ExcelFile(pth)

        from_sheet1 = workbook.parse("Sheet1", **shared)
        reference = self.read_csv(self.csv1, index_col=0, parse_dates=True)
        reference = reference.reindex(columns=["A", "B", "C"])
        from_sheet2 = workbook.parse("Sheet2", skiprows=[1], **shared)

        # TODO add index to xls file)
        tm.assert_frame_equal(from_sheet1, reference, check_names=False)
        tm.assert_frame_equal(from_sheet2, reference, check_names=False)
def test_excel_passes_na(self):
    # keep_default_na / na_values must be forwarded by ExcelFile.parse.
    _skip_if_no_xlrd()

    workbook = ExcelFile(os.path.join(self.dirpath, 'test2.xlsx'))
    extra_na = ['apple']

    # Default NA strings disabled: the literal 'NA' cells survive.
    without_defaults = workbook.parse('Sheet1', keep_default_na=False,
                                      na_values=extra_na)
    tm.assert_frame_equal(
        without_defaults,
        DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']],
                  columns=['Test']))

    # Default NA strings enabled: the 'NA' cells become NaN.
    with_defaults = workbook.parse('Sheet1', keep_default_na=True,
                                   na_values=extra_na)
    tm.assert_frame_equal(
        with_defaults,
        DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
                  columns=['Test']))
def test_float_types(self):
    # For each NumPy float width: exercise the writer options on
    # self.frame, then round-trip a random column of that dtype.
    _skip_if_no_xlrd()
    path = '__tmp_to_excel_from_excel_float_types__.' + self.ext

    writer_options = ({}, {'cols': ['A', 'B']}, {'header': False},
                      {'index': False})

    for np_type in (np.float16, np.float32, np.float64):
        # ``path`` is deliberately rebound to whatever ensure_clean yields,
        # so the next iteration starts from that value.
        with ensure_clean(path) as path:
            self.frame['A'][:5] = nan

            for options in writer_options:
                self.frame.to_excel(path, 'test1', **options)

            # Test np.float values read come back as float.
            written = DataFrame(np.random.random_sample(10), dtype=np_type)
            written.to_excel(path, 'test1')

            read_back = ExcelFile(path).parse('test1').astype(np_type)
            tm.assert_frame_equal(written, read_back, check_dtype=False)
def test_bool_types(self):
    # For each NumPy bool type: exercise the writer options on self.frame,
    # then round-trip a boolean column of that dtype.
    _skip_if_no_xlrd()
    path = '__tmp_to_excel_from_excel_bool_types__.' + self.ext

    writer_options = ({}, {'cols': ['A', 'B']}, {'header': False},
                      {'index': False})

    for np_type in (np.bool8, np.bool_):
        # ``path`` is deliberately rebound to whatever ensure_clean yields,
        # so the next iteration starts from that value.
        with ensure_clean(path) as path:
            self.frame['A'][:5] = nan

            for options in writer_options:
                self.frame.to_excel(path, 'test1', **options)

            # Test np.bool values read come back as float.
            written = DataFrame([1, 0, True, False], dtype=np_type)
            written.to_excel(path, 'test1')

            read_back = ExcelFile(path).parse('test1').astype(np_type)
            tm.assert_frame_equal(written, read_back)
def test_ts_frame(self, tsframe, path):
    # Round-trip the time-series fixture after dropping the index freq.
    df = tsframe

    # freq doesn't round-trip
    df.index = pd.DatetimeIndex(np.asarray(df.index), freq=None)

    df.to_excel(path, "test1")

    with ExcelFile(path) as workbook:
        read_back = pd.read_excel(workbook, sheet_name="test1", index_col=0)
    tm.assert_frame_equal(df, read_back)
def test_float_types(self, np_type, path):
    # Test np.float values read come back as float.
    written = DataFrame(np.random.random_sample(10), dtype=np_type)
    written.to_excel(path, "test1")

    with ExcelFile(path) as workbook:
        parsed = pd.read_excel(workbook, sheet_name="test1", index_col=0)
        read_back = parsed.astype(np_type)

    tm.assert_frame_equal(written, read_back)
def _check_excel_multiindex(self, ext):
    # Round-trip self.frame with a two-level integer MultiIndex through a
    # temporary workbook with file extension ``ext``.
    #
    # self.frame is mutated in place, so its original index is restored in
    # a ``finally`` clause: a failing assertion must not leak the
    # MultiIndex into other tests sharing the fixture.
    path = '__tmp_to_excel_multiindex__' + ext + '__.' + ext

    frame = self.frame
    old_index = frame.index
    arrays = np.arange(len(old_index) * 2).reshape(2, -1)
    new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
    frame.index = new_index

    try:
        with ensure_clean(path) as path:
            frame.to_excel(path, 'test1', header=False)
            frame.to_excel(path, 'test1', cols=['A', 'B'])

            # round trip
            frame.to_excel(path, 'test1')
            reader = ExcelFile(path)
            df = reader.parse('test1', index_col=[0, 1], parse_dates=False)
            tm.assert_frame_equal(frame, df)
            self.assertEqual(frame.index.names, df.index.names)
    finally:
        self.frame.index = old_index  # needed if setUP becomes a classmethod
def test_excel_roundtrip_indexname(self, merge_cells, path):
    # The index name "foo" must survive a write/read round trip. The
    # workbook is read inside a ``with`` block so its file handle is
    # closed afterwards.
    df = DataFrame(np.random.randn(10, 4))
    df.index.name = "foo"

    df.to_excel(path, merge_cells=merge_cells)

    with ExcelFile(path) as xf:
        result = pd.read_excel(xf, xf.sheet_names[0], index_col=0)

    tm.assert_frame_equal(result, df)
    assert result.index.name == "foo"
def test_excel_roundtrip_datetime(self, merge_cells, tsframe, path):
    # Write a copy of the fixture indexed by datetime.date objects and
    # expect the sheet read back to equal the original fixture. The
    # workbook is read inside a ``with`` block so its file handle is
    # closed afterwards.

    # datetime.date, not sure what to test here exactly
    tsf = tsframe.copy()

    tsf.index = [x.date() for x in tsframe.index]
    tsf.to_excel(path, "test1", merge_cells=merge_cells)

    with ExcelFile(path) as reader:
        recons = pd.read_excel(reader, "test1", index_col=0)

    tm.assert_frame_equal(tsframe, recons)
def test_bool_types(self, np_type, path):
    # Test np.bool8 and np.bool_ values read come back as float.
    written = DataFrame([1, 0, True, False], dtype=np_type)
    written.to_excel(path, "test1")

    with ExcelFile(path) as workbook:
        parsed = pd.read_excel(workbook, sheet_name="test1", index_col=0)
        read_back = parsed.astype(np_type)

    tm.assert_frame_equal(written, read_back)
def test_reader_closes_file(self):
    # Leaving the ExcelFile context must close the file object handed to
    # it. The handle is opened manually (not via ``with open``) because
    # its closed state is exactly what is asserted.
    _skip_if_no_xlrd()
    _skip_if_no_openpyxl()

    handle = open(os.path.join(self.dirpath, 'test.xlsx'), 'rb')
    with ExcelFile(handle) as workbook:
        # parses okay
        parsed = workbook.parse('Sheet1', index_col=0)

    self.assertTrue(handle.closed)
def test_to_excel_multiindex(self):
    # Round-trip self.frame with a two-level integer MultiIndex and check
    # that the index names survive.
    _skip_if_no_xlrd()

    frame = self.frame
    level_values = np.arange(len(frame.index) * 2).reshape(2, -1)
    frame.index = MultiIndex.from_arrays(level_values,
                                         names=['first', 'second'])

    with ensure_clean(self.ext) as tmp_path:
        frame.to_excel(tmp_path, 'test1', header=False)
        frame.to_excel(tmp_path, 'test1', cols=['A', 'B'])

        # round trip
        frame.to_excel(tmp_path, 'test1', merge_cells=self.merge_cells)
        workbook = ExcelFile(tmp_path)
        read_back = workbook.parse('test1', index_col=[0, 1],
                                   parse_dates=False,
                                   has_index_names=self.merge_cells)

        tm.assert_frame_equal(frame, read_back)
        self.assertEqual(frame.index.names, read_back.index.names)
def test_excel_sheet_by_name_raise(self, path, engine):
    # Sheet 0 (by position) round-trips, while the *name* "0" does not
    # exist and must raise ValueError with a descriptive message.
    written = DataFrame(np.random.randn(10, 2))
    written.to_excel(path)

    with ExcelFile(path) as xl:
        by_position = pd.read_excel(xl, sheet_name=0, index_col=0)

    tm.assert_frame_equal(written, by_position)

    with pytest.raises(ValueError, match="Worksheet named '0' not found"):
        pd.read_excel(xl, "0")
def test_to_excel_multiindex_dates(self, merge_cells, tsframe, path):
    # Round-trip the fixture with a (datetime, int) MultiIndex named
    # ("time", "foo"). The workbook is read inside a ``with`` block so its
    # file handle is closed afterwards.

    # try multiindex with dates
    new_index = [tsframe.index, np.arange(len(tsframe.index))]
    tsframe.index = MultiIndex.from_arrays(new_index)

    tsframe.index.names = ["time", "foo"]
    tsframe.to_excel(path, "test1", merge_cells=merge_cells)

    with ExcelFile(path) as reader:
        recons = pd.read_excel(reader, "test1", index_col=[0, 1])

    tm.assert_frame_equal(tsframe, recons)
    assert recons.index.names == ("time", "foo")
def test_to_excel_empty_multiindex(self, path):
    # GH 19543.
    # An empty frame with an empty two-level MultiIndex is expected to be
    # read back as an empty frame with columns 0, 1, 2.
    wanted = DataFrame([], columns=[0, 1, 2])

    empty_index = MultiIndex.from_tuples([], names=[0, 1])
    written = DataFrame([], index=empty_index, columns=[2])
    written.to_excel(path, "test1")

    with ExcelFile(path) as workbook:
        result = pd.read_excel(workbook, sheet_name="test1")

    tm.assert_frame_equal(
        result, wanted, check_index_type=False, check_dtype=False
    )
def test_colaliases(self):
    # Writing with ``header=<Index>`` must store those aliases as the
    # column names of the sheet.
    _skip_if_no_xlrd()
    target = '__tmp_to_excel_from_excel_aliases__.' + self.ext

    with ensure_clean(target) as tmp_path:
        self.frame['A'][:5] = nan

        # Plain writes with several option combinations.
        for options in ({}, {'cols': ['A', 'B']}, {'header': False},
                        {'index': False}):
            self.frame.to_excel(tmp_path, 'test1', **options)

        # column aliases
        col_aliases = Index(['AA', 'X', 'Y', 'Z'])
        self.frame2.to_excel(tmp_path, 'test1', header=col_aliases)

        read_back = ExcelFile(tmp_path).parse('test1', index_col=0)

        wanted = self.frame2.copy()
        wanted.columns = col_aliases
        tm.assert_frame_equal(wanted, read_back)
def test_excel_read_buffer(self):
    # ExcelFile must accept an already-open binary file object for both
    # the .xls and the .xlsx sample workbook.
    #
    # The files are opened with ``with`` so the handles are closed again;
    # parsing happens inside the block, while the buffer is still open.
    _skip_if_no_xlrd()
    _skip_if_no_openpyxl()

    pth = os.path.join(self.dirpath, 'test.xls')
    with open(pth, 'rb') as f:
        xls = ExcelFile(f)
        # it works
        xls.parse('Sheet1', index_col=0, parse_dates=True)

    pth = os.path.join(self.dirpath, 'test.xlsx')
    with open(pth, 'rb') as f:
        xl = ExcelFile(f)
        xl.parse('Sheet1', index_col=0, parse_dates=True)
def test_to_excel_multiindex(self, merge_cells, engine, ext, frame):
    # Round-trip the fixture with a two-level integer MultiIndex. The
    # workbook is read inside a ``with`` block so its file handle is
    # closed afterwards.
    arrays = np.arange(len(frame.index) * 2).reshape(2, -1)
    new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
    frame.index = new_index

    frame.to_excel(self.path, 'test1', header=False)
    frame.to_excel(self.path, 'test1', columns=['A', 'B'])

    # round trip
    frame.to_excel(self.path, 'test1', merge_cells=merge_cells)
    with ExcelFile(self.path) as reader:
        df = pd.read_excel(reader, 'test1', index_col=[0, 1])

    tm.assert_frame_equal(frame, df)
def test_int_types(self):
    # Integer columns are read back as int by default and as float when
    # convert_float=False is passed.
    _skip_if_no_xlrd()

    int_types = (np.int8, np.int16, np.int32, np.int64)
    for np_type in int_types:
        with ensure_clean(self.ext) as tmp_path:
            # Test np.int values read come back as int (rather than float
            # which is Excel's format).
            values = np.random.randint(-10, 10, size=(10, 2))
            written = DataFrame(values, dtype=np_type)
            written.to_excel(tmp_path, 'test1')

            via_excelfile = ExcelFile(tmp_path).parse('test1')
            as_int = written.astype(int)
            tm.assert_frame_equal(as_int, via_excelfile)

            via_read_excel = read_excel(tmp_path, 'test1')
            tm.assert_frame_equal(as_int, via_read_excel)

            # test with convert_float=False comes back as float
            as_float = written.astype(float)
            unconverted = read_excel(tmp_path, 'test1', convert_float=False)
            tm.assert_frame_equal(unconverted, as_float)
def test_excelwriter_contextmanager(self):
    # Sheets written inside an ExcelWriter ``with`` block must be readable
    # once the block has exited.
    _skip_if_no_xlrd()

    sheets = (('Data1', self.frame), ('Data2', self.frame2))

    with ensure_clean(self.ext) as tmp_path:
        with ExcelWriter(tmp_path) as writer:
            for sheet_name, data in sheets:
                data.to_excel(writer, sheet_name)

        with ExcelFile(tmp_path) as reader:
            # Parse both sheets before comparing either of them.
            parsed = [reader.parse(sheet_name) for sheet_name, _ in sheets]
            for found, (_, data) in zip(parsed, sheets):
                tm.assert_frame_equal(found, data)
def test_to_excel_multiindex(self, merge_cells, frame, path):
    # Round-trip the fixture with a two-level integer MultiIndex. The
    # workbook is read inside a ``with`` block so its file handle is
    # closed afterwards.
    arrays = np.arange(len(frame.index) * 2).reshape(2, -1)
    new_index = MultiIndex.from_arrays(arrays, names=["first", "second"])
    frame.index = new_index

    frame.to_excel(path, "test1", header=False)
    frame.to_excel(path, "test1", columns=["A", "B"])

    # round trip
    frame.to_excel(path, "test1", merge_cells=merge_cells)
    with ExcelFile(path) as reader:
        df = pd.read_excel(reader, "test1", index_col=[0, 1])

    tm.assert_frame_equal(frame, df)
def test_excel_writer_context_manager(self, frame, path):
    # Two sheets written inside an ExcelWriter ``with`` block must be
    # readable once the block has exited.

    # Second frame: same data, column labels in reverse order.
    frame2 = frame.copy()
    frame2.columns = frame.columns[::-1]

    with ExcelWriter(path) as writer:
        frame.to_excel(writer, "Data1")
        frame2.to_excel(writer, "Data2")

    with ExcelFile(path) as reader:
        found_first = pd.read_excel(reader, "Data1", index_col=0)
        found_second = pd.read_excel(reader, "Data2", index_col=0)

        tm.assert_frame_equal(found_first, frame)
        tm.assert_frame_equal(found_second, frame2)
def test_excel_sheet_by_name_raise(self, path):
    # Sheet 0 (by position) round-trips, while the *name* "0" does not
    # exist and must raise xlrd.XLRDError. The workbook is opened in a
    # ``with`` block so its file handle is closed even when an assertion
    # fails.
    import xlrd

    gt = DataFrame(np.random.randn(10, 2))
    gt.to_excel(path)

    with ExcelFile(path) as xl:
        df = pd.read_excel(xl, 0, index_col=0)

        tm.assert_frame_equal(gt, df)

        with pytest.raises(xlrd.XLRDError):
            pd.read_excel(xl, "0")
def test_to_excel_multiindex_dates(self, merge_cells, engine, ext, tsframe):
    # Round-trip the fixture with a (datetime, int) MultiIndex named
    # ('time', 'foo'). The workbook is read inside a ``with`` block so its
    # file handle is closed afterwards.

    # try multiindex with dates
    new_index = [tsframe.index, np.arange(len(tsframe.index))]
    tsframe.index = MultiIndex.from_arrays(new_index)

    tsframe.index.names = ['time', 'foo']
    tsframe.to_excel(self.path, 'test1', merge_cells=merge_cells)

    with ExcelFile(self.path) as reader:
        recons = pd.read_excel(reader, 'test1', index_col=[0, 1])

    tm.assert_frame_equal(tsframe, recons)
    assert recons.index.names == ('time', 'foo')
def test_to_excel_unicode_filename(self):
    # Write and re-read a workbook whose file name contains a non-ASCII
    # character; skipped when the file system rejects such names.
    _skip_if_no_xlrd()

    row_labels = ['A', 'B']
    col_labels = ['X', 'Y', 'Z']

    with ensure_clean(u('\u0192u.') + self.ext) as filename:
        # Probe: can this system create a file with that name at all?
        try:
            probe = open(filename, 'wb')
        except UnicodeEncodeError:
            raise nose.SkipTest('no unicode file names on this system')
        else:
            probe.close()

        df = DataFrame([[0.123456, 0.234567, 0.567567],
                        [12.32112, 123123.2, 321321.2]],
                       index=row_labels, columns=col_labels)
        df.to_excel(filename, 'test1', float_format='%.2f')

        rs = ExcelFile(filename).parse('test1', index_col=None)
        xp = DataFrame([[0.12, 0.23, 0.57],
                        [12.32, 123123.20, 321321.20]],
                       index=row_labels, columns=col_labels)
        tm.assert_frame_equal(rs, xp)
def test_to_excel_float_format(self, engine, ext):
    # float_format="%.2f" must round the stored values to two decimals.
    # The workbook is read inside a ``with`` block so its file handle is
    # closed afterwards.
    df = DataFrame([[0.123456, 0.234567, 0.567567],
                    [12.32112, 123123.2, 321321.2]],
                   index=["A", "B"], columns=["X", "Y", "Z"])
    df.to_excel(self.path, "test1", float_format="%.2f")

    with ExcelFile(self.path) as reader:
        result = pd.read_excel(reader, "test1", index_col=0)

    expected = DataFrame([[0.12, 0.23, 0.57],
                          [12.32, 123123.20, 321321.20]],
                         index=["A", "B"], columns=["X", "Y", "Z"])
    tm.assert_frame_equal(result, expected)
def test_excel_date_datetime_format(self, engine, ext):
    # see gh-4133
    #
    # Excel output format strings
    #
    # Writes the same frame once with default formats and once with custom
    # date/datetime formats; both workbooks must read back identically.
    # Both readers are opened in a ``with`` block so their file handles are
    # closed before ensure_clean removes the temporary second file.
    df = DataFrame([[date(2014, 1, 31),
                     date(1999, 9, 24)],
                    [datetime(1998, 5, 26, 23, 33, 4),
                     datetime(2014, 2, 28, 13, 5, 13)]],
                   index=["DATE", "DATETIME"], columns=["X", "Y"])
    df_expected = DataFrame([[datetime(2014, 1, 31),
                              datetime(1999, 9, 24)],
                             [datetime(1998, 5, 26, 23, 33, 4),
                              datetime(2014, 2, 28, 13, 5, 13)]],
                            index=["DATE", "DATETIME"], columns=["X", "Y"])

    with ensure_clean(ext) as filename2:
        writer1 = ExcelWriter(self.path)
        writer2 = ExcelWriter(filename2,
                              date_format="DD.MM.YYYY",
                              datetime_format="DD.MM.YYYY HH-MM-SS")

        df.to_excel(writer1, "test1")
        df.to_excel(writer2, "test1")

        writer1.close()
        writer2.close()

        with ExcelFile(self.path) as reader1, \
                ExcelFile(filename2) as reader2:
            rs1 = pd.read_excel(reader1, "test1", index_col=0)
            rs2 = pd.read_excel(reader2, "test1", index_col=0)

        tm.assert_frame_equal(rs1, rs2)

    # Since the reader returns a datetime object for dates,
    # we need to use df_expected to check the result.
    tm.assert_frame_equal(rs2, df_expected)
def test_excel_passes_na(self, read_ext):
    # keep_default_na / na_values must be forwarded when reading from an
    # ExcelFile. Each workbook is opened in a ``with`` block so its file
    # handle is closed once both reads are done.
    with ExcelFile('test4' + read_ext) as excel:
        parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False,
                               na_values=['apple'])
        expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

        parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True,
                               na_values=['apple'])
        expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

    # 13967
    with ExcelFile('test5' + read_ext) as excel:
        parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False,
                               na_values=['apple'])
        expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan],
                              ['rabbit']], columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

        parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True,
                               na_values=['apple'])
        expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)
def test_excel_file_warning_with_xlsx_file(datapath):
    # GH 29375
    # With openpyxl available, reading an .xlsx file with engine=None must
    # not warn; without it, constructing ExcelFile must emit the xlrd
    # FutureWarning.
    path = datapath("io", "data", "excel", "test1.xlsx")
    openpyxl = import_optional_dependency("openpyxl", errors="ignore")

    if openpyxl is not None:
        with tm.assert_produces_warning(None):
            pd.read_excel(path, "Sheet1", engine=None)
        return

    with tm.assert_produces_warning(
        FutureWarning,
        raise_on_extra_warnings=False,
        match="The xlrd engine is no longer maintained",
    ):
        ExcelFile(path, engine=None)
def test_to_excel_interval_no_labels(self, path):
    # see gh-19242
    #
    # Test writing Interval without labels.
    values = np.random.randint(-10, 10, size=(20, 1))
    written = DataFrame(values, dtype=np.int64)
    wanted = written.copy()

    # The interval column is expected back as its string representation.
    written["new"] = pd.cut(written[0], 10)
    wanted["new"] = pd.cut(wanted[0], 10).astype(str)

    written.to_excel(path, "test1")

    with ExcelFile(path) as workbook:
        read_back = pd.read_excel(workbook, sheet_name="test1", index_col=0)
    tm.assert_frame_equal(wanted, read_back)
def test_parse_cols_int(self):
    # parse_cols=3 on both test.xls and test.xlsx must match the CSV
    # reference restricted to columns A, B and C.
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    wanted_columns = ['A', 'B', 'C']

    for s in ('', 'x'):
        pth = os.path.join(self.dirpath, 'test.xls%s' % s)
        workbook = ExcelFile(pth)

        from_sheet1 = workbook.parse('Sheet1', index_col=0,
                                     parse_dates=True, parse_cols=3)
        reference = self.read_csv(self.csv1, index_col=0, parse_dates=True)
        reference = reference.reindex(columns=wanted_columns)
        from_sheet2 = workbook.parse('Sheet2', skiprows=[1], index_col=0,
                                     parse_dates=True, parse_cols=3)

        # TODO add index to xls file)
        tm.assert_frame_equal(from_sheet1, reference, check_names=False)
        tm.assert_frame_equal(from_sheet2, reference, check_names=False)