def _check_extension_sheets(self, ext):
    """Round-trip two frames through separate sheets of one workbook.

    ``ext`` is the file extension (without the dot) appended to the
    temporary file name.  The temporary file is removed even when a write
    or an assertion fails, so a failing run does not leave it behind.
    """
    path = '__tmp_to_excel_from_excel_sheets__.' + ext

    try:
        self.frame['A'][:5] = nan

        # Exercise the writer options; the output of these is not checked.
        self.frame.to_excel(path, 'test1')
        self.frame.to_excel(path, 'test1', cols=['A', 'B'])
        self.frame.to_excel(path, 'test1', header=False)
        self.frame.to_excel(path, 'test1', index=False)

        # Test writing to separate sheets
        writer = ExcelWriter(path)
        self.frame.to_excel(writer, 'test1')
        self.tsframe.to_excel(writer, 'test2')
        writer.save()

        reader = ExcelFile(path)
        recons = reader.parse('test1', index_col=0)
        tm.assert_frame_equal(self.frame, recons)
        recons = reader.parse('test2', index_col=0)
        tm.assert_frame_equal(self.tsframe, recons)

        np.testing.assert_equal(2, len(reader.sheet_names))
        np.testing.assert_equal('test1', reader.sheet_names[0])
        np.testing.assert_equal('test2', reader.sheet_names[1])
    finally:
        # The file may not exist if the very first write failed.
        if os.path.exists(path):
            os.remove(path)
def _check_extension(self, ext):
    """Write ``self.frame`` to Excel with several options and read it back.

    ``ext`` is the extension appended to the temporary file name, which is
    passed to ``ensure_clean`` for the duration of the check.
    """
    tmp_name = '__tmp_to_excel_from_excel__.' + ext

    with ensure_clean(tmp_name) as path:
        self.frame['A'][:5] = nan

        # Writes whose output is never inspected.
        self.frame.to_excel(path, 'test1')
        for options in ({'cols': ['A', 'B']},
                        {'header': False},
                        {'index': False}):
            self.frame.to_excel(path, 'test1', **options)

        # test roundtrip
        self.frame.to_excel(path, 'test1')
        roundtrip = ExcelFile(path).parse('test1', index_col=0)
        tm.assert_frame_equal(self.frame, roundtrip)

        # Written without an index: put the index back before comparing.
        self.frame.to_excel(path, 'test1', index=False)
        roundtrip = ExcelFile(path).parse('test1', index_col=None)
        roundtrip.index = self.frame.index
        tm.assert_frame_equal(self.frame, roundtrip)

        # Missing values written as 'NA' and declared as such on read.
        self.frame.to_excel(path, 'test1', na_rep='NA')
        roundtrip = ExcelFile(path).parse('test1', index_col=0,
                                          na_values=['NA'])
        tm.assert_frame_equal(self.frame, roundtrip)
def _check_extension_sheets(self, ext):
    """Round-trip two frames through separate sheets of one workbook.

    ``ext`` is the file extension (without the dot) appended to the
    temporary file name.  The temporary file is removed even when a write
    or an assertion fails, so a failing run does not leave it behind.
    """
    path = "__tmp_to_excel_from_excel_sheets__." + ext

    try:
        self.frame["A"][:5] = nan

        # Exercise the writer options; the output of these is not checked.
        self.frame.to_excel(path, "test1")
        self.frame.to_excel(path, "test1", cols=["A", "B"])
        self.frame.to_excel(path, "test1", header=False)
        self.frame.to_excel(path, "test1", index=False)

        # Test writing to separate sheets
        writer = ExcelWriter(path)
        self.frame.to_excel(writer, "test1")
        self.tsframe.to_excel(writer, "test2")
        writer.save()

        reader = ExcelFile(path)
        recons = reader.parse("test1", index_col=0)
        tm.assert_frame_equal(self.frame, recons)
        recons = reader.parse("test2", index_col=0)
        tm.assert_frame_equal(self.tsframe, recons)

        np.testing.assert_equal(2, len(reader.sheet_names))
        np.testing.assert_equal("test1", reader.sheet_names[0])
        np.testing.assert_equal("test2", reader.sheet_names[1])
    finally:
        # The file may not exist if the very first write failed.
        if os.path.exists(path):
            os.remove(path)
def _check_excel_multiindex_dates(self, ext):
    """Round-trip ``self.tsframe`` with a (date, integer) MultiIndex.

    ``ext`` is the file extension used for the temporary workbook.  The
    frame's original index is restored and the temporary file removed in a
    ``finally`` clause, so a failing assertion cannot leak either into
    later tests.
    """
    path = '__tmp_to_excel_multiindex_dates__' + ext + '__.' + ext

    # try multiindex with dates
    tsframe = self.tsframe
    old_index = tsframe.index
    new_index = [old_index, np.arange(len(old_index))]

    try:
        tsframe.index = MultiIndex.from_arrays(new_index)

        tsframe.to_excel(path, 'test1', index_label=['time', 'foo'])
        reader = ExcelFile(path)
        recons = reader.parse('test1', index_col=[0, 1])
        tm.assert_frame_equal(tsframe, recons, check_names=False)
        self.assertEqual(recons.index.names, ['time', 'foo'])

        # infer index
        tsframe.to_excel(path, 'test1')
        reader = ExcelFile(path)
        recons = reader.parse('test1')
        tm.assert_frame_equal(tsframe, recons)
    finally:
        self.tsframe.index = old_index  # needed if setUP becomes classmethod
        # The file may not exist if the first write failed.
        if os.path.exists(path):
            os.remove(path)
def _check_extension(self, ext):
    """Write ``self.frame`` to Excel with several options and read it back.

    ``ext`` is the extension appended to the temporary file name.  The
    temporary file is removed even when a write or an assertion fails.
    """
    path = "__tmp_to_excel_from_excel__." + ext

    try:
        self.frame["A"][:5] = nan

        # Exercise the writer options; the output of these is not checked.
        self.frame.to_excel(path, "test1")
        self.frame.to_excel(path, "test1", cols=["A", "B"])
        self.frame.to_excel(path, "test1", header=False)
        self.frame.to_excel(path, "test1", index=False)

        # test roundtrip
        self.frame.to_excel(path, "test1")
        reader = ExcelFile(path)
        recons = reader.parse("test1", index_col=0)
        tm.assert_frame_equal(self.frame, recons)

        # Written without an index: put the index back before comparing.
        self.frame.to_excel(path, "test1", index=False)
        reader = ExcelFile(path)
        recons = reader.parse("test1", index_col=None)
        recons.index = self.frame.index
        tm.assert_frame_equal(self.frame, recons)

        self.frame.to_excel(path, "test1", na_rep="NA")
        reader = ExcelFile(path)
        recons = reader.parse("test1", index_col=0, na_values=["NA"])
        tm.assert_frame_equal(self.frame, recons)
    finally:
        # The file may not exist if the very first write failed.
        if os.path.exists(path):
            os.remove(path)
def test_excel_table(self):
    """Both sheets of test.xls must equal the frame read from ``self.csv1``."""
    workbook = ExcelFile(os.path.join(self.dirpath, 'test.xls'))

    sheet1 = workbook.parse('Sheet1')
    expected = read_csv(self.csv1)
    # Sheet2 is parsed with row 1 skipped.
    sheet2 = workbook.parse('Sheet2', skiprows=[1])

    for parsed in (sheet1, sheet2):
        assert_frame_equal(parsed, expected)
def test_xlsx_table(self):
    """Both sheets of test.xlsx must equal the frame read from ``self.csv1``."""
    _skip_if_no_openpyxl()

    read_opts = dict(index_col=0, parse_dates=True)
    workbook = ExcelFile(os.path.join(self.dirpath, 'test.xlsx'))

    sheet1 = workbook.parse('Sheet1', **read_opts)
    expected = read_csv(self.csv1, **read_opts)
    # Sheet2 is parsed with row 1 skipped.
    sheet2 = workbook.parse('Sheet2', skiprows=[1], **read_opts)

    assert_frame_equal(sheet1, expected)
    assert_frame_equal(sheet2, expected)
def test_xlsx_table(self):
    """Both sheets of test.xlsx must equal the frame read from ``self.csv1``.

    Skipped when openpyxl cannot be imported.
    """
    try:
        import openpyxl
    except ImportError:
        raise nose.SkipTest('openpyxl not installed, skipping')

    read_opts = dict(index_col=0, parse_dates=True)
    workbook = ExcelFile(os.path.join(self.dirpath, 'test.xlsx'))

    sheet1 = workbook.parse('Sheet1', **read_opts)
    expected = read_csv(self.csv1, **read_opts)
    # Sheet2 is parsed with row 1 skipped.
    sheet2 = workbook.parse('Sheet2', skiprows=[1], **read_opts)

    assert_frame_equal(sheet1, expected)
    assert_frame_equal(sheet2, expected)
def test_excel_table(self):
    """Both sheets of test.xls must equal the frame read from ``self.csv1``.

    Skipped when xlrd cannot be imported.
    """
    try:
        import xlrd
    except ImportError:
        raise nose.SkipTest("xlrd not installed, skipping")

    read_opts = dict(index_col=0, parse_dates=True)
    workbook = ExcelFile(os.path.join(self.dirpath, "test.xls"))

    sheet1 = workbook.parse("Sheet1", **read_opts)
    expected = read_csv(self.csv1, **read_opts)
    # Sheet2 is parsed with row 1 skipped.
    sheet2 = workbook.parse("Sheet2", skiprows=[1], **read_opts)

    assert_frame_equal(sheet1, expected)
    assert_frame_equal(sheet2, expected)
def test_excel_read_buffer(self):
    """``ExcelFile`` accepts an open binary file object for .xls and .xlsx.

    Only checks that parsing does not raise.  Each file handle is closed by
    a ``with`` block once its sheet has been parsed; parsing stays inside
    the block so the handle is still open if the reader needs it.
    """
    _skip_if_no_xlrd()
    _skip_if_no_openpyxl()

    pth = os.path.join(self.dirpath, 'test.xls')
    with open(pth, 'rb') as f:
        xls = ExcelFile(f)
        # it works
        xls.parse('Sheet1', index_col=0, parse_dates=True)

    pth = os.path.join(self.dirpath, 'test.xlsx')
    with open(pth, 'rb') as f:
        xl = ExcelFile(f)
        xl.parse('Sheet1', index_col=0, parse_dates=True)
def _check_extension_indexlabels(self, ext):
    """Check that ``index_label`` given to ``to_excel`` survives a round trip.

    ``ext`` is the extension of the first temporary workbook; the second
    part always writes an ``.xls`` file with a random name.
    """
    first_name = '__tmp_to_excel_from_excel_indexlabels__.' + ext

    with ensure_clean(first_name) as path:
        self.frame['A'][:5] = nan

        # Writes whose output is never inspected.
        self.frame.to_excel(path, 'test1')
        self.frame.to_excel(path, 'test1', cols=['A', 'B'])
        self.frame.to_excel(path, 'test1', header=False)
        self.frame.to_excel(path, 'test1', index=False)

        # test index_label
        # A one-item list, an over-long list and a plain string are all
        # expected to come back as the index name 'test'.
        for label in (['test'], ['test', 'dummy', 'dummy2'], 'test'):
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label=label)
            parsed = ExcelFile(path).parse('test1', index_col=0)
            recons = parsed.astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

    # test index_labels in same row as column names
    second_name = '%s.xls' % tm.rands(10)

    with ensure_clean(second_name) as path:
        self.frame.to_excel(path, 'test1',
                            cols=['A', 'B', 'C', 'D'], index=False)
        # take 'A' and 'B' as indexes (they are in same row as cols 'C',
        # 'D')
        expected = self.frame.copy().set_index(['A', 'B'])

        recons = ExcelFile(path).parse('test1', index_col=[0, 1])
        tm.assert_frame_equal(expected, recons)
def test_parse_cols_int(self):
    """``parse_cols=3`` must yield the index plus columns A, B and C.

    Runs against both test.xls and test.xlsx.
    """
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    parse_opts = dict(index_col=0, parse_dates=True, parse_cols=3)

    for tail in ["", "x"]:
        workbook = ExcelFile(os.path.join(self.dirpath, "test.xls%s" % tail))

        sheet1 = workbook.parse("Sheet1", **parse_opts)
        expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)
        expected = expected.reindex(columns=["A", "B", "C"])
        sheet2 = workbook.parse("Sheet2", skiprows=[1], **parse_opts)

        tm.assert_frame_equal(sheet1, expected)
        tm.assert_frame_equal(sheet2, expected)
def test_excel_table(self):
    """Parse test.xls and compare with the CSV; also check footer skipping.

    ``skipfooter=1`` and ``skip_footer=1`` must give the same result, equal
    to the full Sheet1 frame without its last row.
    """
    _skip_if_no_xlrd()

    base_opts = dict(index_col=0, parse_dates=True)
    workbook = ExcelFile(os.path.join(self.dirpath, "test.xls"))

    sheet1 = workbook.parse("Sheet1", **base_opts)
    expected = self.read_csv(self.csv1, **base_opts)
    sheet2 = workbook.parse("Sheet2", skiprows=[1], **base_opts)
    for parsed in (sheet1, sheet2):
        tm.assert_frame_equal(parsed, expected, check_names=False)

    trimmed = workbook.parse("Sheet1", skipfooter=1, **base_opts)
    trimmed_alias = workbook.parse("Sheet1", skip_footer=1, **base_opts)
    tm.assert_frame_equal(trimmed, sheet1.ix[:-1])
    tm.assert_frame_equal(trimmed, trimmed_alias)
def test_xlsx_table(self):
    """Parse test.xlsx and compare with the CSV; also check footer skipping.

    ``skipfooter=1`` and ``skip_footer=1`` must give the same result, equal
    to the full Sheet1 frame without its last row.
    """
    _skip_if_no_openpyxl()

    base_opts = dict(index_col=0, parse_dates=True)
    workbook = ExcelFile(os.path.join(self.dirpath, "test.xlsx"))

    sheet1 = workbook.parse("Sheet1", **base_opts)
    expected = self.read_csv(self.csv1, **base_opts)
    sheet2 = workbook.parse("Sheet2", skiprows=[1], **base_opts)
    for parsed in (sheet1, sheet2):
        tm.assert_frame_equal(parsed, expected)

    trimmed = workbook.parse("Sheet1", skipfooter=1, **base_opts)
    trimmed_alias = workbook.parse("Sheet1", skip_footer=1, **base_opts)
    tm.assert_frame_equal(trimmed, sheet1.ix[:-1])
    tm.assert_frame_equal(trimmed, trimmed_alias)
def _check_extension_indexlabels(self, ext):
    """Check that ``index_label`` given to ``to_excel`` survives a round trip.

    ``ext`` is the extension of the first temporary workbook; the second
    part always writes an ``.xls`` file with a random name.  Each temporary
    file is removed in a ``finally`` clause.
    """
    first_path = "__tmp_to_excel_from_excel_indexlabels__." + ext

    try:
        self.frame["A"][:5] = nan

        # Writes whose output is never inspected.
        self.frame.to_excel(first_path, "test1")
        self.frame.to_excel(first_path, "test1", cols=["A", "B"])
        self.frame.to_excel(first_path, "test1", header=False)
        self.frame.to_excel(first_path, "test1", index=False)

        # test index_label
        # A one-item list, an over-long list and a plain string are all
        # expected to come back as the index name "test".
        for label in (["test"], ["test", "dummy", "dummy2"], "test"):
            frame = DataFrame(np.random.randn(10, 2)) >= 0
            frame.to_excel(first_path, "test1", index_label=label)
            parsed = ExcelFile(first_path).parse("test1", index_col=0)
            recons = parsed.astype(np.int64)
            frame.index.names = ["test"]
            self.assertEqual(frame.index.names, recons.index.names)
    finally:
        os.remove(first_path)

    # test index_labels in same row as column names
    second_path = "%s.xls" % tm.rands(10)

    try:
        self.frame.to_excel(second_path, "test1",
                            cols=["A", "B", "C", "D"], index=False)
        # take 'A' and 'B' as indexes (they are in same row as cols 'C',
        # 'D')
        expected = self.frame.copy().set_index(["A", "B"])

        recons = ExcelFile(second_path).parse("test1", index_col=[0, 1])
        tm.assert_frame_equal(expected, recons)
    finally:
        os.remove(second_path)
def _strip_float_suffix(value):
    """Return ``str(value)`` without surrounding whitespace or a trailing '.0'."""
    text = str(value).strip()
    # Only a trailing '.0' is dropped: replacing '.0' anywhere in the text
    # would corrupt labels such as '10.05' (which used to become '105').
    if text.endswith('.0'):
        return text[:-2]
    return text


def getExcelChunck(file, ws, drange, rid=-1, cid=-1):
    """Return the part of worksheet ``ws`` addressed by ``drange``.

    Parameters
    ----------
    file : path or buffer handed to ``ExcelFile``.
    ws : name of the worksheet to parse.
    drange : range expression without separator, e.g. ``'B4H4'`` (column
        letters and row number of the first cell, then of the last cell).
    rid : spreadsheet row number whose cells become the column labels;
        ignored when negative (the default).
    cid : column reference passed to ``sord`` whose cells become the index
        labels; ignored when negative (the default).

    Returns
    -------
    The selected slice of the parsed frame.  Labels taken from ``rid`` /
    ``cid`` are converted to stripped strings with a trailing '.0' removed.
    """
    xls = ExcelFile(file)
    df = xls.parse(ws)

    # get the range from expression
    # B4H4
    m = re.search('([A-Z]+)([0-9]+)([A-Z]+)([0-9]+)', drange)
    c1 = sord(m.group(1))
    c2 = sord(m.group(3)) + 1
    # NOTE(review): rows are shifted by 2 -- presumably one for the header
    # row consumed by parse() and one for zero-based positions; confirm.
    r1 = int(m.group(2)) - 2
    r2 = int(m.group(4)) - 2

    df2 = df.ix[r1:r2, c1:c2]

    if (rid >= 0):
        rh = int(rid) - 2
        df2.columns = df.ix[rh, c1:c2]
        df2.columns = df2.columns.map(_strip_float_suffix)

    if (cid >= 0):
        ch = sord(cid)
        df2.index = df.ix[r1:r2, ch]
        df2.index = df2.index.map(_strip_float_suffix)

    return (df2)
def test_excel_cell_error_na(self):
    """Sheet1 of test3.xls must parse to a single NaN under column 'Test'."""
    _skip_if_no_xlrd()

    workbook_path = os.path.join(self.dirpath, 'test3.xls')
    actual = ExcelFile(workbook_path).parse('Sheet1')
    wanted = DataFrame([[np.nan]], columns=['Test'])

    tm.assert_frame_equal(actual, wanted)
def test_to_excel_unicode_filename(self):
    """Write and read back a workbook whose file name is non-ASCII.

    The test is skipped when the file system cannot create such a name.
    The file is removed in a ``finally`` clause so that a failing write or
    assertion does not leave it behind.
    """
    _skip_if_no_excelsuite()

    for ext in ['xls', 'xlsx']:
        filename = u'\u0192u.' + ext

        # Probe whether this system accepts the unicode file name at all.
        try:
            f = open(filename, 'wb')
        except UnicodeEncodeError:
            raise nose.SkipTest('no unicode file names on this system')
        else:
            f.close()

        try:
            df = DataFrame([[0.123456, 0.234567, 0.567567],
                            [12.32112, 123123.2, 321321.2]],
                           index=['A', 'B'], columns=['X', 'Y', 'Z'])
            df.to_excel(filename, 'test1', float_format='%.2f')

            reader = ExcelFile(filename)
            rs = reader.parse('test1', index_col=None)
            # Values as they look after '%.2f' formatting.
            xp = DataFrame([[0.12, 0.23, 0.57],
                            [12.32, 123123.20, 321321.20]],
                           index=['A', 'B'], columns=['X', 'Y', 'Z'])
            tm.assert_frame_equal(rs, xp)
        finally:
            if os.path.exists(filename):
                os.remove(filename)
def test_excel_stop_iterator(self):
    """Sheet1 of test2.xls must parse to one row: 'aaaa', 'bbbbb'."""
    _skip_if_no_xlrd()

    workbook_path = os.path.join(self.dirpath, 'test2.xls')
    actual = ExcelFile(workbook_path).parse('Sheet1')
    wanted = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])

    tm.assert_frame_equal(actual, wanted)
def test_parse_cols_int(self):
    """``parse_cols=3`` must yield the index plus columns A, B and C.

    Runs against both test.xls and test.xlsx.
    """
    _skip_if_no_openpyxl()

    parse_opts = dict(index_col=0, parse_dates=True, parse_cols=3)

    for tail in ['', 'x']:
        workbook = ExcelFile(os.path.join(self.dirpath, 'test.xls%s' % tail))

        sheet1 = workbook.parse('Sheet1', **parse_opts)
        expected = read_csv(self.csv1, index_col=0, parse_dates=True)
        expected = expected.reindex(columns=['A', 'B', 'C'])
        sheet2 = workbook.parse('Sheet2', skiprows=[1], **parse_opts)

        assert_frame_equal(sheet1, expected)
        assert_frame_equal(sheet2, expected)
def test_excel_cell_error_na(self):
    """Sheet1 of test3.xls must parse to a single NaN under column "Test"."""
    _skip_if_no_xlrd()

    workbook_path = os.path.join(self.dirpath, "test3.xls")
    actual = ExcelFile(workbook_path).parse("Sheet1")
    wanted = DataFrame([[np.nan]], columns=["Test"])

    tm.assert_frame_equal(actual, wanted)
def test_to_excel_unicode_filename(self):
    """Write and read back a workbook whose file name is non-ASCII.

    The test is skipped when the file system cannot create such a name.
    The file is removed in a ``finally`` clause so that a failing write or
    assertion does not leave it behind.
    """
    _skip_if_no_excelsuite()

    for ext in ["xls", "xlsx"]:
        filename = u"\u0192u." + ext

        # Probe whether this system accepts the unicode file name at all.
        try:
            f = open(filename, "wb")
        except UnicodeEncodeError:
            raise nose.SkipTest("no unicode file names on this system")
        else:
            f.close()

        try:
            df = DataFrame(
                [[0.123456, 0.234567, 0.567567],
                 [12.32112, 123123.2, 321321.2]],
                index=["A", "B"],
                columns=["X", "Y", "Z"],
            )
            df.to_excel(filename, "test1", float_format="%.2f")

            reader = ExcelFile(filename)
            rs = reader.parse("test1", index_col=None)
            # Values as they look after "%.2f" formatting.
            xp = DataFrame(
                [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]],
                index=["A", "B"],
                columns=["X", "Y", "Z"],
            )
            tm.assert_frame_equal(rs, xp)
        finally:
            if os.path.exists(filename):
                os.remove(filename)
def test_excel_stop_iterator(self):
    """Sheet1 of test2.xls must parse to one row: 'aaaa', 'bbbbb'."""
    _skip_if_no_xlrd()

    workbook_path = os.path.join(self.dirpath, 'test2.xls')
    actual = ExcelFile(workbook_path).parse('Sheet1')
    wanted = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])

    assert_frame_equal(actual, wanted)
def test_excel_cell_error_na(self):
    """Sheet1 of test3.xls must parse to a single NaN under column 'Test'."""
    _skip_if_no_xlrd()

    workbook_path = os.path.join(self.dirpath, 'test3.xls')
    actual = ExcelFile(workbook_path).parse('Sheet1')
    wanted = DataFrame([[np.nan]], columns=['Test'])

    assert_frame_equal(actual, wanted)
def test_excel_stop_iterator(self):
    """Sheet1 of test2.xls must parse to one row: "aaaa", "bbbbb"."""
    _skip_if_no_xlrd()

    workbook_path = os.path.join(self.dirpath, "test2.xls")
    actual = ExcelFile(workbook_path).parse("Sheet1")
    wanted = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"])

    tm.assert_frame_equal(actual, wanted)
def test_to_excel_unicode_filename(self):
    """Write and read back a workbook whose file name is non-ASCII.

    The test is skipped when the file system cannot create such a name.
    """
    _skip_if_no_excelsuite()

    labels = dict(index=['A', 'B'], columns=['X', 'Y', 'Z'])

    for ext in ['xls', 'xlsx']:
        filename = '\u0192u.' + ext

        # Probe whether this system accepts the file name at all.
        try:
            probe = open(filename, 'wb')
        except UnicodeEncodeError:
            raise nose.SkipTest('no unicode file names on this system')
        else:
            probe.close()

        df = DataFrame([[0.123456, 0.234567, 0.567567],
                        [12.32112, 123123.2, 321321.2]], **labels)

        with ensure_clean(filename) as filename:
            df.to_excel(filename, 'test1', float_format='%.2f')

            rs = ExcelFile(filename).parse('test1', index_col=None)
            # Values as they look after '%.2f' formatting.
            xp = DataFrame([[0.12, 0.23, 0.57],
                            [12.32, 123123.20, 321321.20]], **labels)
            tm.assert_frame_equal(rs, xp)
def _check_extension_mixed(self, ext):
    """Round-trip ``self.mixed_frame`` through a workbook with extension ``ext``."""
    tmp_name = '__tmp_to_excel_from_excel_mixed__.' + ext

    with ensure_clean(tmp_name) as path:
        self.mixed_frame.to_excel(path, 'test1')
        roundtrip = ExcelFile(path).parse('test1', index_col=0)
        tm.assert_frame_equal(self.mixed_frame, roundtrip)
def test_parse_cols_int(self):
    """``parse_cols=3`` must yield the index plus columns A, B and C.

    Runs against both test.xls and test.xlsx; index names are not compared.
    """
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    parse_opts = dict(index_col=0, parse_dates=True, parse_cols=3)

    for tail in ['', 'x']:
        workbook = ExcelFile(os.path.join(self.dirpath, 'test.xls%s' % tail))

        sheet1 = workbook.parse('Sheet1', **parse_opts)
        expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)
        expected = expected.reindex(columns=['A', 'B', 'C'])
        sheet2 = workbook.parse('Sheet2', skiprows=[1], **parse_opts)

        # TODO add index to xls file
        for parsed in (sheet1, sheet2):
            tm.assert_frame_equal(parsed, expected, check_names=False)
def test_excel_table(self):
    """Parse test.xls and compare with the CSV; also check footer skipping.

    ``skipfooter=1`` and ``skip_footer=1`` must give the same result, equal
    to the full Sheet1 frame without its last row.
    """
    _skip_if_no_xlrd()

    base_opts = dict(index_col=0, parse_dates=True)
    workbook = ExcelFile(os.path.join(self.dirpath, 'test.xls'))

    sheet1 = workbook.parse('Sheet1', **base_opts)
    expected = self.read_csv(self.csv1, **base_opts)
    sheet2 = workbook.parse('Sheet2', skiprows=[1], **base_opts)
    for parsed in (sheet1, sheet2):
        tm.assert_frame_equal(parsed, expected, check_names=False)

    trimmed = workbook.parse('Sheet1', skipfooter=1, **base_opts)
    trimmed_alias = workbook.parse('Sheet1', skip_footer=1, **base_opts)
    tm.assert_frame_equal(trimmed, sheet1.ix[:-1])
    tm.assert_frame_equal(trimmed, trimmed_alias)
def test_xlsx_table(self):
    """Parse test.xlsx and compare with the CSV; also check footer skipping.

    Index names are not compared.  ``skipfooter=1`` and ``skip_footer=1``
    must give the same result, equal to the full Sheet1 frame without its
    last row.
    """
    _skip_if_no_openpyxl()

    base_opts = dict(index_col=0, parse_dates=True)
    workbook = ExcelFile(os.path.join(self.dirpath, 'test.xlsx'))

    sheet1 = workbook.parse('Sheet1', **base_opts)
    expected = self.read_csv(self.csv1, **base_opts)
    sheet2 = workbook.parse('Sheet2', skiprows=[1], **base_opts)

    # TODO add index to xlsx file
    for parsed in (sheet1, sheet2):
        tm.assert_frame_equal(parsed, expected, check_names=False)

    trimmed = workbook.parse('Sheet1', skipfooter=1, **base_opts)
    trimmed_alias = workbook.parse('Sheet1', skip_footer=1, **base_opts)
    tm.assert_frame_equal(trimmed, sheet1.ix[:-1])
    tm.assert_frame_equal(trimmed, trimmed_alias)
def _check_extension_mixed(self, ext):
    """Round-trip ``self.mixed_frame`` through a workbook with extension ``ext``.

    The temporary file is removed even when the write or the assertion
    fails.
    """
    path = '__tmp_to_excel_from_excel_mixed__.' + ext

    try:
        self.mixed_frame.to_excel(path, 'test1')
        reader = ExcelFile(path)
        recons = reader.parse('test1', index_col=0, has_index_names=True)
        tm.assert_frame_equal(self.mixed_frame, recons)
    finally:
        # The file may not exist if the write itself failed.
        if os.path.exists(path):
            os.remove(path)
def test_xlsx_table(self):
    """Parse test.xlsx and compare with the CSV; also check footer skipping.

    ``skipfooter=1`` and ``skip_footer=1`` must give the same result, equal
    to the full Sheet1 frame without its last row.
    """
    _skip_if_no_openpyxl()

    base_opts = dict(index_col=0, parse_dates=True)
    workbook = ExcelFile(os.path.join(self.dirpath, 'test.xlsx'))

    sheet1 = workbook.parse('Sheet1', **base_opts)
    expected = self.read_csv(self.csv1, **base_opts)
    sheet2 = workbook.parse('Sheet2', skiprows=[1], **base_opts)
    for parsed in (sheet1, sheet2):
        tm.assert_frame_equal(parsed, expected)

    trimmed = workbook.parse('Sheet1', skipfooter=1, **base_opts)
    trimmed_alias = workbook.parse('Sheet1', skip_footer=1, **base_opts)
    tm.assert_frame_equal(trimmed, sheet1.ix[:-1])
    tm.assert_frame_equal(trimmed, trimmed_alias)
def _check_extension_mixed(self, ext):
    """Round-trip ``self.mixed_frame`` through a workbook with extension ``ext``.

    The temporary file is removed even when the write or the assertion
    fails.
    """
    path = '__tmp_to_excel_from_excel_mixed__.' + ext

    try:
        self.mixed_frame.to_excel(path, 'test1')
        reader = ExcelFile(path)
        recons = reader.parse('test1', index_col=0, has_index_names=True)
        tm.assert_frame_equal(self.mixed_frame, recons)
    finally:
        # The file may not exist if the write itself failed.
        if os.path.exists(path):
            os.remove(path)
def _check_extension_mixed(self, ext):
    """Round-trip ``self.mixed_frame`` through a workbook with extension ``ext``.

    The temporary file is removed even when the write or the assertion
    fails.
    """
    path = "__tmp_to_excel_from_excel_mixed__." + ext

    try:
        self.mixed_frame.to_excel(path, "test1")
        reader = ExcelFile(path)
        recons = reader.parse("test1", index_col=0)
        tm.assert_frame_equal(self.mixed_frame, recons)
    finally:
        # The file may not exist if the write itself failed.
        if os.path.exists(path):
            os.remove(path)
def test_parse_cols_list(self):
    """``parse_cols=[0, 2, 3]`` must yield the index plus columns B and C.

    Runs against both test.xls and test.xlsx; index names are not compared.
    """
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    parse_opts = dict(index_col=0, parse_dates=True, parse_cols=[0, 2, 3])

    for tail in ['', 'x']:
        workbook = ExcelFile(os.path.join(self.dirpath, 'test.xls%s' % tail))

        sheet1 = workbook.parse('Sheet1', **parse_opts)
        expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)
        expected = expected.reindex(columns=['B', 'C'])
        sheet2 = workbook.parse('Sheet2', skiprows=[1], **parse_opts)

        # TODO add index to xls file
        for parsed in (sheet1, sheet2):
            tm.assert_frame_equal(parsed, expected, check_names=False)
def test_excel_cell_error_na(self):
    """Sheet1 of test3.xls must parse to a single NaN under column 'Test'.

    Skipped when xlrd cannot be imported.
    """
    try:
        import xlrd
    except ImportError:
        raise nose.SkipTest('xlrd not installed, skipping')

    workbook_path = os.path.join(self.dirpath, 'test3.xls')
    actual = ExcelFile(workbook_path).parse('Sheet1')
    wanted = DataFrame([[np.nan]], columns=['Test'])

    assert_frame_equal(actual, wanted)
def test_excel_stop_iterator(self):
    """Sheet1 of test2.xls must parse to one row: "aaaa", "bbbbb".

    Skipped when xlrd cannot be imported.
    """
    try:
        import xlrd
    except ImportError:
        raise nose.SkipTest("xlrd not installed, skipping")

    workbook_path = os.path.join(self.dirpath, "test2.xls")
    actual = ExcelFile(workbook_path).parse("Sheet1")
    wanted = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"])

    assert_frame_equal(actual, wanted)
def _check_extension_tsframe(self, ext):
    """Round-trip the first five rows of a time-series frame through Excel.

    ``ext`` is the extension appended to the temporary file name.
    """
    tmp_name = '__tmp_to_excel_from_excel_tsframe__.' + ext
    expected = tm.makeTimeDataFrame()[:5]

    with ensure_clean(tmp_name) as path:
        expected.to_excel(path, 'test1')
        roundtrip = ExcelFile(path).parse('test1')
        tm.assert_frame_equal(expected, roundtrip)
def test_excel_stop_iterator(self):
    """Sheet1 of test2.xls must parse to one row: 'aaaa', 'bbbbb'.

    Skipped when xlrd cannot be imported.
    """
    try:
        import xlrd
    except ImportError:
        raise nose.SkipTest('xlrd not installed, skipping')

    workbook_path = os.path.join(self.dirpath, 'test2.xls')
    actual = ExcelFile(workbook_path).parse('Sheet1')
    wanted = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])

    assert_frame_equal(actual, wanted)
def test_excel_stop_iterator(self):
    """Sheet1 of test2.xls must parse to one row: 'aaaa', 'bbbbb'.

    Skipped when xlrd cannot be imported.
    """
    try:
        import xlrd
    except ImportError:
        raise nose.SkipTest('xlrd not installed, skipping')

    workbook_path = os.path.join(self.dirpath, 'test2.xls')
    actual = ExcelFile(workbook_path).parse('Sheet1')
    wanted = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])

    assert_frame_equal(actual, wanted)
def test_excel_roundtrip_bool(self):
    """Round-trip a boolean frame through an .xlsx workbook.

    The temporary file is removed even when the write or the assertion
    fails.
    """
    _skip_if_no_openpyxl()

    # Test roundtrip np.bool8, does not seem to work for xls
    path = '__tmp_excel_roundtrip_bool__.xlsx'
    frame = (DataFrame(np.random.randn(10, 2)) >= 0)

    try:
        frame.to_excel(path, 'test1')
        reader = ExcelFile(path)
        recons = reader.parse('test1')
        tm.assert_frame_equal(frame, recons)
    finally:
        # The file may not exist if the write itself failed.
        if os.path.exists(path):
            os.remove(path)
def test_to_excel_periodindex(self):
    """Round-trip a monthly period-indexed frame through .xls and .xlsx.

    The frame read back is converted with ``to_period('M')`` before the
    comparison.  Each temporary file is removed even when the write or the
    assertion fails.
    """
    _skip_if_no_excelsuite()

    for ext in ['xls', 'xlsx']:
        path = '__tmp_to_excel_periodindex__.' + ext
        frame = self.tsframe
        xp = frame.resample('M', kind='period')

        try:
            xp.to_excel(path, 'sht1')

            reader = ExcelFile(path)
            rs = reader.parse('sht1', index_col=0, parse_dates=True)
            tm.assert_frame_equal(xp, rs.to_period('M'))
        finally:
            # The file may not exist if the write itself failed.
            if os.path.exists(path):
                os.remove(path)
def test_excel_roundtrip_datetime(self):
    """Write a frame indexed by ``datetime.date`` objects and read it back.

    The frame read back is compared with the original ``self.tsframe``.
    The temporary file is removed even when the write or the assertion
    fails.
    """
    _skip_if_no_xlrd()
    _skip_if_no_xlwt()

    # datetime.date, not sure what to test here exactly
    path = '__tmp_excel_roundtrip_datetime__.xls'
    tsf = self.tsframe.copy()
    tsf.index = [x.date() for x in self.tsframe.index]

    try:
        tsf.to_excel(path, 'test1')
        reader = ExcelFile(path)
        recons = reader.parse('test1')
        tm.assert_frame_equal(self.tsframe, recons)
    finally:
        # The file may not exist if the write itself failed.
        if os.path.exists(path):
            os.remove(path)
def _check_excel_multiindex_dates(self, ext):
    """Round-trip ``self.tsframe`` with a (date, integer) MultiIndex.

    ``ext`` is the file extension used for the temporary workbook.  The
    frame's original index is restored and the temporary file removed in a
    ``finally`` clause, so a failing assertion cannot leak either into
    later tests.
    """
    path = '__tmp_to_excel_multiindex_dates__' + ext + '__.' + ext

    # try multiindex with dates
    tsframe = self.tsframe
    old_index = tsframe.index
    new_index = [old_index, np.arange(len(old_index))]

    try:
        tsframe.index = MultiIndex.from_arrays(new_index)

        tsframe.to_excel(path, 'test1', index_label=['time', 'foo'])
        reader = ExcelFile(path)
        recons = reader.parse('test1', index_col=[0, 1])
        tm.assert_frame_equal(tsframe, recons)

        # infer index
        tsframe.to_excel(path, 'test1')
        reader = ExcelFile(path)
        recons = reader.parse('test1')
        tm.assert_frame_equal(tsframe, recons)
    finally:
        self.tsframe.index = old_index  # needed if setUP becomes classmethod
        # The file may not exist if the first write failed.
        if os.path.exists(path):
            os.remove(path)
def test_to_excel(self):
    """Write ``self.panel`` to an .xlsx workbook and compare every item.

    Each item of the panel is read back from the sheet named ``str(item)``.
    The test is skipped unless xlwt, xlrd and openpyxl are all importable.
    The temporary workbook is removed afterwards, also on failure.
    """
    # Imported locally: only needed for the cleanup of the temporary file.
    import os

    try:
        import xlwt
        import xlrd
        import openpyxl
    except ImportError:
        raise nose.SkipTest

    path = '__tmp__.xlsx'
    try:
        self.panel.to_excel(path)
        reader = ExcelFile(path)
        for item, df in self.panel.iteritems():
            recdf = reader.parse(str(item), index_col=0)
            assert_frame_equal(df, recdf)
    finally:
        # The file may not exist if the write itself failed.
        if os.path.exists(path):
            os.remove(path)
def _check_extension_sheets(self, ext):
    """Round-trip two frames through separate sheets of one workbook.

    ``ext`` is the file extension appended to the temporary file name,
    which is passed to ``ensure_clean`` for the duration of the check.
    """
    tmp_name = '__tmp_to_excel_from_excel_sheets__.' + ext

    with ensure_clean(tmp_name) as path:
        self.frame['A'][:5] = nan

        # Writes whose output is never inspected.
        self.frame.to_excel(path, 'test1')
        for options in ({'cols': ['A', 'B']},
                        {'header': False},
                        {'index': False}):
            self.frame.to_excel(path, 'test1', **options)

        # Test writing to separate sheets
        writer = ExcelWriter(path)
        self.frame.to_excel(writer, 'test1')
        self.tsframe.to_excel(writer, 'test2')
        writer.save()

        reader = ExcelFile(path)
        first_sheet = reader.parse('test1', index_col=0)
        tm.assert_frame_equal(self.frame, first_sheet)
        second_sheet = reader.parse('test2', index_col=0)
        tm.assert_frame_equal(self.tsframe, second_sheet)

        # Exactly the two sheets, in the order they were written.
        np.testing.assert_equal(2, len(reader.sheet_names))
        np.testing.assert_equal('test1', reader.sheet_names[0])
        np.testing.assert_equal('test2', reader.sheet_names[1])
def test_parse_cols_str(self):
    """String forms of ``parse_cols`` must select the expected columns.

    'A:D' keeps columns A, B, C next to the index; 'A,C,D' and 'A,C:D' keep
    only B and C.  Runs against both test.xls and test.xlsx.
    """
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    # (parse_cols expression, data columns expected in the result)
    cases = [
        ('A:D', ['A', 'B', 'C']),
        ('A,C,D', ['B', 'C']),
        ('A,C:D', ['B', 'C']),
    ]

    for tail in ['', 'x']:
        workbook = ExcelFile(os.path.join(self.dirpath, 'test.xls%s' % tail))

        for spec, kept in cases:
            sheet1 = workbook.parse('Sheet1', index_col=0, parse_dates=True,
                                    parse_cols=spec)
            expected = read_csv(self.csv1, index_col=0, parse_dates=True)
            expected = expected.reindex(columns=kept)
            sheet2 = workbook.parse('Sheet2', skiprows=[1], index_col=0,
                                    parse_dates=True, parse_cols=spec)
            tm.assert_frame_equal(sheet1, expected)
            tm.assert_frame_equal(sheet2, expected)
def test_to_excel_float_format(self):
    """``float_format='%.2f'`` must round the values written to Excel.

    Runs for .xls and .xlsx.  Each temporary file is removed even when the
    write or the assertion fails.
    """
    _skip_if_no_excelsuite()

    for ext in ['xls', 'xlsx']:
        filename = '__tmp_to_excel_float_format__.' + ext
        df = DataFrame([[0.123456, 0.234567, 0.567567],
                        [12.32112, 123123.2, 321321.2]],
                       index=['A', 'B'], columns=['X', 'Y', 'Z'])

        try:
            df.to_excel(filename, 'test1', float_format='%.2f')

            reader = ExcelFile(filename)
            rs = reader.parse('test1', index_col=None)
            # Values as they look after '%.2f' formatting.
            xp = DataFrame([[0.12, 0.23, 0.57],
                            [12.32, 123123.20, 321321.20]],
                           index=['A', 'B'], columns=['X', 'Y', 'Z'])
            tm.assert_frame_equal(rs, xp)
        finally:
            # The file may not exist if the write itself failed.
            if os.path.exists(filename):
                os.remove(filename)
def _check_extension_int64(self, ext):
    """Round-trip an ``np.int64`` frame through a workbook with extension ``ext``.

    The frame read back is cast to ``np.int64`` and compared without a
    dtype check.
    """
    tmp_name = '__tmp_to_excel_from_excel_int64__.' + ext

    with ensure_clean(tmp_name) as path:
        self.frame['A'][:5] = nan

        # Writes whose output is never inspected.
        self.frame.to_excel(path, 'test1')
        for options in ({'cols': ['A', 'B']},
                        {'header': False},
                        {'index': False}):
            self.frame.to_excel(path, 'test1', **options)

        # Test np.int64, values read come back as float
        values = np.random.randint(-10, 10, size=(10, 2))
        frame = DataFrame(values, dtype=np.int64)
        frame.to_excel(path, 'test1')

        recons = ExcelFile(path).parse('test1').astype(np.int64)
        tm.assert_frame_equal(frame, recons, check_dtype=False)
def test_excel_roundtrip_indexname(self):
    """The index name 'foo' must survive a round trip through an .xls file."""
    _skip_if_no_xlrd()
    _skip_if_no_xlwt()

    tmp_name = '%s.xls' % tm.rands(10)

    expected = DataFrame(np.random.randn(10, 4))
    expected.index.name = 'foo'

    with ensure_clean(tmp_name) as path:
        expected.to_excel(path)

        workbook = ExcelFile(path)
        # Read whichever sheet name to_excel used by default.
        first_sheet = workbook.sheet_names[0]
        result = workbook.parse(first_sheet, index_col=0)

        tm.assert_frame_equal(result, expected)
        self.assertEqual(result.index.name, 'foo')
def _check_extension_bool(self, ext):
    """Round-trip a boolean frame through a workbook with extension ``ext``.

    The frame read back is cast to ``np.bool8`` before the comparison.
    """
    tmp_name = '__tmp_to_excel_from_excel_bool__.' + ext

    with ensure_clean(tmp_name) as path:
        self.frame['A'][:5] = nan

        # Writes whose output is never inspected.
        self.frame.to_excel(path, 'test1')
        for options in ({'cols': ['A', 'B']},
                        {'header': False},
                        {'index': False}):
            self.frame.to_excel(path, 'test1', **options)

        # Test reading/writing np.bool8, roundtrip only works for xlsx
        frame = (DataFrame(np.random.randn(10, 2)) >= 0)
        frame.to_excel(path, 'test1')

        recons = ExcelFile(path).parse('test1').astype(np.bool8)
        tm.assert_frame_equal(frame, recons)
def test_to_excel(self):
    """Write ``self.panel`` to .xls and .xlsx workbooks and compare every item.

    Each item of the panel is read back from the sheet named ``str(item)``.
    The test is skipped when any of the required modules is missing.  Each
    temporary workbook is removed even when the write or an assertion
    fails.
    """
    try:
        import os
        import xlwt
        import xlrd
        import openpyxl
        from pandas.io.parsers import ExcelFile
    except ImportError:
        raise nose.SkipTest

    for ext in ['xls', 'xlsx']:
        path = '__tmp__.' + ext
        try:
            self.panel.to_excel(path)
            reader = ExcelFile(path)
            for item, df in self.panel.iteritems():
                recdf = reader.parse(str(item), index_col=0)
                assert_frame_equal(df, recdf)
        finally:
            # The file may not exist if the write itself failed.
            if os.path.exists(path):
                os.remove(path)
def _check_extension_colaliases(self, ext):
    """Column aliases passed as ``header`` must replace the column names.

    ``ext`` is the extension appended to the temporary file name.
    """
    tmp_name = '__tmp_to_excel_from_excel_aliases__.' + ext

    with ensure_clean(tmp_name) as path:
        self.frame['A'][:5] = nan

        # Writes whose output is never inspected.
        self.frame.to_excel(path, 'test1')
        for options in ({'cols': ['A', 'B']},
                        {'header': False},
                        {'index': False}):
            self.frame.to_excel(path, 'test1', **options)

        # column aliases
        col_aliases = Index(['AA', 'X', 'Y', 'Z'])
        self.frame2.to_excel(path, 'test1', header=col_aliases)

        result = ExcelFile(path).parse('test1', index_col=0)
        expected = self.frame2.copy()
        expected.columns = col_aliases
        tm.assert_frame_equal(expected, result)
def _check_excel_multiindex(self, ext):
    """Round-trip ``self.frame`` with a two-level integer MultiIndex.

    ``ext`` is the file extension used for the temporary workbook.  The
    frame's original index is restored and the temporary file removed in a
    ``finally`` clause, so a failing assertion cannot leak either into
    later tests.
    """
    path = '__tmp_to_excel_multiindex__' + ext + '__.' + ext

    frame = self.frame
    old_index = frame.index
    arrays = np.arange(len(old_index) * 2).reshape(2, -1)
    new_index = MultiIndex.from_arrays(arrays,
                                       names=['first', 'second'])

    try:
        frame.index = new_index

        # Writes whose output is never inspected.
        frame.to_excel(path, 'test1', header=False)
        frame.to_excel(path, 'test1', cols=['A', 'B'])

        # round trip
        frame.to_excel(path, 'test1')
        reader = ExcelFile(path)
        df = reader.parse('test1', index_col=[0, 1], parse_dates=False)
        tm.assert_frame_equal(frame, df)
        self.assertEqual(frame.index.names, df.index.names)
    finally:
        self.frame.index = old_index  # needed if setUP becomes a classmethod
        # The file may not exist if the first write failed.
        if os.path.exists(path):
            os.remove(path)
def test_excel_roundtrip_indexname(self):
    """The index name 'foo' must survive a round trip through an .xls file.

    Removal of the temporary file is best-effort (errors are ignored) and
    now also happens when the write or an assertion fails.
    """
    _skip_if_no_xlrd()
    _skip_if_no_xlwt()

    path = '%s.xls' % tm.rands(10)

    df = DataFrame(np.random.randn(10, 4))
    df.index.name = 'foo'

    try:
        df.to_excel(path)

        xf = ExcelFile(path)
        result = xf.parse(xf.sheet_names[0], index_col=0)

        tm.assert_frame_equal(result, df)
        self.assertEqual(result.index.name, 'foo')
    finally:
        try:
            os.remove(path)
        except os.error:
            # Deliberately ignored: cleanup must not fail the test.
            pass
""" Origin: QE by John Stachurski and Thomas J. Sargent Filename: wb_download.py Authors: John Stachurski, Tomohito Okabe LastModified: 29/08/2013 Dowloads data from the World Bank site on GDP per capita and plots result for a subset of countries. """ import pandas as pd import matplotlib.pyplot as plt from pandas.io.parsers import ExcelFile import urllib # == Get data and read into file gd.xls == # wb_data_file_dir = "http://api.worldbank.org/datafiles/" file_name = "GC.DOD.TOTL.GD.ZS_Indicator_MetaData_en_EXCEL.xls" url = wb_data_file_dir + file_name urllib.urlretrieve(url, "gd.xls") # == Parse data into a DataFrame == # gov_debt_xls = ExcelFile('gd.xls') govt_debt = gov_debt_xls.parse('Sheet1', index_col=1, na_values=['NA']) # == Take desired values and plot == # govt_debt = govt_debt.transpose() govt_debt = govt_debt[['AUS', 'DEU', 'FRA', 'USA']] govt_debt = govt_debt[36:] govt_debt.plot(lw=2) plt.show()
'''
from BoilerPlate import *


# Get unique labels the way sklearn.metrics does.
def unique_labels(*lists_of_labels):
    """Extract an ordered array of unique labels"""
    # Items that have a ``ravel`` method are flattened with it first;
    # anything else is iterated as-is.
    labels = set().union(*(l.ravel() if hasattr(l, "ravel") else l for l in lists_of_labels))
    return np.asarray(sorted(labels))


# Import data
# NOTE(review): the CSV result is immediately replaced by the Excel sheet
# parsed below; '~' looks like a placeholder path -- confirm.
rawdata = pd.read_csv('~')
from pandas.io.parsers import ExcelFile
xls = ExcelFile('~')
rawdata = xls.parse('~', index_col=None, na_values=['NA'])

# Drop rows whose 'Actual text' value is missing.
rawdata = rawdata.dropna(subset=['Actual text'])

# Extract features
text = rawdata['Actual text']

# Make large word features.
# Uses sklearn's CountVectorizer (bag of words).
# This is fitting the vocabulary.
from sklearn.feature_extraction.text import CountVectorizer
vectoriser_training = CountVectorizer(min_df=1,stop_words='english',strip_accents='unicode')
t = time.time()
features = vectoriser_training.fit_transform(text)
print "training text to word vector took", time.time()-t, "seconds"