def test_astype_str(self):
    """Casting to str/np.str_ matches mapping text_type over the values (GH4405)."""
    digits = string.digits
    plain = Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)])
    mixed = Series([digits * 10, tm.rands(63), tm.rands(64), nan, 1.0])

    for target in (compat.text_type, np.str_):
        for series in (plain, mixed):
            result = series.astype(target)
            expected = series.map(compat.text_type)
            assert_series_equal(result, expected)

    # GH9757
    # Test str and unicode on python 2.x and just str on python 3.x
    for tt in set([str, compat.text_type]):
        converted = Series([Timestamp('2010-01-04 00:00:00')]).astype(tt)
        assert_series_equal(converted, Series([tt('2010-01-04')]))

        converted = Series([Timestamp('2010-01-04 00:00:00',
                                      tz='US/Eastern')]).astype(tt)
        assert_series_equal(converted,
                            Series([tt('2010-01-04 00:00:00-05:00')]))

        converted = Series([Timedelta(1, unit='d')]).astype(tt)
        assert_series_equal(converted,
                            Series([tt('1 days 00:00:00.000000000')]))
def test_astype_unicode(self):
    """astype('unicode') is equivalent to mapping ``str`` (see gh-7758)."""
    digits = string.digits
    test_series = [
        Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
        Series(['データーサイエンス、お前はもう死んでいる']),
    ]
    # Bytes only round-trip when the interpreter default encoding is utf-8.
    if sys.getdefaultencoding() == "utf-8":
        test_series.append(Series(['野菜食べないとやばい'.encode("utf-8")]))

    for s in test_series:
        res = s.astype("unicode")
        expec = s.map(str)
        tm.assert_series_equal(res, expec)

    # NOTE(review): the original kept a `former_encoding = None` variable and
    # a "restore the former encoding" branch guarded on it, but the variable
    # was never reassigned, so that branch was unreachable dead code (and the
    # `reload(sys)` / `sys.setdefaultencoding` calls it contained do not exist
    # on Python 3).  Removed.
def test_astype_unicode(self):
    """astype('unicode') matches mapping compat.text_type (GH7758)."""
    # a bit of magic is required to set default encoding to utf-8
    digits = string.digits
    candidates = [
        Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
        Series([u('データーサイエンス、お前はもう死んでいる')]),
    ]

    former_encoding = None
    if not compat.PY3:
        # in python 2 we can force the default encoding for this test
        former_encoding = sys.getdefaultencoding()
        reload(sys)  # noqa
        sys.setdefaultencoding("utf-8")
    if sys.getdefaultencoding() == "utf-8":
        candidates.append(Series([u('野菜食べないとやばい').encode("utf-8")]))

    for series in candidates:
        result = series.astype("unicode")
        expected = series.map(compat.text_type)
        assert_series_equal(result, expected)

    # restore the former encoding
    if former_encoding is not None and former_encoding != "utf-8":
        reload(sys)  # noqa
        sys.setdefaultencoding(former_encoding)
def test_repr_truncation(self):
    """Cells longer than display.max_colwidth are truncated with '...'."""
    max_len = 20
    with option_context("display.max_colwidth", max_len):
        df = DataFrame({
            "A": np.random.randn(10),
            "B": [tm.rands(np.random.randint(max_len - 1, max_len + 1))
                  for _ in range(10)],
        })
        rendered = repr(df)
        rendered = rendered[rendered.find("\n") + 1:]  # drop the header row
        _strlen = fmt._strlen_func()
        for line, value in zip(rendered.split("\n"), df["B"]):
            if _strlen(value) + 1 > max_len:
                self.assert_("..." in line)
            else:
                self.assert_("..." not in line)

    with option_context("display.max_colwidth", 999999):
        self.assert_("..." not in repr(df))

    with option_context("display.max_colwidth", max_len + 2):
        self.assert_("..." not in repr(df))
def test_timestamp_compare(self):
    """Timestamps compare correctly on either side of a DataFrame (GH4982)."""
    df = DataFrame({
        "dates1": date_range("20010101", periods=10),
        "dates2": date_range("20010102", periods=10),
        "intcol": np.random.randint(1000000000, size=10),
        "floatcol": np.random.randn(10),
        "stringcol": list(tm.rands(10)),
    })
    df.loc[np.random.rand(len(df)) > 0.5, "dates2"] = pd.NaT

    # each operator paired with its mirror for the reversed operand order
    mirrored = {"gt": "lt", "lt": "gt", "ge": "le",
                "le": "ge", "eq": "eq", "ne": "ne"}
    for left, right in mirrored.items():
        left_f = getattr(operator, left)
        right_f = getattr(operator, right)

        # no nats
        expected = left_f(df, Timestamp("20010109"))
        result = right_f(Timestamp("20010109"), df)
        assert_frame_equal(result, expected)

        # nats
        expected = left_f(df, Timestamp("nat"))
        result = right_f(Timestamp("nat"), df)
        assert_frame_equal(result, expected)
def test_timestamp_compare(self):
    """Timestamps compare on both the right AND left hand side (GH4982)."""
    df = DataFrame({'dates1': date_range('20010101', periods=10),
                    'dates2': date_range('20010102', periods=10),
                    'intcol': np.random.randint(1000000000, size=10),
                    'floatcol': np.random.randn(10),
                    'stringcol': list(tm.rands(10))})
    df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT

    # operator name -> its mirror image for swapped operands
    pairs = [('gt', 'lt'), ('lt', 'gt'), ('ge', 'le'),
             ('le', 'ge'), ('eq', 'eq'), ('ne', 'ne')]
    for name, mirror in pairs:
        forward = getattr(operator, name)
        reverse = getattr(operator, mirror)

        # no nats
        expected = forward(df, Timestamp('20010109'))
        result = reverse(Timestamp('20010109'), df)
        assert_frame_equal(result, expected)

        # nats
        expected = forward(df, Timestamp('nat'))
        result = reverse(Timestamp('nat'), df)
        assert_frame_equal(result, expected)
def generate_dict_strings(string_size, nunique, length, random_order=True):
    """Build a pyarrow DictionaryArray of `length` random strings drawn
    from `nunique` unique values of `string_size` characters each."""
    uniques = np.array([rands(string_size) for _ in range(nunique)],
                       dtype='O')
    if random_order:
        indices = np.random.randint(0, nunique, size=length).astype('i4')
    else:
        # each unique repeated in order; truncates when nunique ∤ length
        indices = np.arange(nunique).astype('i4').repeat(length // nunique)
    return pa.DictionaryArray.from_arrays(indices, uniques)
def generate_strings(string_size, nunique, length, random_order=True):
    """Return an object ndarray of `length` strings drawn from `nunique`
    unique random values of `string_size` characters each."""
    uniques = np.array([rands(string_size) for _ in range(nunique)],
                       dtype='O')
    if not random_order:
        # each unique repeated consecutively; truncates when nunique ∤ length
        return uniques.repeat(length // nunique)
    indices = np.random.randint(0, nunique, size=length).astype('i4')
    return uniques.take(indices)
def test_repr_truncation(self):
    """repr truncates long cells with '...' under display.max_colwidth."""
    max_len = 20
    with option_context("display.max_colwidth", max_len):
        df = DataFrame({
            'A': np.random.randn(10),
            'B': [tm.rands(np.random.randint(max_len - 1, max_len + 1))
                  for _ in range(10)],
        })
        body = repr(df)
        body = body[body.find('\n') + 1:]  # skip the column header line
        _strlen = fmt._strlen_func()
        for line, value in zip(body.split('\n'), df['B']):
            if _strlen(value) + 1 > max_len:
                self.assert_('...' in line)
            else:
                self.assert_('...' not in line)

    with option_context("display.max_colwidth", 999999):
        self.assert_('...' not in repr(df))

    with option_context("display.max_colwidth", max_len + 2):
        self.assert_('...' not in repr(df))
def test_wide_repr_multiindex_cols(self):
    """Wide frames with MultiIndex columns use the wide repr when
    expand_frame_repr is off.

    Fixed: the option restoration at the end now runs in a ``finally``
    block, so a failing assertion no longer leaves the global print
    options modified for later tests.
    """
    set_option('test.interactive', True)
    try:
        col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
        midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
                                              np.array(col(10, 5))])
        mcols = pandas.MultiIndex.from_arrays([np.array(col(20, 3)),
                                               np.array(col(20, 3))])
        df = DataFrame([col(20, 25) for _ in range(10)],
                       index=midx, columns=mcols)
        df.index.names = ['Level 0', 'Level 1']

        set_option('print.expand_frame_repr', False)
        rep_str = repr(df)
        set_option('print.expand_frame_repr', True)
        wide_repr = repr(df)
        self.assert_(rep_str != wide_repr)

        set_option('print.line_width', 120)
        wider_repr = repr(df)
        self.assert_(len(wider_repr) < len(wide_repr))
        self.assert_(len(wide_repr.splitlines()) == 14 * 10 - 1)
    finally:
        # always restore global state, even if an assertion above failed
        reset_option('print.expand_frame_repr')
        set_option('test.interactive', False)
        set_option('print.line_width', 80)
def test_wide_repr_multiindex_cols(self):
    """Wide frames with MultiIndex columns use the wide repr when
    expand_frame_repr is off.

    Fixed: ``reset_option`` now runs in a ``finally`` block so a failing
    assertion cannot leave ``display.expand_frame_repr`` modified.
    """
    with option_context('mode.sim_interactive', True):
        col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
        midx = pandas.MultiIndex.from_arrays(
            [np.array(col(10, 5)), np.array(col(10, 5))])
        mcols = pandas.MultiIndex.from_arrays(
            [np.array(col(20, 3)), np.array(col(20, 3))])
        df = DataFrame([col(20, 25) for _ in range(10)],
                       index=midx, columns=mcols)
        df.index.names = ['Level 0', 'Level 1']
        try:
            set_option('display.expand_frame_repr', False)
            rep_str = repr(df)
            set_option('display.expand_frame_repr', True)
            wide_repr = repr(df)
            self.assert_(rep_str != wide_repr)

            with option_context('display.line_width', 120):
                wider_repr = repr(df)
                self.assert_(len(wider_repr) < len(wide_repr))
                self.assert_(len(wide_repr.splitlines()) == 14 * 10 - 1)
        finally:
            reset_option('display.expand_frame_repr')
def test_timestamp_compare(self):
    """Timestamps compare on both sides of a DataFrame (GH#4982)."""
    df = pd.DataFrame({
        "dates1": pd.date_range("20010101", periods=10),
        "dates2": pd.date_range("20010102", periods=10),
        "intcol": np.random.randint(1000000000, size=10),
        "floatcol": np.random.randn(10),
        "stringcol": list(tm.rands(10)),
    })
    df.loc[np.random.rand(len(df)) > 0.5, "dates2"] = pd.NaT

    # each operator paired with its mirror for the swapped operand order
    mirrored = {"gt": "lt", "lt": "gt", "ge": "le",
                "le": "ge", "eq": "eq", "ne": "ne"}
    for name, mirror in mirrored.items():
        forward = getattr(operator, name)
        reverse = getattr(operator, mirror)

        if name in ["eq", "ne"]:
            # no nats: only (in)equality succeeds against a scalar Timestamp
            expected = forward(df, pd.Timestamp("20010109"))
            result = reverse(pd.Timestamp("20010109"), df)
            tm.assert_frame_equal(result, expected)
        else:
            # ordered comparisons raise TypeError in both operand orders
            with pytest.raises(TypeError):
                forward(df, pd.Timestamp("20010109"))
            with pytest.raises(TypeError):
                reverse(pd.Timestamp("20010109"), df)

        # nats
        expected = forward(df, pd.Timestamp("nat"))
        result = reverse(pd.Timestamp("nat"), df)
        tm.assert_frame_equal(result, expected)
def test_timestamp_compare(self):
    """Comparing Timestamps works on the right AND left hand side (GH4982)."""
    df = DataFrame({'dates1': date_range('20010101', periods=10),
                    'dates2': date_range('20010102', periods=10),
                    'intcol': np.random.randint(1000000000, size=10),
                    'floatcol': np.random.randn(10),
                    'stringcol': list(tm.rands(10))})
    df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT

    # forward operator -> the operator that mirrors it with operands swapped
    for name, mirror in [('gt', 'lt'), ('lt', 'gt'), ('ge', 'le'),
                         ('le', 'ge'), ('eq', 'eq'), ('ne', 'ne')]:
        forward = getattr(operator, name)
        reverse = getattr(operator, mirror)

        # no nats
        expected = forward(df, Timestamp('20010109'))
        result = reverse(Timestamp('20010109'), df)
        assert_frame_equal(result, expected)

        # nats
        expected = forward(df, Timestamp('nat'))
        result = reverse(Timestamp('nat'), df)
        assert_frame_equal(result, expected)
def test_roundtrip_indexlabels(self):
    """index_label values survive an Excel write/read round trip."""
    _skip_if_no_xlrd()
    ext = self.ext
    path = '__tmp_to_excel_from_excel_indexlabels__.' + ext

    with ensure_clean(path) as path:
        self.frame['A'][:5] = nan

        self.frame.to_excel(path, 'test1')
        self.frame.to_excel(path, 'test1', cols=['A', 'B'])
        self.frame.to_excel(path, 'test1', header=False)
        self.frame.to_excel(path, 'test1', index=False)

        # test index_label as a 1-list, an over-long list, and a scalar;
        # each round trip must recover the index name 'test'
        for labels in (['test'], ['test', 'dummy', 'dummy2'], 'test'):
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label=labels)
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)

    # test index_labels in same row as column names
    path = '%s.%s' % (tm.rands(10), ext)
    with ensure_clean(path) as path:
        self.frame.to_excel(path, 'test1',
                            cols=['A', 'B', 'C', 'D'], index=False)
        # take 'A' and 'B' as indexes (they are in same row as cols 'C', 'D')
        df = self.frame.copy()
        df = df.set_index(['A', 'B'])
        reader = ExcelFile(path)
        recons = reader.parse('test1', index_col=[0, 1])
        tm.assert_frame_equal(df, recons)
def setUp(self):
    """Prepare legacy msgpack fixtures plus the expected minimum structure."""
    from pandas.io.tests.generate_legacy_storage_files import (
        create_msgpack_data, create_data)
    self.data = create_msgpack_data()
    self.all_data = create_data()
    # unique temp filename for this test run
    self.path = u('__%s__.msgpack' % tm.rands(10))
    # the keys every generated fixture file must contain, at minimum
    self.minimum_structure = {
        'series': ['float', 'int', 'mixed', 'ts', 'mi', 'dup'],
        'frame': ['float', 'int', 'mixed', 'mi'],
        'panel': ['float'],
        'index': ['int', 'date', 'period'],
        'mi': ['reg2'],
    }
def setup(self):
    """Build a 10-column table of 1M random strings (100k uniques) and its
    pandas counterpart for the benchmark body."""
    nuniques = 100000
    value_size = 50
    length = 1000000
    num_cols = 10

    unique_values = np.array([rands(value_size) for _ in range(nuniques)],
                             dtype='O')
    values = unique_values[np.random.randint(0, nuniques, size=length)]

    self.table = pa.table(
        [pa.array(values) for _ in range(num_cols)],
        names=['f{}'.format(i) for i in range(num_cols)])
    self.table_df = self.table.to_pandas()
def _check_extension_indexlabels(self, ext):
    """Round-trip index_label variants through Excel for extension `ext`."""
    path = '__tmp_to_excel_from_excel_indexlabels__.' + ext
    try:
        self.frame['A'][:5] = nan

        self.frame.to_excel(path, 'test1')
        self.frame.to_excel(path, 'test1', cols=['A', 'B'])
        self.frame.to_excel(path, 'test1', header=False)
        self.frame.to_excel(path, 'test1', index=False)

        # index_label as a 1-list, an over-long list, and a scalar;
        # each round trip must recover the index name 'test'
        for labels in (['test'], ['test', 'dummy', 'dummy2'], 'test'):
            frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            frame.to_excel(path, 'test1', index_label=labels)
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            frame.index.names = ['test']
            self.assertEqual(frame.index.names, recons.index.names)
    finally:
        os.remove(path)

    # test index_labels in same row as column names
    path = '%s.xls' % tm.rands(10)
    try:
        self.frame.to_excel(path, 'test1',
                            cols=['A', 'B', 'C', 'D'], index=False)
        # take 'A' and 'B' as indexes (they are in same row as cols 'C', 'D')
        df = self.frame.copy()
        df = df.set_index(['A', 'B'])
        reader = ExcelFile(path)
        recons = reader.parse('test1', index_col=[0, 1])
        tm.assert_frame_equal(df, recons)
    finally:
        os.remove(path)
def test_roundtrip_indexlabels(self):
    """Excel round trip preserves index names set through index_label."""
    _skip_if_no_xlrd()
    ext = self.ext
    path = '__tmp_to_excel_from_excel_indexlabels__.' + ext

    with ensure_clean(path) as path:
        self.frame['A'][:5] = nan

        self.frame.to_excel(path, 'test1')
        self.frame.to_excel(path, 'test1', cols=['A', 'B'])
        self.frame.to_excel(path, 'test1', header=False)
        self.frame.to_excel(path, 'test1', index=False)

        # three index_label shapes: single list, over-long list, scalar
        for label_arg in (['test'], ['test', 'dummy', 'dummy2'], 'test'):
            bool_frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            bool_frame.to_excel(path, 'test1', index_label=label_arg)
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            bool_frame.index.names = ['test']
            self.assertEqual(bool_frame.index.names, recons.index.names)

    # test index_labels in same row as column names
    path = '%s.%s' % (tm.rands(10), ext)
    with ensure_clean(path) as path:
        self.frame.to_excel(path, 'test1',
                            cols=['A', 'B', 'C', 'D'], index=False)
        # take 'A' and 'B' as indexes (they are in same row as cols 'C', 'D')
        df = self.frame.copy()
        df = df.set_index(['A', 'B'])
        reader = ExcelFile(path)
        recons = reader.parse('test1', index_col=[0, 1])
        tm.assert_frame_equal(df, recons)
def generate_csv_files():
    """Write `nfiles` identical headerless CSV files into the test-data
    csv directory."""
    N = 10
    nfiles = 10
    df = pd.DataFrame({'foo': [tm.rands(10) for _ in xrange(N)],
                       'bar': np.random.randn(N),
                       'baz': np.random.randint(0, 100, size=N)},
                      columns=['foo', 'bar', 'baz'])

    csv_base = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'csv')
    os.mkdir(csv_base)
    for file_num in xrange(nfiles):
        target = pjoin(csv_base, '{0}.csv'.format(file_num))
        print('Writing {0}'.format(target))
        df.to_csv(target, index=False, header=False)
def setUp(self):
    """Load legacy msgpack fixture data and the minimum expected layout."""
    from pandas.io.tests.generate_legacy_storage_files import (
        create_msgpack_data,
        create_data,
    )

    self.data = create_msgpack_data()
    self.all_data = create_data()
    # unique temp filename for this test run
    self.path = u("__%s__.msgpack" % tm.rands(10))
    # keys each fixture file must contain, at minimum
    self.minimum_structure = {
        "series": ["float", "int", "mixed", "ts", "mi", "dup"],
        "frame": ["float", "int", "mixed", "mi"],
        "panel": ["float"],
        "index": ["int", "date", "period"],
        "mi": ["reg2"],
    }
def test_series_frame_radd_bug(self):
    """Right-adding a string prefixes every element (GH 353)."""
    from pandas.util.testing import rands

    vals = Series([rands(5) for _ in xrange(10)])
    expected = vals.map(lambda x: 'foo_' + x)

    result = 'foo_' + vals
    assert_series_equal(result, expected)

    frame = DataFrame({'vals': vals})
    result = 'foo_' + frame
    tm.assert_frame_equal(result, DataFrame({'vals': expected}))
def test_compress_group_combinations(self):
    """Outer merge over ~40,000,000 possible unique group combinations.

    The merge is executed purely to exercise the label compression code
    path; the result itself is not inspected.  (The previous version bound
    it to an unused local ``merged``.)
    """
    key1 = np.array([rands(10) for _ in xrange(10000)], dtype="O")
    key1 = np.tile(key1, 2)
    key2 = key1[::-1]

    df = DataFrame({"key1": key1, "key2": key2,
                    "value1": np.random.randn(20000)})
    df2 = DataFrame({"key1": key1[::2], "key2": key2[::2],
                     "value2": np.random.randn(10000)})

    # just to hit the label compression code path
    merge(df, df2, how="outer")
def _check_extension_indexlabels(self, ext):
    """Round-trip several index_label shapes through Excel for `ext`."""
    path = '__tmp_to_excel_from_excel_indexlabels__.' + ext
    try:
        self.frame['A'][:5] = nan

        self.frame.to_excel(path, 'test1')
        self.frame.to_excel(path, 'test1', cols=['A', 'B'])
        self.frame.to_excel(path, 'test1', header=False)
        self.frame.to_excel(path, 'test1', index=False)

        # index_label as 1-list, over-long list, then scalar; each round
        # trip must recover the index name 'test'
        for label_arg in (['test'], ['test', 'dummy', 'dummy2'], 'test'):
            bool_frame = (DataFrame(np.random.randn(10, 2)) >= 0)
            bool_frame.to_excel(path, 'test1', index_label=label_arg)
            reader = ExcelFile(path)
            recons = reader.parse('test1', index_col=0).astype(np.int64)
            bool_frame.index.names = ['test']
            self.assertEqual(bool_frame.index.names, recons.index.names)
    finally:
        os.remove(path)

    # test index_labels in same row as column names
    path = '%s.xls' % tm.rands(10)
    try:
        self.frame.to_excel(path, 'test1',
                            cols=['A', 'B', 'C', 'D'], index=False)
        # take 'A' and 'B' as indexes (they are in same row as cols 'C', 'D')
        df = self.frame.copy()
        df = df.set_index(['A', 'B'])
        reader = ExcelFile(path)
        recons = reader.parse('test1', index_col=[0, 1])
        tm.assert_frame_equal(df, recons)
    finally:
        os.remove(path)
def test_wide_repr(self):
    """Wide frames switch to the wide repr when expand_frame_repr is off.

    Fixed: ``reset_option`` now runs in a ``finally`` block so that a
    failing assertion cannot leave ``display.expand_frame_repr`` modified
    for later tests.
    """
    with option_context('mode.sim_interactive', True):
        col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
        df = DataFrame([col(20, 25) for _ in range(10)])
        try:
            set_option('display.expand_frame_repr', False)
            rep_str = repr(df)
            set_option('display.expand_frame_repr', True)
            wide_repr = repr(df)
            self.assert_(rep_str != wide_repr)

            with option_context('display.line_width', 120):
                wider_repr = repr(df)
                self.assert_(len(wider_repr) < len(wide_repr))
        finally:
            reset_option('display.expand_frame_repr')
def test_leak3():
    """Repeated ParquetWriter.write_table calls should use minimal memory.

    Fixed: the writer is now closed when the check finishes; previously it
    was never closed, leaking the open output file handle.
    """
    import pyarrow.parquet as pq

    df = pd.DataFrame({'a{0}'.format(i): [1, 2, 3, 4] for i in range(50)})
    table = pa.Table.from_pandas(df, preserve_index=False)

    writer = pq.ParquetWriter('leak_test_' + tm.rands(5) + '.parquet',
                              table.schema)

    def func():
        writer.write_table(table, row_group_size=len(table))

    # This does not "leak" per se but we do want to have this use as little
    # memory as possible
    try:
        assert_does_not_leak(func, iterations=500, check_interval=50,
                             tolerance=20)
    finally:
        writer.close()
def test_astype_unicode(self):
    """astype('unicode') is equivalent to mapping ``str`` (see gh-7758)."""
    digits = string.digits
    test_series = [
        Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
        Series(['データーサイエンス、お前はもう死んでいる']),
    ]
    # Bytes only round-trip when the interpreter default encoding is utf-8.
    if sys.getdefaultencoding() == "utf-8":
        test_series.append(Series(['野菜食べないとやばい'
                                   .encode("utf-8")]))

    for s in test_series:
        res = s.astype("unicode")
        expec = s.map(str)
        tm.assert_series_equal(res, expec)

    # NOTE(review): removed the "restore the former encoding" block — it was
    # guarded on `former_encoding`, which was initialized to None and never
    # reassigned, making the block unreachable dead code (and `reload(sys)` /
    # `sys.setdefaultencoding` do not exist on Python 3).
def test_compress_group_combinations(self):
    """Outer merge across ~40,000,000 possible unique group combinations.

    The merge exists only to exercise the label compression code path, so
    the previous unused local binding (``merged = ...``) was dropped.
    """
    key1 = np.array([rands(10) for _ in xrange(10000)], dtype='O')
    key1 = np.tile(key1, 2)
    key2 = key1[::-1]

    df = DataFrame({'key1': key1, 'key2': key2,
                    'value1': np.random.randn(20000)})
    df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2],
                     'value2': np.random.randn(10000)})

    # just to hit the label compression code path
    merge(df, df2, how='outer')
def _check_extension_indexlabels(self, ext):
    """Round-trip several index_label shapes through Excel for `ext`."""
    path = "__tmp_to_excel_from_excel_indexlabels__." + ext
    with ensure_clean(path) as path:
        self.frame["A"][:5] = nan

        self.frame.to_excel(path, "test1")
        self.frame.to_excel(path, "test1", cols=["A", "B"])
        self.frame.to_excel(path, "test1", header=False)
        self.frame.to_excel(path, "test1", index=False)

        # index_label as 1-list, over-long list, then scalar; each round
        # trip must recover the index name 'test'
        for label_arg in (["test"], ["test", "dummy", "dummy2"], "test"):
            bool_frame = DataFrame(np.random.randn(10, 2)) >= 0
            bool_frame.to_excel(path, "test1", index_label=label_arg)
            reader = ExcelFile(path)
            recons = reader.parse("test1", index_col=0).astype(np.int64)
            bool_frame.index.names = ["test"]
            self.assertEqual(bool_frame.index.names, recons.index.names)

    # test index_labels in same row as column names
    path = "%s.xls" % tm.rands(10)
    with ensure_clean(path) as path:
        self.frame.to_excel(path, "test1",
                            cols=["A", "B", "C", "D"], index=False)
        # take 'A' and 'B' as indexes (they are in same row as cols 'C', 'D')
        df = self.frame.copy()
        df = df.set_index(["A", "B"])
        reader = ExcelFile(path)
        recons = reader.parse("test1", index_col=[0, 1])
        tm.assert_frame_equal(df, recons)
def test_excel_roundtrip_indexname(self):
    """df.index.name survives an Excel write/read round trip."""
    _skip_if_no_xlrd()
    path = '%s.%s' % (tm.rands(10), self.ext)

    df = DataFrame(np.random.randn(10, 4))
    df.index.name = 'foo'

    with ensure_clean(path) as path:
        df.to_excel(path)

        xf = ExcelFile(path)
        result = xf.parse(xf.sheet_names[0], index_col=0)

        tm.assert_frame_equal(result, df)
        self.assertEqual(result.index.name, 'foo')
def _test_dataframe(size=10000, seed=0):
    """Build a deterministic test DataFrame covering every fixed-width
    integer type, floats, bools and random strings."""
    np.random.seed(seed)
    frame = pd.DataFrame({
        'uint8': _random_integers(size, np.uint8),
        'uint16': _random_integers(size, np.uint16),
        'uint32': _random_integers(size, np.uint32),
        'uint64': _random_integers(size, np.uint64),
        'int8': _random_integers(size, np.int8),
        'int16': _random_integers(size, np.int16),
        'int32': _random_integers(size, np.int32),
        'int64': _random_integers(size, np.int64),
        'float32': np.random.randn(size).astype(np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': [tm.rands(10) for _ in range(size)],
    })
    return frame
def test_wide_repr(self):
    """Wide frames switch to the wide repr when expand_frame_repr is off.

    Fixed: option restoration now runs in a ``finally`` block, so a failing
    assertion no longer leaves the global print options modified.
    """
    set_option('test.interactive', True)
    try:
        col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
        df = DataFrame([col(20, 25) for _ in range(10)])

        set_option('print.expand_frame_repr', False)
        rep_str = repr(df)
        set_option('print.expand_frame_repr', True)
        wide_repr = repr(df)
        self.assert_(rep_str != wide_repr)

        set_option('print.line_width', 120)
        wider_repr = repr(df)
        self.assert_(len(wider_repr) < len(wide_repr))
    finally:
        reset_option('print.expand_frame_repr')
        set_option('test.interactive', False)
        set_option('print.line_width', 80)
def test_repr_truncation(self):
    """repr truncates long cells with '...' per print_config.max_colwidth.

    Fixed: the previous version left ``max_colwidth`` at ``max_len + 2``
    on exit (and at whatever it last set on failure); the saved value is
    now restored in a ``finally`` block.
    """
    max_len = 20
    saved_colwidth = fmt.print_config.max_colwidth
    try:
        fmt.print_config.max_colwidth = max_len
        df = DataFrame(
            {"A": np.random.randn(10),
             "B": [tm.rands(np.random.randint(max_len - 1, max_len + 1))
                   for _ in range(10)]}
        )
        r = repr(df)
        r = r[r.find("\n") + 1:]  # drop the header row
        for line, value in zip(r.split("\n"), df["B"]):
            if fmt._strlen(value) + 1 > max_len:
                self.assert_("..." in line)
            else:
                self.assert_("..." not in line)

        fmt.print_config.max_colwidth = None
        self.assert_("..." not in repr(df))

        fmt.print_config.max_colwidth = max_len + 2
        self.assert_("..." not in repr(df))
    finally:
        fmt.print_config.max_colwidth = saved_colwidth
def test_repr_truncation(self):
    """repr truncates long cells with '...' per print_config.max_colwidth.

    Fixed: ``max_colwidth`` was previously left at ``max_len + 2`` on exit;
    the saved value is now restored in a ``finally`` block.
    """
    max_len = 20
    saved_colwidth = fmt.print_config.max_colwidth
    try:
        fmt.print_config.max_colwidth = max_len
        df = DataFrame({'A': np.random.randn(10),
                        'B': [tm.rands(np.random.randint(max_len - 1,
                                                         max_len + 1))
                              for _ in range(10)]})
        r = repr(df)
        r = r[r.find('\n') + 1:]  # drop the header row
        for line, value in zip(r.split('\n'), df['B']):
            if fmt._strlen(value) + 1 > max_len:
                self.assert_('...' in line)
            else:
                self.assert_('...' not in line)

        fmt.print_config.max_colwidth = None
        self.assert_('...' not in repr(df))

        fmt.print_config.max_colwidth = max_len + 2
        self.assert_('...' not in repr(df))
    finally:
        fmt.print_config.max_colwidth = saved_colwidth
def test_wide_repr_multiindex_cols(self):
    """Wide frames with MultiIndex columns use the wide repr when
    expand_frame_repr is off.

    Fixed: ``reset_option`` now runs in a ``finally`` block so a failing
    assertion cannot leave ``display.expand_frame_repr`` modified.
    """
    with option_context("mode.sim_interactive", True):
        col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
        midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
                                              np.array(col(10, 5))])
        mcols = pandas.MultiIndex.from_arrays([np.array(col(20, 3)),
                                               np.array(col(20, 3))])
        df = DataFrame([col(20, 25) for _ in range(10)],
                       index=midx, columns=mcols)
        df.index.names = ["Level 0", "Level 1"]
        try:
            set_option("display.expand_frame_repr", False)
            rep_str = repr(df)
            set_option("display.expand_frame_repr", True)
            wide_repr = repr(df)
            self.assert_(rep_str != wide_repr)

            with option_context("display.line_width", 120):
                wider_repr = repr(df)
                self.assert_(len(wider_repr) < len(wide_repr))
                self.assert_(len(wide_repr.splitlines()) == 14 * 10 - 1)
        finally:
            reset_option("display.expand_frame_repr")
def test_wide_repr_multiindex(self):
    """Wide frames with a MultiIndex keep index-level names in every chunk
    of the wide repr.

    Fixed: ``reset_option`` now runs in a ``finally`` block so a failing
    assertion cannot leave ``print.expand_frame_repr`` modified.
    """
    with option_context("mode.sim_interactive", True):
        col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
        midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)),
                                              np.array(col(10, 5))])
        df = DataFrame([col(20, 25) for _ in range(10)], index=midx)
        df.index.names = ["Level 0", "Level 1"]
        try:
            set_option("print.expand_frame_repr", False)
            rep_str = repr(df)
            set_option("print.expand_frame_repr", True)
            wide_repr = repr(df)
            self.assert_(rep_str != wide_repr)

            with option_context("print.line_width", 120):
                wider_repr = repr(df)
                self.assert_(len(wider_repr) < len(wide_repr))

            for line in wide_repr.splitlines()[1::13]:
                self.assert_("Level 0 Level 1" in line)
        finally:
            reset_option("print.expand_frame_repr")
def test_wide_repr_named(self):
    """A named index appears in every chunk of the wide repr.

    Fixed: ``reset_option`` now runs in a ``finally`` block so a failing
    assertion cannot leave ``display.expand_frame_repr`` modified.
    """
    with option_context('mode.sim_interactive', True):
        col = lambda l, k: [tm.rands(k) for _ in xrange(l)]
        df = DataFrame([col(20, 25) for _ in range(10)])
        df.index.name = 'DataFrame Index'
        try:
            set_option('display.expand_frame_repr', False)
            rep_str = repr(df)
            set_option('display.expand_frame_repr', True)
            wide_repr = repr(df)
            self.assert_(rep_str != wide_repr)

            with option_context('display.line_width', 120):
                wider_repr = repr(df)
                self.assert_(len(wider_repr) < len(wide_repr))

            for line in wide_repr.splitlines()[1::13]:
                self.assert_('DataFrame Index' in line)
        finally:
            reset_option('display.expand_frame_repr')
def _test_dataframe(size=10000, seed=0):
    """Build a deterministic test DataFrame covering every fixed-width
    integer type, floats, bools, strings and all-null columns."""
    np.random.seed(seed)
    frame = pd.DataFrame({
        'uint8': _random_integers(size, np.uint8),
        'uint16': _random_integers(size, np.uint16),
        'uint32': _random_integers(size, np.uint32),
        'uint64': _random_integers(size, np.uint64),
        'int8': _random_integers(size, np.int8),
        'int16': _random_integers(size, np.int16),
        'int32': _random_integers(size, np.int32),
        'int64': _random_integers(size, np.int64),
        'float32': np.random.randn(size).astype(np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': [tm.rands(10) for _ in range(size)],
        'all_none': [None] * size,
        'all_none_category': [None] * size,
    })
    # TODO(PARQUET-1015)
    # frame['all_none_category'] = frame['all_none_category'].astype('category')
    return frame
def test_excel_roundtrip_indexname(self):
    """df.index.name survives an xls write/read round trip.

    Fixed: cleanup now runs in a ``finally`` block; previously the temp
    file leaked whenever an assertion failed before the ``os.remove``.
    """
    _skip_if_no_xlrd()
    _skip_if_no_xlwt()

    path = '%s.xls' % tm.rands(10)
    df = DataFrame(np.random.randn(10, 4))
    df.index.name = 'foo'

    try:
        df.to_excel(path)

        xf = ExcelFile(path)
        result = xf.parse(xf.sheet_names[0], index_col=0)

        tm.assert_frame_equal(result, df)
        self.assertEqual(result.index.name, 'foo')
    finally:
        try:
            os.remove(path)
        except os.error:
            pass
def test_excel_roundtrip_indexname(self):
    """df.index.name survives an xls write/read round trip.

    Fixed: cleanup now runs in a ``finally`` block; previously the temp
    file leaked whenever an assertion failed before the ``os.remove``.
    """
    _skip_if_no_xlrd()
    _skip_if_no_xlwt()

    path = "%s.xls" % tm.rands(10)
    df = DataFrame(np.random.randn(10, 4))
    df.index.name = "foo"

    try:
        df.to_excel(path)

        xf = ExcelFile(path)
        result = xf.parse(xf.sheet_names[0], index_col=0)

        tm.assert_frame_equal(result, df)
        self.assertEqual(result.index.name, "foo")
    finally:
        try:
            os.remove(path)
        except os.error:
            pass
def test_utf16_bom_skiprows(self):
    """skiprows works for utf-16 encoded files carrying a BOM (#2298).

    Fixed: the local holding the encoded payload was named ``bytes``,
    shadowing the builtin; renamed to ``encoded``.
    """
    data = u("""skip this
skip this too
A\tB\tC
1\t2\t3
4\t5\t6""")

    data2 = u("""skip this
skip this too
A,B,C
1,2,3
4,5,6""")

    path = '__%s__.csv' % tm.rands(10)

    with tm.ensure_clean(path) as path:
        for sep, dat in [('\t', data), (',', data2)]:
            for enc in ['utf-16', 'utf-16le', 'utf-16be']:
                encoded = dat.encode(enc)
                with open(path, 'wb') as f:
                    f.write(encoded)

                s = BytesIO(dat.encode('utf-8'))
                if compat.PY3:
                    # somewhat False since the code never sees bytes
                    from io import TextIOWrapper
                    s = TextIOWrapper(s, encoding='utf-8')

                result = self.read_csv(path, encoding=enc, skiprows=2,
                                       sep=sep)
                expected = self.read_csv(s, encoding='utf-8', skiprows=2,
                                         sep=sep)
                s.close()
                tm.assert_frame_equal(result, expected)
def test_strided_data_import(self):
    """Round-trip 2-D (strided) numpy data of assorted dtypes."""
    columns = ['a', 'b', 'c']
    N, K = 100, 3
    random_numbers = np.random.randn(N, K).copy() * 100

    cases = []
    for type_name in ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
                      'f4', 'f8']:
        cases.append(random_numbers.astype(type_name))

    # strings
    cases.append(
        np.array([tm.rands(10) for _ in range(N * K)],
                 dtype=object).reshape(N, K).copy())

    # booleans
    boolean_objects = (np.array([True, False, True] * N,
                                dtype=object).reshape(N, K).copy())
    # add some nulls, so dtype comes back as objects
    boolean_objects[5] = None
    cases.append(boolean_objects)

    cases.append(
        np.arange("2016-01-01T00:00:00.001", N * K,
                  dtype='datetime64[ms]').reshape(N, K).copy())

    strided_mask = (random_numbers > 0).astype(bool)[:, 0]

    for case in cases:
        df = pd.DataFrame(case, columns=columns)
        col = df['a']
        self._check_pandas_roundtrip(df)
        self._check_array_roundtrip(col)
        self._check_array_roundtrip(col, mask=strided_mask)
def test_repr_truncation(self):
    """repr truncates long cells with '...' per print.max_colwidth.

    Fixed: the option was previously left at ``max_len + 2`` when the test
    finished (and at whatever was last set if an assertion failed); it is
    now reset in a ``finally`` block.
    """
    max_len = 20
    set_option("print.max_colwidth", max_len)
    try:
        df = DataFrame({'A': np.random.randn(10),
                        'B': [tm.rands(np.random.randint(max_len - 1,
                                                         max_len + 1))
                              for _ in range(10)]})
        r = repr(df)
        r = r[r.find('\n') + 1:]  # drop the header row
        _strlen = fmt._strlen_func()
        for line, value in zip(r.split('\n'), df['B']):
            if _strlen(value) + 1 > max_len:
                self.assert_('...' in line)
            else:
                self.assert_('...' not in line)

        set_option("print.max_colwidth", 999999)
        self.assert_('...' not in repr(df))

        set_option("print.max_colwidth", max_len + 2)
        self.assert_('...' not in repr(df))
    finally:
        reset_option("print.max_colwidth")
def test_strided_data_import(self):
    """Round-trip 2-D (strided) numpy data across many dtypes."""
    columns = ['a', 'b', 'c']
    N, K = 100, 3
    random_numbers = np.random.randn(N, K).copy() * 100

    numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
                      'f4', 'f8']
    cases = [random_numbers.astype(name) for name in numeric_dtypes]

    # strings
    cases.append(np.array([tm.rands(10) for _ in range(N * K)],
                          dtype=object)
                 .reshape(N, K).copy())

    # booleans
    boolean_objects = (np.array([True, False, True] * N, dtype=object)
                       .reshape(N, K).copy())
    # add some nulls, so dtype comes back as objects
    boolean_objects[5] = None
    cases.append(boolean_objects)

    cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
                           dtype='datetime64[ms]')
                 .reshape(N, K).copy())

    strided_mask = (random_numbers > 0).astype(bool)[:, 0]

    for case in cases:
        df = pd.DataFrame(case, columns=columns)
        col = df['a']
        self._check_pandas_roundtrip(df)
        self._check_array_roundtrip(col)
        self._check_array_roundtrip(col, mask=strided_mask)
def test_utf16_bom_skiprows(self):
    """skiprows works for utf-16 encoded files carrying a BOM (#2298).

    Fixed: the local holding the encoded payload was named ``bytes``,
    shadowing the builtin; renamed to ``encoded``.
    """
    data = u(
        """skip this
skip this too
A\tB\tC
1\t2\t3
4\t5\t6"""
    )

    data2 = u(
        """skip this
skip this too
A,B,C
1,2,3
4,5,6"""
    )

    path = "__%s__.csv" % tm.rands(10)

    with tm.ensure_clean(path) as path:
        for sep, dat in [("\t", data), (",", data2)]:
            for enc in ["utf-16", "utf-16le", "utf-16be"]:
                encoded = dat.encode(enc)
                with open(path, "wb") as f:
                    f.write(encoded)

                s = BytesIO(dat.encode("utf-8"))
                if compat.PY3:
                    # somewhat False since the code never sees bytes
                    from io import TextIOWrapper

                    s = TextIOWrapper(s, encoding="utf-8")

                result = self.read_csv(path, encoding=enc, skiprows=2,
                                       sep=sep)
                expected = self.read_csv(s, encoding="utf-8", skiprows=2,
                                         sep=sep)
                s.close()
                tm.assert_frame_equal(result, expected)
def get_random_path():
    """Return a randomized pickle filename to avoid cross-test collisions."""
    suffix = tm.rands(10)
    return u'__%s__.pickle' % suffix
class TestSeriesDtypes:
    """Tests for Series dtype introspection and ``Series.astype`` conversions
    (numeric, datetime, string/unicode, categorical, and error paths)."""

    def test_dt64_series_astype_object(self):
        # datetime64 -> object yields datetime scalars, not Timestamps' dtype
        dt64ser = Series(date_range('20130101', periods=3))
        result = dt64ser.astype(object)
        assert isinstance(result.iloc[0], datetime)
        assert result.dtype == np.object_

    def test_td64_series_astype_object(self):
        # timedelta64 -> object yields timedelta scalars
        tdser = Series(['59 Days', '59 Days', 'NaT'],
                       dtype='timedelta64[ns]')
        result = tdser.astype(object)
        assert isinstance(result.iloc[0], timedelta)
        assert result.dtype == np.object_

    @pytest.mark.parametrize("dtype", ["float32", "float64", "int64",
                                       "int32"])
    def test_astype(self, dtype):
        # astype must convert the dtype and preserve the Series name
        s = Series(np.random.randn(5), name='foo')
        as_typed = s.astype(dtype)

        assert as_typed.dtype == dtype
        assert as_typed.name == s.name

    def test_asobject_deprecated(self):
        # .asobject is deprecated; still returns an object ndarray
        s = Series(np.random.randn(5), name='foo')
        with tm.assert_produces_warning(FutureWarning):
            o = s.asobject
        assert isinstance(o, np.ndarray)

    def test_dtype(self, datetime_series):
        # dtype/dtypes agree; legacy ftype accessors warn
        assert datetime_series.dtype == np.dtype('float64')
        assert datetime_series.dtypes == np.dtype('float64')

        # GH 26705 - Assert .ftype is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert datetime_series.ftype == 'float64:dense'

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert datetime_series.ftypes == 'float64:dense'
        tm.assert_series_equal(datetime_series.get_dtype_counts(),
                               Series(1, ['float64']))
        # GH18243 - Assert .get_ftype_counts is deprecated
        with tm.assert_produces_warning(FutureWarning):
            tm.assert_series_equal(datetime_series.get_ftype_counts(),
                                   Series(1, ['float64:dense']))

    @pytest.mark.parametrize("value", [np.nan, np.inf])
    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    def test_astype_cast_nan_inf_int(self, dtype, value):
        # gh-14265: check NaN and inf raise error when converting to int
        msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer'
        s = Series([value])

        with pytest.raises(ValueError, match=msg):
            s.astype(dtype)

    @pytest.mark.parametrize("dtype", [int, np.int8, np.int64])
    def test_astype_cast_object_int_fail(self, dtype):
        # non-numeric strings cannot be cast to integer dtypes
        arr = Series(["car", "house", "tree", "1"])
        msg = r"invalid literal for int\(\) with base 10: 'car'"
        with pytest.raises(ValueError, match=msg):
            arr.astype(dtype)

    def test_astype_cast_object_int(self):
        # numeric strings do cast cleanly to int
        arr = Series(['1', '2', '3', '4'], dtype=object)
        result = arr.astype(int)

        tm.assert_series_equal(result, Series(np.arange(1, 5)))

    def test_astype_datetime(self):
        # datetime64 (including all-NaT) -> object
        s = Series(iNaT, dtype='M8[ns]', index=range(5))

        s = s.astype('O')
        assert s.dtype == np.object_

        s = Series([datetime(2001, 1, 2, 0, 0)])

        s = s.astype('O')
        assert s.dtype == np.object_

        s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)])

        # assigning NaN keeps the datetime64 dtype
        s[1] = np.nan
        assert s.dtype == 'M8[ns]'

        s = s.astype('O')
        assert s.dtype == np.object_

    def test_astype_datetime64tz(self):
        # tz-aware datetime round-trips through object, naive values,
        # and tz conversion via astype
        s = Series(date_range('20130101', periods=3, tz='US/Eastern'))

        # astype
        result = s.astype(object)
        expected = Series(s.astype(object), dtype=object)
        tm.assert_series_equal(result, expected)

        result = Series(s.values).dt.tz_localize('UTC').dt.tz_convert(
            s.dt.tz)
        tm.assert_series_equal(result, s)

        # astype - object, preserves on construction
        result = Series(s.astype(object))
        expected = s.astype(object)
        tm.assert_series_equal(result, expected)

        # astype - datetime64[ns, tz]
        result = Series(s.values).astype('datetime64[ns, US/Eastern]')
        tm.assert_series_equal(result, s)

        result = Series(s.values).astype(s.dtype)
        tm.assert_series_equal(result, s)

        result = s.astype('datetime64[ns, CET]')
        expected = Series(date_range('20130101 06:00:00', periods=3,
                                     tz='CET'))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", [str, np.str_])
    @pytest.mark.parametrize("series", [
        Series([string.digits * 10,
                tm.rands(63), tm.rands(64), tm.rands(1000)]),
        Series([string.digits * 10,
                tm.rands(63), tm.rands(64), np.nan, 1.0])
    ])
    def test_astype_str_map(self, dtype, series):
        # see gh-4405
        result = series.astype(dtype)
        expected = series.map(str)
        tm.assert_series_equal(result, expected)

    def test_astype_str_cast(self):
        # see gh-9757: Timestamp/Timedelta -> str uses the repr forms
        ts = Series([Timestamp('2010-01-04 00:00:00')])
        s = ts.astype(str)

        expected = Series([str('2010-01-04')])
        tm.assert_series_equal(s, expected)

        ts = Series([Timestamp('2010-01-04 00:00:00', tz='US/Eastern')])
        s = ts.astype(str)

        expected = Series([str('2010-01-04 00:00:00-05:00')])
        tm.assert_series_equal(s, expected)

        td = Series([Timedelta(1, unit='d')])
        s = td.astype(str)

        expected = Series([str('1 days 00:00:00.000000000')])
        tm.assert_series_equal(s, expected)

    def test_astype_unicode(self):
        # see gh-7758: A bit of magic is required to set
        # default encoding to utf-8
        digits = string.digits
        test_series = [
            Series([digits * 10, tm.rands(63), tm.rands(64),
                    tm.rands(1000)]),
            Series(['データーサイエンス、お前はもう死んでいる']),
        ]

        former_encoding = None

        if sys.getdefaultencoding() == "utf-8":
            test_series.append(Series(['野菜食べないとやばい'
                                       .encode("utf-8")]))

        for s in test_series:
            res = s.astype("unicode")
            expec = s.map(str)
            tm.assert_series_equal(res, expec)

        # Restore the former encoding
        # NOTE(review): former_encoding is never reassigned above, so this
        # branch is unreachable here (a Python-2 leftover) — confirm before
        # relying on it.
        if former_encoding is not None and former_encoding != "utf-8":
            reload(sys)
            sys.setdefaultencoding(former_encoding)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        # see gh-7271: dict-like dtype arg may only key on the Series name
        s = Series(range(0, 10, 2), name='abc')

        dt1 = dtype_class({'abc': str})
        result = s.astype(dt1)
        expected = Series(['0', '2', '4', '6', '8'], name='abc')
        tm.assert_series_equal(result, expected)

        dt2 = dtype_class({'abc': 'float64'})
        result = s.astype(dt2)
        expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64',
                          name='abc')
        tm.assert_series_equal(result, expected)

        # any key other than (exactly) the Series name raises
        dt3 = dtype_class({'abc': str, 'def': str})
        msg = ("Only the Series name can be used for the key in Series dtype"
               r" mappings\.")
        with pytest.raises(KeyError, match=msg):
            s.astype(dt3)

        dt4 = dtype_class({0: str})
        with pytest.raises(KeyError, match=msg):
            s.astype(dt4)

        # GH16717
        # if dtypes provided is empty, it should error
        dt5 = dtype_class({})
        with pytest.raises(KeyError, match=msg):
            s.astype(dt5)

    def test_astype_categories_deprecation(self):
        # deprecated 17636: categories/ordered kwargs to astype('category')
        s = Series(['a', 'b', 'a'])
        expected = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = s.astype('category', categories=['a', 'b'],
                              ordered=True)
        tm.assert_series_equal(result, expected)

    def test_astype_from_categorical(self):
        items = ["a", "b", "c", "a"]
        s = Series(items)
        exp = Series(Categorical(items))
        res = s.astype('category')
        tm.assert_series_equal(res, exp)

        items = [1, 2, 3, 1]
        s = Series(items)
        exp = Series(Categorical(items))
        res = s.astype('category')
        tm.assert_series_equal(res, exp)

        df = DataFrame({"cats": [1, 2, 3, 4, 5, 6],
                        "vals": [1, 2, 3, 4, 5, 6]})
        cats = Categorical([1, 2, 3, 4, 5, 6])
        exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
        df["cats"] = df["cats"].astype("category")
        tm.assert_frame_equal(exp_df, df)

        df = DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'],
                        "vals": [1, 2, 3, 4, 5, 6]})
        cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
        exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
        df["cats"] = df["cats"].astype("category")
        tm.assert_frame_equal(exp_df, df)

        # with keywords
        lst = ["a", "b", "c", "a"]
        s = Series(lst)
        exp = Series(Categorical(lst, ordered=True))
        res = s.astype(CategoricalDtype(None, ordered=True))
        tm.assert_series_equal(res, exp)

        exp = Series(Categorical(lst, categories=list('abcdef'),
                                 ordered=True))
        res = s.astype(CategoricalDtype(list('abcdef'), ordered=True))
        tm.assert_series_equal(res, exp)

    def test_astype_categorical_to_other(self):
        # categorical -> category is a no-op; -> numeric/str follows values
        value = np.random.RandomState(0).randint(0, 10000, 100)
        df = DataFrame({'value': value})
        labels = ["{0} - {1}".format(i, i + 499)
                  for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                                   right=False, labels=cat_labels)

        s = df['value_group']
        expected = s
        tm.assert_series_equal(s.astype('category'), expected)
        tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
        msg = (r"could not convert string to float|"
               r"invalid literal for float\(\)")
        with pytest.raises(ValueError, match=msg):
            s.astype('float64')

        cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        tm.assert_series_equal(cat.astype('str'), exp)
        s2 = Series(Categorical(['1', '2', '3', '4']))
        exp2 = Series([1, 2, 3, 4]).astype(int)
        tm.assert_series_equal(s2.astype('int'), exp2)

        # object don't sort correctly, so just compare that we have the same
        # values
        def cmp(a, b):
            tm.assert_almost_equal(np.sort(np.unique(a)),
                                   np.sort(np.unique(b)))

        expected = Series(np.array(s.values), name='value_group')
        cmp(s.astype('object'), expected)
        cmp(s.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(s), np.array(s.values))

        # valid conversion
        for valid in [lambda x: x.astype('category'),
                      lambda x: x.astype(CategoricalDtype()),
                      lambda x: x.astype('object').astype('category'),
                      lambda x: x.astype('object').astype(
                          CategoricalDtype())]:
            result = valid(s)
            # compare series values
            # internal .categories can't be compared because it is sorted
            tm.assert_series_equal(result, s, check_categorical=False)

        # invalid conversion (these are NOT a dtype)
        msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\."
               "Categorical'> for astype")
        for invalid in [lambda x: x.astype(Categorical),
                        lambda x: x.astype('object').astype(Categorical)]:
            with pytest.raises(TypeError, match=msg):
                invalid(s)

    @pytest.mark.parametrize('name', [None, 'foo'])
    @pytest.mark.parametrize('dtype_ordered', [True, False])
    @pytest.mark.parametrize('series_ordered', [True, False])
    def test_astype_categorical_to_categorical(self, name, dtype_ordered,
                                               series_ordered):
        # GH 10696/18593
        s_data = list('abcaacbab')
        s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered)
        s = Series(s_data, dtype=s_dtype, name=name)

        # unspecified categories
        dtype = CategoricalDtype(ordered=dtype_ordered)
        result = s.astype(dtype)
        exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
        expected = Series(s_data, name=name, dtype=exp_dtype)
        tm.assert_series_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = s.astype('category', ordered=dtype_ordered)
        tm.assert_series_equal(result, expected)

        # different categories
        dtype = CategoricalDtype(list('adc'), dtype_ordered)
        result = s.astype(dtype)
        expected = Series(s_data, name=name, dtype=dtype)
        tm.assert_series_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = s.astype('category', categories=list('adc'),
                              ordered=dtype_ordered)
        tm.assert_series_equal(result, expected)

        if dtype_ordered is False:
            # not specifying ordered, so only test once
            expected = s
            result = s.astype('category')
            tm.assert_series_equal(result, expected)

    def test_astype_categoricaldtype(self):
        # astype with an explicit CategoricalDtype instance
        s = Series(['a', 'b', 'a'])
        result = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
        expected = Series(Categorical(['a', 'b', 'a'], ordered=True))
        tm.assert_series_equal(result, expected)

        result = s.astype(CategoricalDtype(['a', 'b'], ordered=False))
        expected = Series(Categorical(['a', 'b', 'a'], ordered=False))
        tm.assert_series_equal(result, expected)

        result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False))
        expected = Series(Categorical(['a', 'b', 'a'],
                                      categories=['a', 'b', 'c'],
                                      ordered=False))
        tm.assert_series_equal(result, expected)
        tm.assert_index_equal(result.cat.categories,
                              Index(['a', 'b', 'c']))

    def test_astype_categoricaldtype_with_args(self):
        # passing both a CategoricalDtype and kwargs is ambiguous -> raise
        s = Series(['a', 'b'])
        type_ = CategoricalDtype(['a', 'b'])

        msg = (r"Cannot specify a CategoricalDtype and also `categories` or"
               r" `ordered`\. Use `dtype=CategoricalDtype\(categories,"
               r" ordered\)` instead\.")
        with pytest.raises(TypeError, match=msg):
            s.astype(type_, ordered=True)
        with pytest.raises(TypeError, match=msg):
            s.astype(type_, categories=['a', 'b'])
        with pytest.raises(TypeError, match=msg):
            s.astype(type_, categories=['a', 'b'], ordered=False)

    @pytest.mark.parametrize("dtype", [
        np.datetime64,
        np.timedelta64,
    ])
    def test_astype_generic_timestamp_no_frequency(self, dtype):
        # see gh-15524, gh-15987: unit-less generic dtypes are rejected
        data = [1]
        s = Series(data)

        msg = ((r"The '{dtype}' dtype has no unit\. "
                r"Please pass in '{dtype}\[ns\]' instead.")
               .format(dtype=dtype.__name__))
        with pytest.raises(ValueError, match=msg):
            s.astype(dtype)

    @pytest.mark.parametrize("dtype", np.typecodes['All'])
    def test_astype_empty_constructor_equality(self, dtype):
        # see gh-15524: empty construction and empty astype must agree
        if dtype not in (
            "S", "V",  # poor support (if any) currently
            "M", "m"  # Generic timestamps raise a ValueError. Already tested.
        ):
            init_empty = Series([], dtype=dtype)
            as_type_empty = Series([]).astype(dtype)
            tm.assert_series_equal(init_empty, as_type_empty)

    @pytest.mark.filterwarnings('ignore::FutureWarning')
    def test_complex(self):
        # see gh-4819: complex access for ndarray compat
        a = np.arange(5, dtype=np.float64)
        b = Series(a + 4j * a)

        tm.assert_numpy_array_equal(a, np.real(b))
        tm.assert_numpy_array_equal(4 * a, np.imag(b))

        b.real = np.arange(5) + 5
        tm.assert_numpy_array_equal(a + 5, np.real(b))
        tm.assert_numpy_array_equal(4 * a, np.imag(b))

    def test_real_imag_deprecated(self):
        # GH 18262
        s = pd.Series([1])
        with tm.assert_produces_warning(FutureWarning):
            s.imag
            s.real

    def test_arg_for_errors_in_astype(self):
        # see gh-14878: errors kwarg must be 'raise' or 'ignore'
        s = Series([1, 2, 3])

        msg = (r"Expected value of kwarg 'errors' to be one of \['raise',"
               r" 'ignore'\]\. Supplied value is 'False'")
        with pytest.raises(ValueError, match=msg):
            s.astype(np.float64, errors=False)

        s.astype(np.int8, errors='raise')

    def test_intercept_astype_object(self):
        series = Series(date_range('1/1/2000', periods=10))

        # This test no longer makes sense, as
        # Series is by default already M8[ns].
        expected = series.astype('object')

        df = DataFrame({'a': series,
                        'b': np.random.randn(len(series))})
        exp_dtypes = Series([np.dtype('datetime64[ns]'),
                             np.dtype('float64')], index=['a', 'b'])
        tm.assert_series_equal(df.dtypes, exp_dtypes)

        result = df.values.squeeze()
        assert (result[:, 0] == expected.values).all()

        df = DataFrame({'a': series, 'b': ['foo'] * len(series)})

        result = df.values.squeeze()
        assert (result[:, 0] == expected.values).all()

    def test_series_to_categorical(self):
        # see gh-16524: test conversion of Series to Categorical
        series = Series(['a', 'b', 'c'])

        result = Series(series, dtype='category')
        expected = Series(['a', 'b', 'c'], dtype='category')

        tm.assert_series_equal(result, expected)

    def test_infer_objects_series(self):
        # GH 11221
        actual = Series(np.array([1, 2, 3], dtype='O')).infer_objects()
        expected = Series([1, 2, 3])
        tm.assert_series_equal(actual, expected)

        actual = Series(np.array([1, 2, 3, None],
                                 dtype='O')).infer_objects()
        expected = Series([1., 2., 3., np.nan])
        tm.assert_series_equal(actual, expected)

        # only soft conversions, unconvertable pass thru unchanged
        actual = (Series(np.array([1, 2, 3, None, 'a'],
                                  dtype='O')).infer_objects())
        expected = Series([1, 2, 3, None, 'a'])

        assert actual.dtype == 'object'
        tm.assert_series_equal(actual, expected)

    def test_is_homogeneous_type(self):
        assert Series()._is_homogeneous_type
        assert Series([1, 2])._is_homogeneous_type
        assert Series(pd.Categorical([1, 2]))._is_homogeneous_type

    @pytest.mark.parametrize("data", [
        pd.period_range("2000", periods=4),
        pd.IntervalIndex.from_breaks([1, 2, 3, 4])
    ])
    def test_values_compatibility(self, data):
        # https://github.com/pandas-dev/pandas/issues/23995
        result = pd.Series(data).values
        expected = np.array(data.astype(object))
        tm.assert_numpy_array_equal(result, expected)
def get_random_path():
    """Return a randomized pickle filename so parallel tests never collide."""
    unique_part = tm.rands(10)
    return "__{}__.pickle".format(unique_part)
def test_rands():
    # rands(n) must produce a string of exactly n characters
    assert len(tm.rands(10)) == 10
def setup(self, uniqueness, total):
    # Build a string column whose distinct-value count is `uniqueness`
    # (a fraction of `total`), then wrap it in a single-column table.
    nunique = int(total * uniqueness)
    distinct = [tm.rands(self.string_length) for _ in range(nunique)]
    repeats = total // nunique
    values = distinct * repeats
    self.arr = pa.array(values, type=pa.string())
    self.table = pa.Table.from_arrays([self.arr], ['f0'])