def test_read_zipped_json():
    uncompressed_path = tm.get_data_path("tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)

    compressed_path = tm.get_data_path("tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression='zip')

    assert_frame_equal(uncompressed_df, compressed_df)
def test_parse_public_s3_bucket(self):
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext, compression=comp)
        self.assertTrue(isinstance(df, DataFrame))
        self.assertFalse(df.empty)
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')), df)

    # Read public file from bucket with not-public contents
    df = read_csv('s3://cant_get_it/tips.csv')
    self.assertTrue(isinstance(df, DataFrame))
    self.assertFalse(df.empty)
    tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df)
def test_categorical_dtype_encoding(self): # GH 10153 pth = tm.get_data_path("unicode_series.csv") encoding = "latin-1" expected = self.read_csv(pth, header=None, encoding=encoding) expected[1] = Categorical(expected[1]) actual = self.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) tm.assert_frame_equal(actual, expected) pth = tm.get_data_path("utf16_ex.txt") encoding = "utf-16" expected = self.read_table(pth, encoding=encoding) expected = expected.apply(Categorical) actual = self.read_table(pth, encoding=encoding, dtype="category") tm.assert_frame_equal(actual, expected)
def test_parse_public_s3a_bucket(self):
    # Read from AWS S3 as an "s3a://" URL
    df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
    self.assertTrue(isinstance(df, DataFrame))
    self.assertFalse(df.empty)
    tm.assert_frame_equal(read_csv(
        tm.get_data_path('tips.csv')).iloc[:10], df)
def test_pickles(current_pickle_data, version, f):
    if not is_platform_little_endian():
        pytest.skip("known failure on non-little endian")

    vf = tm.get_data_path('legacy_pickle/{}/{}'.format(version, f))
    with catch_warnings(record=True):
        compare(current_pickle_data, vf, version)
def setUp(self): if sys.version_info < (2, 7, 0): import nose raise nose.SkipTest("Doesn't support Python 2.6 because of ElementTree incompat") self.dirpath = tm.get_data_path()
def test_parse_public_s3n_bucket(self):
    # Read from AWS S3 as an "s3n://" URL
    df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
    assert isinstance(df, DataFrame)
    assert not df.empty
    tm.assert_frame_equal(read_csv(
        tm.get_data_path('tips.csv')).iloc[:10], df)
def legacy_packers_versions():
    # yield the packers versions
    path = tm.get_data_path('legacy_msgpack')
    for v in os.listdir(path):
        p = os.path.join(path, v)
        if os.path.isdir(p):
            yield v
def legacy_pickle_versions():
    # yield the pickle versions
    path = tm.get_data_path('legacy_pickle')
    for v in os.listdir(path):
        p = os.path.join(path, v)
        if os.path.isdir(p):
            yield v
def test_categorical_dtype_encoding(self):
    # GH 10153
    pth = tm.get_data_path('unicode_series.csv')
    encoding = 'latin-1'
    expected = self.read_csv(pth, header=None, encoding=encoding)
    expected[1] = Categorical(expected[1])
    actual = self.read_csv(pth, header=None, encoding=encoding,
                           dtype={1: 'category'})
    tm.assert_frame_equal(actual, expected)

    pth = tm.get_data_path('utf16_ex.txt')
    encoding = 'utf-16'
    expected = self.read_table(pth, encoding=encoding)
    expected = expected.apply(Categorical)
    actual = self.read_table(pth, encoding=encoding, dtype='category')
    tm.assert_frame_equal(actual, expected)
def test_parse_public_s3_bucket_chunked(self):
    # Read with a chunksize
    chunksize = 5
    local_tips = read_csv(tm.get_data_path('tips.csv'))
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        if comp == 'bz2' and compat.PY2:
            # The Python 2 C parser can't read bz2 from S3.
            self.assertRaises(ValueError, read_csv,
                              's3://pandas-test/tips.csv' + ext,
                              compression=comp)
        else:
            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                 chunksize=chunksize, compression=comp)
            self.assertEqual(df_reader.chunksize, chunksize)
            for i_chunk in [0, 1, 2]:
                # Read a couple of chunks and make sure we see them
                # properly.
                df = df_reader.get_chunk()
                self.assertTrue(isinstance(df, DataFrame))
                self.assertFalse(df.empty)
                true_df = local_tips.iloc[
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                # Chunking doesn't preserve row numbering
                true_df = true_df.reset_index().drop('index', axis=1)
                tm.assert_frame_equal(true_df, df)
def test_12659():
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "test_12659.sas7bdat")
    df = pd.read_sas(fname)
    fname = os.path.join(dirpath, "test_12659.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)
def test_read_pickles_0_11_0(self):
    if not is_little_endian():
        raise nose.SkipTest("known failure of test_read_pickles_0_11_0 "
                            "on non-little endian")

    pth = tm.get_data_path('legacy_pickle/0.11.0')
    for f in os.listdir(pth):
        vf = os.path.join(pth, f)
        self.compare(vf)
def test_parse_public_s3_bucket_nrows_python(self):
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                      nrows=10, compression=comp)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')).iloc[:10], df)
def read_pickles(self, version): if not is_little_endian(): raise nose.SkipTest("known failure on non-little endian") pth = tm.get_data_path('legacy_pickle/{0}'.format(str(version))) for f in os.listdir(pth): vf = os.path.join(pth,f) self.compare(vf)
def test_url(self):
    # HTTP(S)
    url = ("https://raw.github.com/pydata/pandas/master/"
           "pandas/io/tests/parser/data/salary.table.csv")
    url_table = self.read_table(url)
    dirpath = tm.get_data_path()
    localtable = os.path.join(dirpath, "salary.table.csv")
    local_table = self.read_table(localtable)
    tm.assert_frame_equal(url_table, local_table)
def test_airline():
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "airline.sas7bdat")
    df = pd.read_sas(fname)
    fname = os.path.join(dirpath, "airline.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0, check_exact=False)
def test_parse_public_s3_bucket_nrows(self):
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext, nrows=10,
                      compression=comp)
        self.assertTrue(isinstance(df, DataFrame))
        self.assertFalse(df.empty)
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')).iloc[:10], df)
def test_infer_s3_compression(self):
    for ext in ['', '.gz', '.bz2']:
        df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                      compression='infer')
        self.assertTrue(isinstance(df, DataFrame))
        self.assertFalse(df.empty)
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')), df)
def test_infer_s3_compression(self, s3_resource):
    for ext in ['', '.gz', '.bz2']:
        df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                      compression='infer')
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')), df)
def test_parse_public_s3_bucket(self):
    pytest.importorskip('s3fs')
    # This is more of an integration test because of the not-public
    # contents portion; that part could probably be mocked.
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext, compression=comp)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')), df)

    # Read public file from bucket with not-public contents
    df = read_csv('s3://cant_get_it/tips.csv')
    assert isinstance(df, DataFrame)
    assert not df.empty
    tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df)
def test_date_time():
    # Support of different SAS date/datetime formats (PR #15871)
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "datetime.sas7bdat")
    df = pd.read_sas(fname)
    fname = os.path.join(dirpath, "datetime.csv")
    df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime',
                                          'DateTimeHi', 'Taiw'])
    tm.assert_frame_equal(df, df0)
def test_msgpack(self):
    msgpack_path = tm.get_data_path('legacy_msgpack')
    n = 0
    for v in os.listdir(msgpack_path):
        pth = os.path.join(msgpack_path, v)
        if os.path.isdir(pth):
            yield self.read_msgpacks, v
            n += 1
    assert n > 0, 'Msgpack files are not tested'
def read_msgpacks(self, version):
    pth = tm.get_data_path('legacy_msgpack/{0}'.format(str(version)))
    n = 0
    for f in os.listdir(pth):
        vf = os.path.join(pth, f)
        self.compare(vf, version)
        n += 1
    assert n > 0, 'Msgpack files are not tested'
def test_productsales():
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "productsales.sas7bdat")
    df = pd.read_sas(fname, encoding='utf-8')
    fname = os.path.join(dirpath, "productsales.csv")
    df0 = pd.read_csv(fname, parse_dates=['MONTH'])
    vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
    df0[vn] = df0[vn].astype(np.float64)
    tm.assert_frame_equal(df, df0)
def setUp(self):
    self.dirpath = tm.get_data_path()
    self.csv1 = os.path.join(self.dirpath, "test1.csv")
    self.csv2 = os.path.join(self.dirpath, "test2.csv")
    self.xls1 = os.path.join(self.dirpath, "test.xls")
    self.frame = _frame.copy()
    self.frame2 = _frame2.copy()
    self.tsframe = _tsframe.copy()
    self.mixed_frame = _mixed_frame.copy()
def test_pickles(self):
    pickle_path = tm.get_data_path('legacy_pickle')
    n = 0
    for v in os.listdir(pickle_path):
        pth = os.path.join(pickle_path, v)
        if os.path.isdir(pth):
            yield self.read_pickles, v
            n += 1
    assert n > 0, 'Pickle files are not tested'
def read_data(self, name, dedupe=False):
    path = os.path.join(tm.get_data_path(), name)
    x = read_csv(path)
    if dedupe:
        x = (x.drop_duplicates(['time', 'ticker'], keep='last')
              .reset_index(drop=True))
    x.time = to_datetime(x.time)
    return x
def setUp(self):
    # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107)
    # can be downloaded from: http://stata-press.com/data/glmext.html
    self.dirpath = tm.get_data_path()
    self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta')
    self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta')
    self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta')
    self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta')
    self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta')
    self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta')
    self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta')
    self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta')
    self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta')
    self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta')
    self.csv3 = os.path.join(self.dirpath, 'stata3.csv')
    self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta')
    self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta')
    self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta')
    self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta')
    self.dta7 = os.path.join(self.dirpath, 'cancer.dta')
    self.csv7 = os.path.join(self.dirpath, 'cancer.csv')
    self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta')
    self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv')
    self.dta9 = os.path.join(self.dirpath, 'lbw.dta')
    self.csv9 = os.path.join(self.dirpath, 'lbw.csv')
    self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
    self.csv14 = os.path.join(self.dirpath, 'stata5.csv')
    self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
    self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta')
    self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta')
    self.dta14_117 = os.path.join(self.dirpath, 'stata5_117.dta')
    self.csv15 = os.path.join(self.dirpath, 'stata6.csv')
    self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta')
    self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta')
    self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
    self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')
    self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta')
    self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta')
    self.dta17_113 = os.path.join(self.dirpath, 'stata8_113.dta')
    self.dta17_115 = os.path.join(self.dirpath, 'stata8_115.dta')
    self.dta17_117 = os.path.join(self.dirpath, 'stata8_117.dta')
    self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta')
    self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta')
def test_read_from_http_url(self):
    _skip_if_no_xlrd()

    url = ('https://raw.github.com/pydata/pandas/master/'
           'pandas/io/tests/data/test.xlsx')
    url_table = read_excel(url)
    dirpath = tm.get_data_path()
    localtable = os.path.join(dirpath, 'test.xlsx')
    local_table = read_excel(localtable)
    tm.assert_frame_equal(url_table, local_table)
def test_pickles(current_pickle_data, version):
    if not is_platform_little_endian():
        pytest.skip("known failure on non-little endian")

    pth = tm.get_data_path('legacy_pickle/{0}'.format(version))
    n = 0
    for f in os.listdir(pth):
        vf = os.path.join(pth, f)
        data = compare(current_pickle_data, vf, version)

        if data is None:
            continue
        n += 1
    assert n > 0, 'Pickle files are not tested'
def read_pickles(self, version):
    if not is_platform_little_endian():
        raise nose.SkipTest("known failure on non-little endian")

    pth = tm.get_data_path('legacy_pickle/{0}'.format(str(version)))
    n = 0
    for f in os.listdir(pth):
        vf = os.path.join(pth, f)
        data = self.compare(vf, version)

        if data is None:
            continue
        n += 1
    assert n > 0, 'Pickle files are not tested'
def setUp(self):
    # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107)
    # can be downloaded from: http://stata-press.com/data/glmext.html
    self.dirpath = tm.get_data_path()
    self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta')
    self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta')
    self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta')
    self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta')
    self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta')
    self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta')
    self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta')
    self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta')
    self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta')
    self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta')
    self.csv3 = os.path.join(self.dirpath, 'stata3.csv')
    self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta')
    self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta')
    self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta')
    self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta')
    self.dta7 = os.path.join(self.dirpath, 'cancer.dta')
    self.csv7 = os.path.join(self.dirpath, 'cancer.csv')
    self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta')
    self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv')
    self.dta9 = os.path.join(self.dirpath, 'lbw.dta')
    self.csv9 = os.path.join(self.dirpath, 'lbw.csv')
    self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
    self.csv14 = os.path.join(self.dirpath, 'stata5.csv')
    self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
    self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta')
    self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta')
    self.dta14_117 = os.path.join(self.dirpath, 'stata5_117.dta')
    self.csv15 = os.path.join(self.dirpath, 'stata6.csv')
    self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta')
    self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta')
    self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
    self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')
    self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta')
    self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta')
def setUpClass(self):
    # Integration tests require a valid BigQuery token to be present
    # in the user's home directory. This can be generated with
    # 'bq init' on the command line.
    self.dirpath = tm.get_data_path()
    home = os.path.expanduser("~")
    self.bq_token = os.path.join(home, '.bigquery.v2.token')
    self.fake_job_path = os.path.join(self.dirpath, 'gbq_fake_job.txt')

    # If we're using a valid token, make a test dataset.
    # Note: dataset functionality is beyond the scope of the module
    # under test, so we rely on the command line utility for this.
    if os.path.exists(self.bq_token):
        subprocess.call(['bq', 'mk', '-d', 'pandas_testing_dataset'])
def test_pickle_v0_15_2():
    # ordered -> _ordered
    # GH 9347
    cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False,
                         categories=['a', 'b', 'c', 'd'])
    pickle_path = os.path.join(tm.get_data_path(),
                               'categorical_0_15_2.pickle')
    # This code was executed once on v0.15.2 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
def setUp(self):
    # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107)
    # can be downloaded from: http://stata-press.com/data/glmext.html
    self.dirpath = tm.get_data_path()
    self.dta1 = os.path.join(self.dirpath, 'stata1.dta')
    self.dta2 = os.path.join(self.dirpath, 'stata2.dta')
    self.dta3 = os.path.join(self.dirpath, 'stata3.dta')
    self.csv3 = os.path.join(self.dirpath, 'stata3.csv')
    self.dta4 = os.path.join(self.dirpath, 'stata4.dta')
    self.dta7 = os.path.join(self.dirpath, 'cancer.dta')
    self.csv7 = os.path.join(self.dirpath, 'cancer.csv')
    self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta')
    self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv')
    self.dta9 = os.path.join(self.dirpath, 'lbw.dta')
    self.csv9 = os.path.join(self.dirpath, 'lbw.csv')
def read_pickles(self, version): if not is_little_endian(): raise nose.SkipTest("known failure on non-little endian") pth = tm.get_data_path('legacy_pickle/{0}'.format(str(version))) for f in os.listdir(pth): vf = os.path.join(pth,f) data = self.compare(vf) if data is None: continue if 'series' in data: if 'ts' in data['series']: self._validate_timeseries(data['series']['ts'], self.data['series']['ts'])
def setUpClass(cls):
    super(TestYahooOptions, cls).setUpClass()

    # aapl has monthlies
    cls.aapl = web.Options('aapl', 'yahoo')
    today = datetime.today()
    cls.year = today.year
    cls.month = today.month + 1
    if cls.month > 12:  # pragma: no cover
        cls.month = 1
        cls.year = cls.year + 1
    cls.expiry = datetime(cls.year, cls.month, 1)
    cls.dirpath = tm.get_data_path()
    cls.json1 = 'file://' + os.path.join(cls.dirpath, 'yahoo_options1.json')
    cls.json2 = 'file://' + os.path.join(cls.dirpath, 'yahoo_options2.json')
    # Empty table GH#22
    cls.data1 = cls.aapl._process_data(cls.aapl._parse_url(cls.json1))
def test_parse_public_s3_bucket_nrows(self):
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        if comp == 'bz2' and compat.PY2:
            # The Python 2 C parser can't read bz2 from S3.
            self.assertRaises(ValueError, read_csv,
                              's3://pandas-test/tips.csv' + ext,
                              compression=comp)
        else:
            df = read_csv('s3://pandas-test/tips.csv' + ext, nrows=10,
                          compression=comp)
            self.assertTrue(isinstance(df, DataFrame))
            self.assertFalse(df.empty)
            tm.assert_frame_equal(
                read_csv(tm.get_data_path('tips.csv')).iloc[:10], df)
def test_file(self):
    # FILE
    if sys.version_info[:2] < (2, 6):
        raise nose.SkipTest("file:// not supported with Python < 2.6")
    dirpath = tm.get_data_path()
    localtable = os.path.join(dirpath, 'salary.table.csv')
    local_table = self.read_table(localtable)

    try:
        url_table = self.read_table('file://localhost/' + localtable)
    except URLError:
        # fails on some systems
        raise nose.SkipTest("failing on %s" %
                            ' '.join(platform.uname()).strip())

    tm.assert_frame_equal(url_table, local_table)
def boto3_client_s3(self):
    # see gh-16135

    # boto3 is a dependency of s3fs
    import boto3
    client = boto3.client("s3")

    key = "/tips.csv"
    bucket = "pandas-test"
    s3_object = client.get_object(Bucket=bucket, Key=key)

    result = read_csv(s3_object["Body"])
    assert isinstance(result, DataFrame)
    assert not result.empty

    expected = read_csv(tm.get_data_path('tips.csv'))
    tm.assert_frame_equal(result, expected)
def test_pickle_v0_14_1():
    # we have the name warning
    # 10482
    with tm.assert_produces_warning(UserWarning):
        cat = pd.Categorical(values=['a', 'b', 'c'],
                             categories=['a', 'b', 'c', 'd'],
                             name='foobar', ordered=False)
    pickle_path = os.path.join(tm.get_data_path(),
                               'categorical_0_14_1.pickle')
    # This code was executed once on v0.14.1 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
def test_encoding_options():
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "test1.sas7bdat")
    df1 = pd.read_sas(fname)
    df2 = pd.read_sas(fname, encoding='utf-8')
    for col in df1.columns:
        try:
            df1[col] = df1[col].str.decode('utf-8')
        except AttributeError:
            pass
    tm.assert_frame_equal(df1, df2)

    from pandas.io.sas.sas7bdat import SAS7BDATReader
    rdr = SAS7BDATReader(fname, convert_header_text=False)
    df3 = rdr.read()
    for x, y in zip(df1.columns, df3.columns):
        assert(x == y.decode())
def test_parse_public_s3_bucket_chunked_python(self):
    # Read with a chunksize using the Python parser
    chunksize = 5
    local_tips = read_csv(tm.get_data_path('tips.csv'))
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                             chunksize=chunksize, compression=comp,
                             engine='python')
        self.assertEqual(df_reader.chunksize, chunksize)
        for i_chunk in [0, 1, 2]:
            # Read a couple of chunks and make sure we see them properly.
            df = df_reader.get_chunk()
            self.assertTrue(isinstance(df, DataFrame))
            self.assertFalse(df.empty)
            true_df = local_tips.iloc[
                chunksize * i_chunk: chunksize * (i_chunk + 1)]
            tm.assert_frame_equal(true_df, df)
def test_parse_public_s3_bucket_chunked(self):
    # Read with a chunksize
    chunksize = 5
    local_tips = read_csv(tm.get_data_path('tips.csv'))
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                             chunksize=chunksize, compression=comp)
        assert df_reader.chunksize == chunksize
        for i_chunk in [0, 1, 2]:
            # Read a couple of chunks and make sure we see them
            # properly.
            df = df_reader.get_chunk()
            assert isinstance(df, DataFrame)
            assert not df.empty
            true_df = local_tips.iloc[
                chunksize * i_chunk: chunksize * (i_chunk + 1)]
            tm.assert_frame_equal(true_df, df)
def read_msgpacks(self, version):
    pth = tm.get_data_path('legacy_msgpack/{0}'.format(str(version)))
    n = 0
    for f in os.listdir(pth):
        # GH12142 0.17 files packed in P2 can't be read in P3
        if (compat.PY3 and version.startswith('0.17.') and
                f.split('.')[-4][-1] == '2'):
            continue
        vf = os.path.join(pth, f)
        try:
            self.compare(vf, version)
        except ImportError:
            # blosc not installed
            continue
        n += 1
    assert n > 0, 'Msgpack files are not tested'
def setUpClass(cls):
    super(TestYahooOptions, cls).setUpClass()
    _skip_if_no_lxml()

    # aapl has monthlies
    cls.aapl = web.Options('aapl', 'yahoo')
    today = datetime.today()
    cls.year = today.year
    cls.month = today.month + 1
    if cls.month > 12:
        cls.year = cls.year + 1
        cls.month = 1
    cls.expiry = datetime(cls.year, cls.month, 1)
    cls.dirpath = tm.get_data_path()
    cls.html1 = os.path.join(cls.dirpath, 'yahoo_options1.html')
    cls.html2 = os.path.join(cls.dirpath, 'yahoo_options2.html')
    cls.data1 = cls.aapl._option_frames_from_url(cls.html1)['puts']
def setUpClass(cls):
    super(TestYahooOptions, cls).setUpClass()
    _skip_if_no_lxml()

    # aapl has monthlies
    cls.aapl = web.Options('aapl', 'yahoo')
    d = (Timestamp.today() + pd.offsets.MonthBegin(1)).normalize()
    cls.year = d.year
    cls.month = d.month
    cls.expiry = d
    cls.expiry2 = d + pd.offsets.MonthBegin(1)
    cls.dirpath = tm.get_data_path()
    cls.html1 = os.path.join(cls.dirpath, 'yahoo_options1.html')
    cls.html2 = os.path.join(cls.dirpath, 'yahoo_options2.html')
    cls.html3 = os.path.join(cls.dirpath, 'yahoo_options3.html')
    # Empty table GH#22
    cls.data1 = cls.aapl._option_frames_from_url(cls.html1)['puts']
def setUpClass(cls):
    super(TestYahooOptions, cls).setUpClass()
    _skip_if_no_lxml()

    # aapl has monthlies
    cls.aapl = web.Options('aapl', 'yahoo')
    today = datetime.today()
    year = today.year
    month = today.month + 1
    if month > 12:
        year = year + 1
        month = 1
    cls.expiry = datetime(year, month, 1)
    cls.dirpath = tm.get_data_path()
    cls.html1 = os.path.join(cls.dirpath, 'yahoo_options1.html')
    cls.html2 = os.path.join(cls.dirpath, 'yahoo_options2.html')
    cls.root1 = cls.aapl._parse_url(cls.html1)
    cls.root2 = cls.aapl._parse_url(cls.html2)
def test_read_from_file_url(self):
    _skip_if_no_xlrd()

    # FILE
    if sys.version_info[:2] < (2, 6):
        raise nose.SkipTest("file:// not supported with Python < 2.6")

    dirpath = tm.get_data_path()
    localtable = os.path.join(dirpath, 'test.xlsx')
    local_table = read_excel(localtable)

    try:
        url_table = read_excel('file://localhost/' + localtable)
    except URLError:
        # fails on some systems
        raise nose.SkipTest("failing on %s" %
                            ' '.join(platform.uname()).strip())

    tm.assert_frame_equal(url_table, local_table)
def test_msgpacks_legacy(self, current_packers_data, all_packers_data, version): pth = tm.get_data_path("legacy_msgpack/{0}".format(version)) n = 0 for f in os.listdir(pth): # GH12142 0.17 files packed in P2 can't be read in P3 if (compat.PY3 and version.startswith("0.17.") and f.split(".")[-4][-1] == "2"): continue vf = os.path.join(pth, f) try: self.compare(current_packers_data, all_packers_data, vf, version) except ImportError: # blosc not installed continue n += 1 assert n > 0, "Msgpack files are not tested"
def test_msgpacks_legacy(self, current_packers_data, all_packers_data,
                         version):
    pth = tm.get_data_path('legacy_msgpack/{0}'.format(version))
    n = 0
    for f in os.listdir(pth):
        # GH12142 0.17 files packed in P2 can't be read in P3
        if (compat.PY3 and version.startswith('0.17.') and
                f.split('.')[-4][-1] == '2'):
            continue
        vf = os.path.join(pth, f)
        try:
            with catch_warnings(record=True):
                self.compare(current_packers_data, all_packers_data,
                             vf, version)
        except ImportError:
            # blosc not installed
            continue
        n += 1
    assert n > 0, 'Msgpack files are not tested'
def setup_class(cls):
    # AAPL has monthlies
    cls.aapl = web.Options('aapl', 'yahoo')
    today = datetime.today()
    cls.year = today.year
    cls.month = today.month + 1

    if cls.month > 12:  # pragma: no cover
        cls.month = 1
        cls.year = cls.year + 1

    cls.expiry = datetime(cls.year, cls.month, 1)
    cls.dirpath = tm.get_data_path()
    cls.json1 = 'file://' + os.path.join(
        cls.dirpath, 'yahoo_options1.json')

    # see gh-22: empty table
    cls.json2 = 'file://' + os.path.join(
        cls.dirpath, 'yahoo_options2.json')
    cls.data1 = cls.aapl._process_data(cls.aapl._parse_url(cls.json1))
def setUp(self):
    self.dirpath = tm.get_data_path()
    self.ts = tm.makeTimeSeries()
    self.ts.name = 'ts'
    self.series = tm.makeStringSeries()
    self.series.name = 'series'
    self.objSeries = tm.makeObjectSeries()
    self.objSeries.name = 'objects'
    self.empty_series = Series([], index=[])
    self.empty_frame = DataFrame({})
    self.frame = _frame.copy()
    self.frame2 = _frame2.copy()
    self.intframe = _intframe.copy()
    self.tsframe = _tsframe.copy()
    self.mixed_frame = _mixed_frame.copy()
def test_qcut_binning_issues(self):
    # #1978, 1979
    path = os.path.join(tm.get_data_path(), 'cut_data.csv')
    arr = np.loadtxt(path)

    result = qcut(arr, 20)

    starts = []
    ends = []
    for lev in result.categories:
        s, e = lev[1:-1].split(',')

        self.assertTrue(s != e)

        starts.append(float(s))
        ends.append(float(e))

    for (sp, sn), (ep, en) in zip(zip(starts[:-1], starts[1:]),
                                  zip(ends[:-1], ends[1:])):
        self.assertTrue(sp < sn)
        self.assertTrue(ep < en)
        self.assertTrue(ep <= sn)
def setUpClass(cls):
    super(TestYahooOptions, cls).setUpClass()
    _skip_if_no_lxml()

    # aapl has monthlies
    cls.aapl = web.Options('aapl', 'yahoo')
    today = datetime.today()
    year = today.year
    month = today.month + 1
    if month > 12:
        year = year + 1
        month = 1
    cls.expiry = datetime(year, month, 1)
    cls.dirpath = tm.get_data_path()
    cls.html1 = os.path.join(cls.dirpath, 'yahoo_options1.html')
    cls.html2 = os.path.join(cls.dirpath, 'yahoo_options2.html')
    cls.root1 = cls.aapl._parse_url(cls.html1)
    cls.root2 = cls.aapl._parse_url(cls.html2)
    cls.tables1 = cls.aapl._parse_option_page_from_yahoo(cls.root1)
    cls.unprocessed_data1 = web._parse_options_data(
        cls.tables1[cls.aapl._TABLE_LOC['puts']])
    cls.data1 = cls.aapl._process_data(cls.unprocessed_data1, 'put')
def test_qcut_binning_issues(self):
    # #1978, 1979
    path = os.path.join(tm.get_data_path(), 'cut_data.csv')
    arr = np.loadtxt(path)

    result = qcut(arr, 20)

    starts = []
    ends = []

    for lev in np.unique(result):
        s = lev.left
        e = lev.right
        assert s != e

        starts.append(float(s))
        ends.append(float(e))

    for (sp, sn), (ep, en) in zip(zip(starts[:-1], starts[1:]),
                                  zip(ends[:-1], ends[1:])):
        assert sp < sn
        assert ep < en
        assert ep <= sn
def setUp(self):
    self.dirpath = tm.get_data_path()
    self.data = []
    self.test_ix = [list(range(1, 16)), [16]]
    for j in 1, 2:
        fname = os.path.join(self.dirpath, "test_sas7bdat_%d.csv" % j)
        df = pd.read_csv(fname)
        epoch = pd.datetime(1960, 1, 1)
        t1 = pd.to_timedelta(df["Column4"], unit='d')
        df["Column4"] = epoch + t1
        t2 = pd.to_timedelta(df["Column12"], unit='d')
        df["Column12"] = epoch + t2
        for k in range(df.shape[1]):
            col = df.iloc[:, k]
            if col.dtype == np.int64:
                df.iloc[:, k] = df.iloc[:, k].astype(np.float64)
            elif col.dtype == np.dtype('O'):
                if PY2:
                    f = lambda x: (x.decode('utf-8')
                                   if isinstance(x, str) else x)
                    df.iloc[:, k] = df.iloc[:, k].apply(f)
        self.data.append(df)
def test_parse_public_s3_bucket_chunked(self):
    # Read with a chunksize
    chunksize = 5
    local_tips = read_csv(tm.get_data_path('tips.csv'))
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        if comp == 'bz2' and compat.PY2:
            # The Python 2 C parser can't read bz2 from S3.
            self.assertRaises(ValueError, read_csv,
                              's3://pandas-test/tips.csv' + ext,
                              compression=comp)
        else:
            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                 chunksize=chunksize, compression=comp)
            self.assertEqual(df_reader.chunksize, chunksize)
            for i_chunk in [0, 1, 2]:
                # Read a couple of chunks and make sure we see them
                # properly.
                df = df_reader.get_chunk()
                self.assertTrue(isinstance(df, DataFrame))
                self.assertFalse(df.empty)
                true_df = local_tips.iloc[
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                tm.assert_frame_equal(true_df, df)
def read_pickles(self, version): if not is_little_endian(): raise nose.SkipTest("known failure on non-little endian") pth = tm.get_data_path('legacy_pickle/{0}'.format(str(version))) n = 0 for f in os.listdir(pth): vf = os.path.join(pth, f) data = self.compare(vf) if data is None: continue if 'series' in data: if 'ts' in data['series']: self._validate_timeseries(data['series']['ts'], self.data['series']['ts']) self._validate_frequency(data['series']['ts']) if 'index' in data: if 'period' in data['index']: self._validate_periodindex(data['index']['period'], self.data['index']['period']) n += 1 assert n > 0, 'Pickle files are not tested'