def s3_base(worker_id):
    """
    Fixture for mocking S3 interaction.

    Sets up moto server in separate process locally
    Return url for motoserver/moto CI service
    """
    pytest.importorskip("s3fs")
    pytest.importorskip("boto3")

    with tm.ensure_safe_environment_variables():
        # temporary workaround as moto fails for botocore >= 1.11 otherwise,
        # see https://github.com/spulec/moto/issues/1924 & 1952
        # setdefault: do not clobber real credentials if already set
        os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
        os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
        if is_ci_environment():
            if is_platform_arm() or is_platform_mac() or is_platform_windows():
                # NOT RUN on Windows/MacOS/ARM, only Ubuntu
                # - subprocess in CI can cause timeouts
                # - Github Actions do not support
                # container services for the above OSs
                # - CircleCI will probably hit the Docker rate pull limit
                pytest.skip(
                    "S3 tests do not have a corresponding service in "
                    "Windows, MacOS or ARM platforms"
                )
            else:
                # on CI a moto container service is assumed to be listening here
                yield "http://localhost:5000"
        else:
            requests = pytest.importorskip("requests")
            pytest.importorskip("moto", minversion="1.3.14")
            pytest.importorskip("flask")  # server mode needs flask too

            # Launching moto in server mode, i.e., as a separate process
            # with an S3 endpoint on localhost

            # derive a unique port per pytest-xdist worker ("gw0", "gw1", ...)
            # so parallel workers do not collide; "master" means no xdist
            worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw")
            endpoint_port = f"555{worker_id}"
            endpoint_uri = f"http://127.0.0.1:{endpoint_port}/"

            # pipe to null to avoid logging in terminal
            with subprocess.Popen(
                shlex.split(f"moto_server s3 -p {endpoint_port}"),
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            ) as proc:
                # poll for up to ~5 seconds until the server accepts connections
                timeout = 5
                while timeout > 0:
                    try:
                        # OK to go once server is accepting connections
                        r = requests.get(endpoint_uri)
                        if r.ok:
                            break
                    except Exception:
                        # connection refused while the server is still starting
                        pass
                    timeout -= 0.1
                    time.sleep(0.1)

                yield endpoint_uri
                # teardown: stop the moto server once the consuming test is done
                proc.terminate()
def s3so(worker_id):
    """
    Build the ``storage_options`` dict pointing at the local/CI moto S3 endpoint.

    On CI the shared moto service on port 5000 is used; otherwise the port is
    derived from the pytest-xdist ``worker_id`` (``555<n>``), matching the
    port scheme used by the ``s3_base`` fixture.
    """
    if is_ci_environment():
        endpoint = "http://localhost:5000/"
    else:
        # "master" means xdist is not in use; otherwise ids look like "gw0"
        if worker_id == "master":
            worker_id = "5"
        else:
            worker_id = worker_id.lstrip("gw")
        endpoint = f"http://127.0.0.1:555{worker_id}/"
    return {"client_kwargs": {"endpoint_url": endpoint}}
def test_close_file_handle_on_invalid_usecols(all_parsers):
    # GH 45384: when usecols validation fails, the parser must release its
    # file handle so the file can be unlinked afterwards (Windows would fail
    # the unlink if a handle were still open).
    parser = all_parsers

    if parser.engine == "pyarrow":
        pyarrow = pytest.importorskip("pyarrow")
        expected_error = pyarrow.lib.ArrowKeyError
    else:
        expected_error = ValueError

    if is_ci_environment() and (is_platform_windows() or is_platform_mac()):
        # GH#45547 causes timeouts on windows/mac builds
        pytest.skip("GH#45547 causing timeouts on windows/mac builds 2022-01-22")

    with tm.ensure_clean("test.csv") as fname:
        Path(fname).write_text("col1,col2\na,b\n1,2")
        with tm.assert_produces_warning(False), pytest.raises(
            expected_error, match="col3"
        ):
            parser.read_csv(fname, usecols=["col1", "col2", "col3"])
        # unlink fails on windows if file handles still point to it
        os.unlink(fname)
def test_unsuported_compression(parser):
    # An unrecognized compression keyword must raise ValueError; the temp
    # path is never actually read.
    with pytest.raises(ValueError, match="Unrecognized compression type"):
        with tm.ensure_clean() as path:
            read_xml(path, parser=parser, compression="7z")


# STORAGE OPTIONS


@pytest.mark.network
@td.skip_if_no("s3fs")
@td.skip_if_no("lxml")
@pytest.mark.skipif(
    is_ci_environment(),
    reason="2022.1.17: Hanging on the CI min versions build.",
)
@tm.network
def test_s3_parser_consistency():
    # Python Software Foundation (2019 IRS-990 RETURN)
    s3 = "s3://irs-form-990/201923199349319487_public.xml"

    # anonymous read from a public S3 bucket (network + s3fs required)
    df_lxml = read_xml(
        s3,
        xpath=".//irs:Form990PartVIISectionAGrp",
        namespaces={"irs": "http://www.irs.gov/efile"},
        parser="lxml",
        storage_options={"anon": True},
    )
# Unfortunately, Python's CSV library can't handle # tarfile objects (expects string, not bytes when # iterating through a file-like). parser = c_parser_only tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix) with tarfile.open(tar_path, "r") as tar: data_file = tar.extractfile("tar_data.csv") out = parser.read_csv(data_file) expected = DataFrame({"a": [1]}) tm.assert_frame_equal(out, expected) @pytest.mark.single_cpu @pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.") def test_bytes_exceed_2gb(c_parser_only): # see gh-16798 # # Read from a "CSV" that has a column larger than 2GB. parser = c_parser_only if parser.low_memory: pytest.skip("not a low_memory test") csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) df = parser.read_csv(csv) assert not df.empty
from pandas.compat import (
    is_ci_environment,
    is_platform_mac,
    is_platform_windows,
)
import pandas.util._test_decorators as td

from pandas import (
    DataFrame,
    Series,
)
import pandas._testing as tm

# TODO(GH#44584): Mark these as pytest.mark.single_cpu
# Module-wide skip: these tests crash/time out on GHA Windows and MacOS.
pytestmark = pytest.mark.skipif(
    is_ci_environment() and (is_platform_windows() or is_platform_mac()),
    reason="On GHA CI, Windows can fail with "
    "'Windows fatal exception: stack overflow' "
    "and MacOS can timeout",
)


@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
class TestEWM:
    def test_invalid_update(self):
        # Calling update-related behavior on a freshly created online EWM
        # (no mean computed yet) is expected to raise.
        df = DataFrame({"a": range(5), "b": range(5)})
        online_ewm = df.head(2).ewm(0.5).online()
        with pytest.raises(
            ValueError,
            match=
class TestS3:
    """
    Integration-style tests reading/writing S3 URLs via ``read_csv`` &
    friends, using the ``s3so`` storage-options fixture (moto-backed
    endpoint) and the ``s3_resource``/``tips_df`` fixtures.
    """

    @td.skip_if_no("s3fs")
    def test_parse_public_s3_bucket(self, tips_df, s3so):
        # more of an integration test due to the not-public contents portion
        # can probably mock this though.
        # Exercise plain, gzip and bz2 variants of the same file.
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                "s3://pandas-test/tips.csv" + ext,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

        # Read public file from bucket with not-public contents
        df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3so)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3n_bucket(self, tips_df, s3so):
        # Read from AWS s3 as "s3n" URL
        df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3a_bucket(self, tips_df, s3so):
        # Read from AWS s3 as "s3a" URL
        df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_nrows(self, tips_df, s3so):
        # nrows should limit the result regardless of compression
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                "s3://pandas-test/tips.csv" + ext,
                nrows=10,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_chunked(self, tips_df, s3so):
        # Read with a chunksize
        chunksize = 5
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            with read_csv(
                "s3://pandas-test/tips.csv" + ext,
                chunksize=chunksize,
                compression=comp,
                storage_options=s3so,
            ) as df_reader:
                assert df_reader.chunksize == chunksize
                for i_chunk in [0, 1, 2]:
                    # Read a couple of chunks and make sure we see them
                    # properly.
                    df = df_reader.get_chunk()
                    assert isinstance(df, DataFrame)
                    assert not df.empty
                    true_df = tips_df.iloc[
                        chunksize * i_chunk : chunksize * (i_chunk + 1)
                    ]
                    tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so):
        # Read with a chunksize using the Python parser
        chunksize = 5
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            with read_csv(
                "s3://pandas-test/tips.csv" + ext,
                chunksize=chunksize,
                compression=comp,
                engine="python",
                storage_options=s3so,
            ) as df_reader:
                assert df_reader.chunksize == chunksize
                for i_chunk in [0, 1, 2]:
                    # Read a couple of chunks and make sure we see them properly.
                    df = df_reader.get_chunk()
                    assert isinstance(df, DataFrame)
                    assert not df.empty
                    true_df = tips_df.iloc[
                        chunksize * i_chunk : chunksize * (i_chunk + 1)
                    ]
                    tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_python(self, tips_df, s3so):
        # Same as test_parse_public_s3_bucket but with the Python engine
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                "s3://pandas-test/tips.csv" + ext,
                engine="python",
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_infer_s3_compression(self, tips_df, s3so):
        # compression="infer" should pick the codec from the file extension
        for ext in ["", ".gz", ".bz2"]:
            df = read_csv(
                "s3://pandas-test/tips.csv" + ext,
                engine="python",
                compression="infer",
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so):
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                "s3://pandas-test/tips.csv" + ext,
                engine="python",
                nrows=10,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_read_s3_fails(self, s3so):
        # Reading from a nonexistent bucket surfaces as OSError
        msg = "The specified bucket does not exist"
        with pytest.raises(OSError, match=msg):
            read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)

        # Receive a permission error when trying to read a private bucket.
        # It's irrelevant here that this isn't actually a table.
        with pytest.raises(OSError, match=msg):
            read_csv("s3://cant_get_it/file.csv")

    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
    def test_write_s3_csv_fails(self, tips_df, s3so):
        # GH 32486
        # Attempting to write to an invalid S3 path should raise
        import botocore

        # GH 34087
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
        # Catch a ClientError since AWS Service Errors are defined dynamically
        error = (FileNotFoundError, botocore.exceptions.ClientError)
        with pytest.raises(error, match="The specified bucket does not exist"):
            tips_df.to_csv(
                "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so
            )

    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
    @td.skip_if_no("pyarrow")
    def test_write_s3_parquet_fails(self, tips_df, s3so):
        # GH 27679
        # Attempting to write to an invalid S3 path should raise
        import botocore

        # GH 34087
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
        # Catch a ClientError since AWS Service Errors are defined dynamically
        error = (FileNotFoundError, botocore.exceptions.ClientError)
        with pytest.raises(error, match="The specified bucket does not exist"):
            tips_df.to_parquet(
                "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet",
                storage_options=s3so,
            )

    @pytest.mark.single_cpu
    def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
        # see gh-16135
        # read_csv should accept the body of a boto3 S3 object wrapped in BytesIO
        s3_object = s3_resource.meta.client.get_object(
            Bucket="pandas-test", Key="tips.csv"
        )

        with BytesIO(s3_object["Body"].read()) as buffer:
            result = read_csv(buffer, encoding="utf8")
        assert isinstance(result, DataFrame)
        assert not result.empty

        expected = read_csv(tips_file)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.single_cpu
    @pytest.mark.skipif(
        is_ci_environment(),
        reason="This test can hang in our CI min_versions build "
        "and leads to '##[error]The runner has "
        "received a shutdown signal...' in GHA. GH: 45651",
    )
    def test_read_csv_chunked_download(self, s3_resource, caplog, s3so):
        # 8 MB, S3FS uses 5MB chunks
        import s3fs

        df = DataFrame(np.random.randn(100000, 4), columns=list("abcd"))
        str_buf = StringIO()

        df.to_csv(str_buf)

        buf = BytesIO(str_buf.getvalue().encode("utf-8"))

        s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf)

        # Possibly some state leaking in between tests.
        # If we don't clear this cache, we saw `GetObject operation: Forbidden`.
        # Presumably the s3fs instance is being cached, with the directory listing
        # from *before* we add the large-file.csv in the pandas-test bucket.
        s3fs.S3FileSystem.clear_instance_cache()

        with caplog.at_level(logging.DEBUG, logger="s3fs"):
            read_csv("s3://pandas-test/large-file.csv", nrows=5, storage_options=s3so)
            # log of fetch_range (start, stop): only the first chunk should be
            # fetched, not the whole 8 MB file
            assert (0, 5505024) in (x.args[-2:] for x in caplog.records)

    def test_read_s3_with_hash_in_key(self, tips_df, s3so):
        # GH 25945
        # '#' in the object key must not be treated as a URL fragment
        result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so)
        tm.assert_frame_equal(tips_df, result)

    @td.skip_if_no("pyarrow")
    def test_read_feather_s3_file_path(self, feather_file, s3so):
        # GH 29055
        expected = read_feather(feather_file)
        res = read_feather(
            "s3://pandas-test/simple_dataset.feather", storage_options=s3so
        )
        tm.assert_frame_equal(expected, res)
# No non-mapping subtypes, class with pytest.raises(TypeError, match=msg): com.standardize_mapping(list) fill = {"bad": "data"} assert com.standardize_mapping(fill) == dict # Convert instance to type assert com.standardize_mapping({}) == dict dd = collections.defaultdict(list) assert isinstance(com.standardize_mapping(dd), partial) @pytest.mark.xfail(is_ci_environment() and not IS64, reason="Failing on 32 bit Python CI job") def test_git_version(): # GH 21295 git_version = pd.__git_version__ assert len(git_version) == 40 assert all(c in string.hexdigits for c in git_version) def test_version_tag(): version = Version(pd.__version__) try: version > Version("0.0.1") except TypeError: raise ValueError( "No git tags exist, please sync tags between upstream and your repo"
stdout = capsys.readouterr().out # check valid json is printed to the console if as_json is True result = json.loads(stdout) # Basic check that each version element is found in output expected = { "system": _get_sys_info(), "dependencies": _get_dependency_info(), } assert result == expected @pytest.mark.xfail( is_ci_environment() and not IS64, reason="Failing on 32 bit Python CI job" ) def test_show_versions_console(capsys): # gh-32041 # gh-32041 pd.show_versions(as_json=False) result = capsys.readouterr().out # check header assert "INSTALLED VERSIONS" in result # check full commit hash assert re.search(r"commit\s*:\s[0-9a-f]{40}\n", result) # check required dependency # 2020-12-09 npdev has "dirty" in the tag
    @pytest.mark.xfail(
        reason="ufunc 'invert' not supported for the input types")
    def test_construct_empty_dataframe(self, dtype):
        # delegate to the shared base-class test; xfail documents the known gap
        super().test_construct_empty_dataframe(dtype)

    @pytest.mark.xfail(reason="_from_sequence ignores dtype keyword")
    def test_empty(self, dtype):
        super().test_empty(dtype)


class TestReduce(base.BaseNoReduceTests):
    def test_reduce_series_boolean(self):
        # intentionally disabled: boolean reductions are covered separately
        # by TestReduceBoolean below
        pass


@pytest.mark.skipif(
    is_ci_environment() and is_platform_windows(),
    reason="Causes stack overflow on Windows CI",
)
class TestReduceBoolean(base.BaseBooleanReduceTests):
    pass


def test_is_bool_dtype(data):
    # the extension array must be usable directly as a boolean indexer
    assert is_bool_dtype(data)
    assert pd.core.common.is_bool_indexer(data)

    s = pd.Series(range(len(data)))
    result = s[data]
    expected = s[np.asarray(data)]
    tm.assert_series_equal(result, expected)
class TestSafeSort:
    """
    Tests for ``safe_sort``: sorting possibly-mixed values, optionally
    remapping an accompanying ``codes`` array to the sorted order.
    """

    @pytest.mark.parametrize(
        "arg, exp",
        [
            [[3, 1, 2, 0, 4], [0, 1, 2, 3, 4]],
            [list("baaacb"), np.array(list("aaabbc"), dtype=object)],
            [[], []],
        ],
    )
    def test_basic_sort(self, arg, exp):
        result = safe_sort(arg)
        expected = np.array(exp)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("verify", [True, False])
    @pytest.mark.parametrize(
        "codes, exp_codes, na_sentinel",
        [
            [[0, 1, 1, 2, 3, 0, -1, 4], [3, 1, 1, 2, 0, 3, -1, 4], -1],
            [[0, 1, 1, 2, 3, 0, 99, 4], [3, 1, 1, 2, 0, 3, 99, 4], 99],
            [[], [], -1],
        ],
    )
    def test_codes(self, verify, codes, exp_codes, na_sentinel):
        # codes referencing positions in `values` are remapped to positions
        # in the sorted result; the sentinel value passes through unchanged
        values = [3, 1, 2, 0, 4]
        expected = np.array([0, 1, 2, 3, 4])

        result, result_codes = safe_sort(values, codes,
                                         na_sentinel=na_sentinel,
                                         verify=verify)
        expected_codes = np.array(exp_codes, dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_codes, expected_codes)

    @pytest.mark.skipif(
        is_platform_windows() and is_ci_environment(),
        reason="In CI environment can crash thread with: "
        "Windows fatal exception: access violation",
    )
    @pytest.mark.parametrize("na_sentinel", [-1, 99])
    def test_codes_out_of_bound(self, na_sentinel):
        # codes that point outside `values` are replaced with the sentinel
        values = [3, 1, 2, 0, 4]
        expected = np.array([0, 1, 2, 3, 4])

        # out of bound indices
        codes = [0, 101, 102, 2, 3, 0, 99, 4]
        result, result_codes = safe_sort(values, codes, na_sentinel=na_sentinel)
        expected_codes = np.array(
            [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_codes, expected_codes)

    @pytest.mark.parametrize("box", [lambda x: np.array(x, dtype=object), list])
    def test_mixed_integer(self, box):
        # mixed int/str input still sorts deterministically (ints before strs)
        values = box(["b", 1, 0, "a", 0, "b"])
        result = safe_sort(values)
        expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object)
        tm.assert_numpy_array_equal(result, expected)

    def test_mixed_integer_with_codes(self):
        values = np.array(["b", 1, 0, "a"], dtype=object)
        codes = [0, 1, 2, 3, 0, -1, 1]
        result, result_codes = safe_sort(values, codes)
        expected = np.array([0, 1, "a", "b"], dtype=object)
        expected_codes = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_codes, expected_codes)

    def test_unsortable(self):
        # GH 13714
        # datetime does not compare with int -> TypeError must propagate
        arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object)
        msg = "'[<>]' not supported between instances of .*"
        with pytest.raises(TypeError, match=msg):
            safe_sort(arr)

    @pytest.mark.parametrize(
        "arg, codes, err, msg",
        [
            [1, None, TypeError, "Only list-like objects are allowed"],
            [[0, 1, 2], 1, TypeError, "Only list-like objects or None"],
            [[0, 1, 2, 1], [0, 1], ValueError, "values should be unique"],
        ],
    )
    def test_exceptions(self, arg, codes, err, msg):
        # invalid argument types / duplicate values raise with clear messages
        with pytest.raises(err, match=msg):
            safe_sort(values=arg, codes=codes)

    @pytest.mark.parametrize(
        "arg, exp", [[[1, 3, 2], [1, 2, 3]], [[1, 3, np.nan, 2], [1, 2, 3, np.nan]]])
    def test_extension_array(self, arg, exp):
        # Int64 extension arrays sort with NaN placed last
        a = array(arg, dtype="Int64")
        result = safe_sort(a)
        expected = array(exp, dtype="Int64")
        tm.assert_extension_array_equal(result, expected)

    @pytest.mark.parametrize("verify", [True, False])
    @pytest.mark.parametrize("na_sentinel", [-1, 99])
    def test_extension_array_codes(self, verify, na_sentinel):
        a = array([1, 3, 2], dtype="Int64")
        result, codes = safe_sort(a, [0, 1, na_sentinel, 2],
                                  na_sentinel=na_sentinel, verify=verify)
        expected_values = array([1, 2, 3], dtype="Int64")
        expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
        tm.assert_extension_array_equal(result, expected_values)
        tm.assert_numpy_array_equal(codes, expected_codes)