def test_get_csv_with_a_cache_does_not_reload_file(source_file2: SourceFile):
    """A second cached load must use the cached frame, not re-read the CSV.

    The first call runs with ``cache=True``. Both ``_retrieve_cached_df``
    and ``_read_csv_as_df`` are then replaced by mocks so the second call
    reveals which of the two it goes through.
    """
    # First load, with caching requested.
    source_file2.get_csv_as_df(apply_dtypes=False, cache=True)

    # Swap in spies only after the first load, so only the second call
    # is recorded.
    retrieve_spy = MagicMock()
    read_spy = MagicMock()
    source_file2._retrieve_cached_df = retrieve_spy
    source_file2._read_csv_as_df = read_spy

    source_file2.get_csv_as_df(apply_dtypes=False, cache=True)

    retrieve_spy.assert_called_once()
    read_spy.assert_not_called()
def test_modifying_returned_df_does_not_affect_cached_df(
        source_file2: SourceFile):
    """In-place edits on a returned frame must not show up on the next load.

    ``column_a`` is dropped in place from the first returned DataFrame; a
    subsequent ``get_csv_as_df`` call with ``cache=True`` must still
    return a frame that contains ``column_a``.
    """
    first_df = source_file2.get_csv_as_df(apply_dtypes=False, cache=True)
    first_df.drop(labels='column_a', axis=1, inplace=True)
    # Sanity check: the in-place drop really changed the returned frame.
    assert 'column_a' not in first_df.columns

    second_df = source_file2.get_csv_as_df(apply_dtypes=False, cache=True)
    assert 'column_a' in second_df.columns
def test_force_reload_ignores_cache(source_file2: SourceFile):
    """With ``force_reload=True`` the CSV is read again, not the cache.

    After an initial load with ``cache=True``, both
    ``_retrieve_cached_df`` and ``_read_csv_as_df`` are replaced by mocks;
    the forced reload must call only the reader.
    """
    # Initial load, with caching requested.
    source_file2.get_csv_as_df(apply_dtypes=False, cache=True)

    retrieve_spy = MagicMock()
    read_spy = MagicMock()
    source_file2._retrieve_cached_df = retrieve_spy
    source_file2._read_csv_as_df = read_spy

    source_file2.get_csv_as_df(
        apply_dtypes=False,
        cache=True,
        force_reload=True,
    )

    retrieve_spy.assert_not_called()
    read_spy.assert_called_once()
def test_source_file2_has_config_dtypes(source_file2: SourceFile):
    """Loading with ``apply_dtypes=True`` yields the expected column dtypes."""
    expected_dtypes = {
        'column_a': dtype('O'),
        'column_b': pd.Int64Dtype(),
        'column_c': dtype('<M8[ns]'),
        'column_d': dtype('float64'),
    }

    loaded = source_file2.get_csv_as_df(apply_dtypes=True)
    actual_dtypes = loaded.dtypes.to_dict()

    assert actual_dtypes == expected_dtypes
def test_source_file2_apply_dtypes_manually(source_file2: SourceFile):
    """``apply_dtypes`` converts an all-object frame to the expected dtypes.

    The frame is first loaded with ``apply_dtypes=False`` and checked to
    have four object-typed columns, then converted explicitly with
    ``errors='raise'``.
    """
    raw_df = source_file2.get_csv_as_df(apply_dtypes=False)
    object_dtype = dtype('O')
    assert list(raw_df.dtypes) == [object_dtype] * 4

    typed_df = source_file2.apply_dtypes(raw_df, errors='raise')

    expected_dtypes = {
        'column_a': dtype('O'),
        'column_b': pd.Int64Dtype(),
        'column_c': dtype('<M8[ns]'),
        'column_d': dtype('float64'),
    }
    assert typed_df.dtypes.to_dict() == expected_dtypes
def test_source_file2_has_partial_dtypes(
        source_file2_partial_dtypes: SourceFile):
    """
    When dtypes are provided for only some of the columns, those dtypes
    are applied and every remaining column keeps the 'object' dtype.
    """
    expected_dtypes = {
        'column_a': dtype('O'),
        'column_b': pd.Int64Dtype(),
        'column_c': dtype('<M8[ns]'),
        # Per the docstring above, a column without a provided dtype
        # stays object-typed.
        'column_d': dtype('O'),
    }

    loaded = source_file2_partial_dtypes.get_csv_as_df(apply_dtypes=True)
    actual_dtypes = loaded.dtypes.to_dict()

    assert actual_dtypes == expected_dtypes
def test_source_file2_has_only_object_dtypes(source_file2: SourceFile):
    """With apply_dtypes=False, each of the four columns is object-typed."""
    loaded = source_file2.get_csv_as_df(apply_dtypes=False)
    column_dtypes = list(loaded.dtypes)
    object_dtype = dtype('O')
    assert column_dtypes == [object_dtype] * 4
def source_file1_df(source_data_test_dir: Path) -> pd.DataFrame:
    """Load ``test_dir1/source_file1.csv`` as a DataFrame, dtypes not applied.

    The file is wrapped in a ``SourceFile`` built with comma-delimited
    file parameters and read with ``apply_dtypes=False``.
    """
    csv_path = source_data_test_dir / 'test_dir1' / 'source_file1.csv'
    source = SourceFile(csv_path, get_file_params(delimiter=','))
    return source.get_csv_as_df(apply_dtypes=False)
def test_invalid_kwarg_raises_error(source_file2: SourceFile):
    """An unknown keyword argument to ``get_csv_as_df`` raises ``TypeError``.

    The error message must name the offending keyword, ``bad_kwarg``.
    """
    with pytest.raises(TypeError) as exc_info:
        source_file2.get_csv_as_df(apply_dtypes=False, bad_kwarg=42)

    # Checked after the ``with`` block: code placed after the raising call
    # inside the block would never execute.
    error_text = str(exc_info.value)
    assert "unexpected keyword argument 'bad_kwarg'" in error_text
def test_setting_cached_df_manually(source_file2: SourceFile):
    """A frame stored through ``cache_df`` is what the next load returns.

    A frame loaded with ``cache=False`` has ``column_a`` dropped and is
    then handed to ``cache_df``; the following ``get_csv_as_df`` call
    (again with ``cache=False``) must return a frame without ``column_a``.
    """
    modified_df = source_file2.get_csv_as_df(apply_dtypes=False, cache=False)
    modified_df.drop(labels='column_a', axis=1, inplace=True)
    source_file2.cache_df(modified_df)

    reloaded_df = source_file2.get_csv_as_df(apply_dtypes=False, cache=False)
    assert 'column_a' not in reloaded_df.columns
def test_cache_method_is_called(source_file2: SourceFile):
    """Loading with ``cache=True`` calls ``_cache_df_copy`` exactly once."""
    cache_spy = MagicMock()
    source_file2._cache_df_copy = cache_spy

    source_file2.get_csv_as_df(apply_dtypes=False, cache=True)

    cache_spy.assert_called_once()
def test_get_csv_as_df_requires_encoding(source_file2: SourceFile):
    """Without an ``encoding`` entry in ``_params``, loading raises ValueError.

    The error message must mention ``encoding``.
    """
    # Remove the parameter under test before attempting the load.
    del source_file2._params['encoding']

    with pytest.raises(ValueError) as exc_info:
        source_file2.get_csv_as_df(apply_dtypes=False)

    error_text = str(exc_info.value)
    assert "encoding" in error_text