def categoricaldf_strategy():
    """Return a strategy for small data frames of categorical-style data.

    The generated frames have a ``names`` column sampled from the externally
    defined ``names`` collection and a ``numbers`` column sampled from
    ``range(3)``.  The index is a range index with 1 to 20 rows.
    """
    name_column = column("names", st.sampled_from(names))
    number_column = column("numbers", st.sampled_from(range(3)))
    row_index = range_indexes(min_size=1, max_size=20)
    return data_frames(columns=[name_column, number_column], index=row_index)
def test_uniqueness_does_not_affect_other_rows_2():
    """A unique column must not force distinct values in a non-unique one.

    Frames have exactly two rows; column ``B`` is unique while ``A`` is not,
    so an example where both ``A`` values coincide must be findable.
    """
    column_specs = [
        pdst.column('A', dtype=int, unique=False),
        pdst.column('B', dtype=int, unique=True),
    ]
    row_strategy = st.tuples(st.integers(0, 10), st.integers(0, 10))
    frames = pdst.data_frames(
        column_specs,
        rows=row_strategy,
        index=pdst.range_indexes(2, 2),
    )

    def first_two_a_values_match(frame):
        return frame['A'][0] == frame['A'][1]

    find_any(frames, first_two_a_values_match)
def nulldf_strategy():
    """Return a strategy for data frames dominated by missing values.

    Column ``"1"`` holds arbitrary floats (NaN and infinities allowed);
    columns ``"2"`` and ``"3"`` are always NaN.  The index is a range index
    with 3 to 20 rows.
    """
    any_float = st.floats(allow_nan=True, allow_infinity=True)
    column_specs = [column("1", any_float)]
    # The two remaining columns can only ever contain NaN.
    column_specs.extend(
        column(label, st.sampled_from([np.nan])) for label in ("2", "3")
    )
    return data_frames(
        columns=column_specs,
        index=range_indexes(min_size=3, max_size=20),
    )
def datasets(draw: Callable[[st.SearchStrategy], Any]) -> pd.DataFrame:
    """Generate a dataset of MOER values.

    Draws a frame with a ``timestamp`` column of datetimes and a ``MOER``
    column of floats, then renders every timestamp through
    ``padded_strftime``.

    :param draw: callable used to draw a value from a strategy.
    :returns: the drawn frame with its ``timestamp`` column formatted.
    """
    frame = draw(
        hpd.data_frames(
            (
                hpd.column("timestamp", st.datetimes()),
                hpd.column("MOER", st.floats()),
            )
        )
    )
    # Replace the column by label instead of assigning through
    # ``iloc[:, 0]``: positional assignment writes into the existing
    # datetime column in place on recent pandas, which may coerce the
    # formatted values back to datetimes instead of keeping them as
    # produced by ``padded_strftime``.
    frame["timestamp"] = frame["timestamp"].apply(padded_strftime)
    # No debug printing here: this runs for every generated example.
    return frame
def test_uniqueness_does_not_affect_other_rows_2():
    """A unique column must not force distinct values in a non-unique one.

    Frames have exactly two rows; ``B`` is a unique integer column while the
    boolean column ``A`` is not, so an example with equal ``A`` values must
    be findable.
    """
    column_specs = [
        pdst.column("A", dtype=bool, unique=False),
        pdst.column("B", dtype=int, unique=True),
    ]
    row_strategy = st.tuples(st.booleans(), st.integers(0, 10))
    frames = pdst.data_frames(
        column_specs,
        rows=row_strategy,
        index=pdst.range_indexes(2, 2),
    )

    def first_two_a_values_match(frame):
        return frame["A"][0] == frame["A"][1]

    find_any(frames, first_two_a_values_match)
class TestMain(unittest.TestCase):
    """Property-based test of ``sum`` with ``pandas.read_csv`` patched out."""

    # ``given`` is the outermost decorator, so ``patch`` is re-entered (and a
    # fresh mock created) for every generated example.
    @given(
        df=data_frames(
            [column(label, dtype='float') for label in ('1', '2')]
        )
    )
    @patch("pandas.read_csv")
    def test_sum_basic(self, read_csv_mock: Mock, df):
        """``sum`` reads the CSV once and returns a third float column."""
        read_csv_mock.return_value = df

        results = sum()

        read_csv_mock.assert_called_once()
        column_count = len(results.columns)
        self.assertEqual(column_count, 3)
        self.assertEqual(results['3'].dtype, 'float')
def test_pandas_vertex_creation_noproperty(self):
    """Create vertices from a generated frame with no vertex properties.

    The frame has a unique ``src`` column sampled from the externally
    defined ``names`` and an ``age`` column of integers in [20, 30].
    """
    src_column = column(
        name='src', elements=st.sampled_from(names), unique=True
    )
    age_column = column(
        name='age',
        elements=st.integers(min_value=20, max_value=30),
        unique=False,
    )
    dataframe = data_frames(columns=[src_column, age_column]).example()
    g = TinkerFactory().addV_from_pandas(
        dataframe, src='src', v_properties=[]
    )
def hypot_df_generator():
    """Return a data-frame strategy mirroring selected columns of ``raw_()``.

    For each listed question column, generated values are sampled from the
    distinct values found in that column of the frame returned by ``raw_()``.
    """
    source = raw_()
    question_ids = ('Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q8', 'Q9')
    column_specs = [
        column(
            question,
            elements=strategies.sampled_from(source[question].unique()),
        )
        for question in question_ids
    ]
    return data_frames(columns=column_specs)
def column_strategy(
    pandas_dtype: PandasDtype,
    strategy: Optional[SearchStrategy] = None,
    *,
    checks: Optional[Sequence] = None,
    allow_duplicates: Optional[bool] = True,
    name: Optional[str] = None,
):
    # pylint: disable=line-too-long
    """Build the column description used when generating a DataFrame.

    :param pandas_dtype: :class:`pandera.dtypes.PandasDtype` instance.
    :param strategy: optional hypothesis strategy; when given, the pandas
        dtype strategy is chained onto it.
    :param checks: sequence of :class:`~pandera.checks.Check` s constraining
        the values in the column/index.
    :param allow_duplicates: whether the generated Series may contain
        duplicates.
    :param name: name of the Series.
    :returns: a `column <https://hypothesis.readthedocs.io/en/latest/numpy.html#hypothesis.extra.pandas.column>`_
        object.
    """
    verify_pandas_dtype(pandas_dtype, schema_type="column", name=name)
    column_kwargs = {
        "name": name,
        "elements": field_element_strategy(
            pandas_dtype, strategy, checks=checks
        ),
        "dtype": pandas_dtype.numpy_dtype,
        # Uniqueness is simply the negation of "duplicates allowed".
        "unique": not allow_duplicates,
    }
    return pdst.column(**column_kwargs)
def null_dataframe_masks(
    draw,
    strategy: Optional[SearchStrategy],
    nullable_columns: Dict[str, bool],
):
    """Strategy that masks values of a drawn pandas DataFrame.

    :param strategy: hypothesis strategy the DataFrame is drawn from.
    :param nullable_columns: mapping of column name to whether that column
        is nullable.
    :returns: the drawn DataFrame with the drawn boolean mask applied.
    """
    val = draw(strategy)
    n_rows = val.shape[0]

    def mask_column(col_name, is_nullable):
        # Non-nullable columns get a mask that is always False.
        return pdst.column(
            name=col_name,
            elements=st.booleans() if is_nullable else st.just(False),
            dtype=bool,
            fill=st.just(False),
        )

    mask_st = pdst.data_frames(
        columns=[
            mask_column(col_name, is_nullable)
            for col_name, is_nullable in nullable_columns.items()
        ],
        # The mask must have exactly as many rows as the drawn frame.
        index=pdst.range_indexes(min_size=n_rows, max_size=n_rows),
    )
    null_mask = draw(mask_st)
    # assume that there is at least one masked value
    hypothesis.assume(null_mask.any(axis=None))
    return val.mask(null_mask)
def s_column():
    """Return the ``S`` column: finite float64 values in [0, 27000]."""
    finite_range = floats(
        min_value=0,
        max_value=27000,
        allow_infinity=False,
        allow_nan=False,
    )
    return column("S", elements=finite_range, dtype=np.float64)
def column_by_dtypes(
        dtype_group: Optional[str] = "RiptableNumeric") -> List[pdst.column]:
    """Return one column per distinct dtype in a dtype group.

    Each column is named after its dtype and is meant to be used in
    DataFrame strategies that generate primitive data types.

    :param dtype_group: key into ``dtypes_by_group`` selecting the dtypes.
    :returns: list of ``pdst.column`` objects, one per distinct dtype, in
        the order the dtypes first appear in the group.
    :raises KeyError: if ``dtype_group`` is not a key of
        ``dtypes_by_group`` (plain mapping lookup assumed).
    """
    # De-duplicate with ``dict.fromkeys`` rather than ``set``: it keeps the
    # first-seen order, so the column order (and therefore the generated
    # frames) is reproducible across runs instead of following hash order.
    distinct_dtypes = dict.fromkeys(dtypes_by_group[dtype_group])
    return [
        pdst.column(str(dtype), dtype=np.dtype(dtype))
        for dtype in distinct_dtypes
    ]
def test_dayofweek_unknown(data):
    """``ei.day_of_week`` yields ``'Unknown'`` for the generated frames.

    Two kinds of frames are drawn: any hour in [0, 24] with ``DAY_WEEK``
    fixed at 9, and ``HOUR`` fixed at 99 with ``DAY_WEEK`` 2 or 6.
    """
    unknown_day_frames = data_frames(columns=[
        column(name='HOUR',
               elements=st.integers(min_value=0, max_value=24),
               unique=False),
        column(name='DAY_WEEK', elements=st.just(9), unique=False),
    ])
    data1 = data.draw(unknown_day_frames)

    unknown_hour_frames = data_frames(columns=[
        column(name='HOUR', elements=st.just(99), unique=False),
        column(name='DAY_WEEK',
               elements=st.one_of(st.just(2), st.just(6)),
               unique=False),
    ])
    data2 = data.draw(unknown_hour_frames)

    for frame in (data1, data2):
        for label in ei.day_of_week(frame):
            assert label == 'Unknown'
def test_pandas_vertex_creation_noproperty(self):
    """Create vertices from a generated frame, keeping only ``age``.

    The frame has a unique ``src`` column of names, an ``age`` column of
    integers in [20, 30] and a ``lang`` column of language names.
    """
    names = ['andre','renan','diego','caio','victor','bruno']
    languages = ['python','R','java']

    src_column = column(
        name='src', elements=st.sampled_from(names), unique=True
    )
    age_column = column(
        name='age',
        elements=st.integers(min_value=20, max_value=30),
        unique=False,
    )
    lang_column = column(
        name='lang', elements=st.sampled_from(languages), unique=False
    )
    dataframe = data_frames(
        columns=[src_column, age_column, lang_column]
    ).example()
    g = TinkerFactory().addV_from_pandas(
        dataframe, src='src', v_properties=['age']
    )
def df_strategy(allow_nan=True, allow_infinity=True):
    """
    Return a strategy for three-column data frames.

    Rows are ``(item, att1, att2)``: ``item`` and ``att2`` are floats whose
    NaN/infinity content is controlled by the arguments, and ``att1`` is a
    printable string of at most five characters.
    """
    def float_values():
        return st.floats(allow_nan=allow_nan, allow_infinity=allow_infinity)

    column_specs = [
        column(name="item", dtype=float),
        column(name="att1", dtype="object"),
        column(name="att2", dtype=float),
    ]
    row_strategy = st.tuples(
        float_values(),
        st.text(printable, max_size=5),
        float_values(),
    )
    return data_frames(columns=column_specs, rows=row_strategy)
class Test(unittest.TestCase):
    """Property-based smoke tests for ``Math`` and ``transforms``."""

    # Fibonacci sequence on the integers 0 through 10.
    @given(ST.integers(0, 10))
    def test_00_fib_first_10(self, n):
        Math.fib(n)

    # Fibonacci sequence on negative numbers, which breaks one of the
    # function's assumptions (no negative values).
    @given(ST.integers(max_value=-1))
    def test_01_fib_negative_values(self, n):
        Math.fib(n)

    # BetaCoefficient functionality.
    @given(
        n=ST.integers(),
        Y=ST.lists(ST.floats()),
        X1=ST.lists(ST.floats()),
    )
    def test_03_BetaCoefficient(self, n, Y, X1):
        beta = Math.GetBetaCoefficient(Y, X1)
        print(beta)

    # Unbounded plane, run with 1000 different examples.
    @settings(max_examples=1000)
    @given(
        coord1=ST.tuples(ST.floats(), ST.floats()),
        coord2=ST.tuples(ST.floats(), ST.floats()),
    )
    def test_04_EuclideanDistance_unbound(self, coord1, coord2):
        Math.EuclideanDistance(coord1, coord2)

    # Simple pandas transpose.
    @given(data_frames([column('a', dtype=int), column('b', dtype=int)]))
    def test_05_transpose(self, df):
        transforms.transpose(df)

    # Creation of a geographic distance matrix: a higher-order function
    # building on the euclidean distance.
    @given(
        data_frames([
            column('lat', dtype=float),
            column('lon', dtype=float),
        ])
    )
    def test_06_DistanceMatrixGeneration(self, df):
        # Give every row a sequential store id before building the matrix.
        df['store_id'] = list(range(len(df)))
        transforms.CreateDistanceMatrix(df)
def column_arrays(draw) -> List[Union[np.ndarray, rt.FastArray]]:
    """Return a single-element list holding a column built from a drawn
    array, for use in DataFrame strategies.

    NOTE(review): the returned list contains ``pdst.column`` objects while
    the annotation names array types, and the drawn array itself is passed
    as ``elements`` -- confirm both are intended.
    """
    # todo add strategy to generate FastArray to the return list
    array_strategy = generate_array(
        shape=ndarray_shape_strategy(),
        dtype=ints_or_floats_dtypes(),
        include_invalid=False,
    )
    arr = draw(array_strategy)
    array_column = pdst.column(name(arr), elements=arr)
    return [array_column]
def df_strategy():
    """
    Convenience function that builds a dataframe hypothesis strategy.

    Treat it like a fixture, but do not pass it into a test function as
    one.  Instead::

        @given(df=df_strategy())
        def test_function(df):
            # test goes here
    """
    column_specs = [
        column("a", elements=st.integers()),
        column("Bell__Chart", elements=st.floats()),
        column("decorated-elephant", elements=st.integers()),
        column("animals@#$%^", elements=st.text()),
        column("cities", st.text()),
    ]
    row_index = range_indexes(min_size=1, max_size=20)
    return data_frames(columns=column_specs, index=row_index)
def test_weekday(data):
    """``ei.day_of_week`` yields ``'Weekday'`` for three weekday cases."""
    # Weekday, day=3,4,5
    midweek_frames = data_frames(columns=[
        column(name='HOUR',
               elements=st.one_of(
                   st.integers(min_value=0, max_value=24), st.just(99)),
               unique=False),
        column(name='DAY_WEEK',
               elements=st.integers(min_value=3, max_value=5),
               unique=False),
    ])
    data1 = data.draw(midweek_frames)

    # Weekday, day=6, hr=0-17, 24
    day_six_frames = data_frames(columns=[
        column(name='HOUR',
               elements=st.one_of(
                   st.integers(min_value=0, max_value=17), st.just(24)),
               unique=False),
        column(name='DAY_WEEK', elements=st.just(6), unique=False),
    ])
    data2 = data.draw(day_six_frames)

    # Weekday, day=2, hr=6-23
    day_two_frames = data_frames(columns=[
        column(name='HOUR',
               elements=st.integers(min_value=6, max_value=23),
               unique=False),
        column(name='DAY_WEEK', elements=st.just(2), unique=False),
    ])
    data3 = data.draw(day_two_frames)

    for frame in (data1, data2, data3):
        for label in ei.day_of_week(frame):
            assert label == 'Weekday'
def test_expected_failure_from_omitted_object_dtype(dtype):
    """Set-valued elements work with ``dtype=object`` and fail helpfully
    when the dtype is omitted."""
    # See https://github.com/HypothesisWorks/hypothesis/issues/3133
    set_elements = st.sets(st.text(), min_size=1)
    col = pdst.column(elements=set_elements, dtype=dtype)

    @given(pdst.data_frames(columns=[col]))
    def works_with_object_dtype(df):
        pass

    if dtype is object:
        works_with_object_dtype()
        return

    # The only other supported parametrisation is an omitted dtype, which
    # must fail with a hint to pass dtype=object.
    assert dtype is None
    with pytest.raises(
        ValueError, match="Maybe passing dtype=object would help"
    ):
        works_with_object_dtype()
class TestExogenous:
    """Tests for the ``Exogenous`` feature's ``fit_transform``."""

    @given(giotto_time_series(min_length=2))
    def test_exogenous_single_column(self, time_series: pd.DataFrame):
        """A single-column series comes back with unchanged values."""
        transformed_time_series = Exogenous().fit_transform(time_series)
        transformed_time_series.columns = ["time_series"]
        assert_frame_equal(
            transformed_time_series, time_series, check_names=False
        )

    @given(data_frames([column("A", dtype=int), column("B", dtype=float)]))
    def test_multiple_columns(self, time_series: pd.DataFrame):
        """A two-column frame comes back with unchanged values."""
        transformed_time_series = Exogenous().fit_transform(time_series)
        transformed_time_series.columns = ["A", "B"]
        assert_frame_equal(
            transformed_time_series, time_series, check_names=False
        )

    @given(giotto_time_series(min_length=2))
    def test_naming(self, time_series: pd.DataFrame):
        """Output columns are the input names suffixed ``__Exogenous``."""
        transformed_time_series = Exogenous().fit_transform(time_series)
        expected_columns = []
        for column_name in time_series.columns:
            expected_columns.append(f"{column_name}__Exogenous")
        actual_columns = list(transformed_time_series.columns)
        assert expected_columns == actual_columns
def dataframe(draw):
    """Draw a data frame with 1 to 20 columns of float, int or str dtype.

    Column names are unique and are either text or integers.
    """
    n_cols = draw(integers(min_value=1, max_value=20))
    # Both lists are drawn with exactly ``n_cols`` entries.
    exact_size = {"min_size": n_cols, "max_size": n_cols}
    dtypes = draw(lists(sampled_from([float, int, str]), **exact_size))
    colnames = draw(lists(text() | integers(), unique=True, **exact_size))
    column_specs = [
        column(name=colname, dtype=dtype)
        for colname, dtype in zip(colnames, dtypes)
    ]
    return draw(data_frames(columns=column_specs))
def column_strategy(draw):
    """Draw a ``pdst.column`` with a randomised mix of optional arguments.

    The dtype is always handed to ``pdst.column``; what varies is whether
    an ``elements`` strategy is supplied, plus ``unique`` and ``fill``.
    """
    name = draw(st.none() | st.text())
    dtype = draw(npst.scalar_dtypes().filter(supported_by_pandas))
    pass_dtype = not draw(st.booleans())
    # Short-circuit on purpose: the second boolean is drawn only when
    # ``pass_dtype`` is true, otherwise elements are always supplied.
    pass_elements = (not pass_dtype) or (not draw(st.booleans()))
    elements = npst.from_dtype(dtype) if pass_elements else None
    unique = draw(st.booleans())
    disable_fill = draw(st.booleans())
    fill = st.nothing() if disable_fill else None
    return pdst.column(
        name=name,
        dtype=dtype,
        unique=unique,
        fill=fill,
        elements=elements,
    )
def test_can_minimize_based_on_two_columns_independently(
        disable_fill, non_standard_index):
    """The minimal frame with a true value in A, B and C is a single row."""
    def fill_for(name):
        # Filling is switched off only for the columns named in disable_fill.
        return st.nothing() if name in disable_fill else None

    columns = [
        pdst.column(name, dtype=bool, fill=fill_for(name))
        for name in ["A", "B", "C"]
    ]
    index = pdst.indexes(dtype=int) if non_standard_index else None

    x = minimal(
        pdst.data_frames(columns, index=index),
        lambda x: x["A"].any() and x["B"].any() and x["C"].any(),
        random=Random(0),
    )

    assert len(x["A"]) == 1
    for label in ("A", "B", "C"):
        assert x[label][0] == 1
def dataframe(draw):
    """Draw a data frame with 1 to 20 columns of assorted numpy dtypes.

    Each column gets a floating, integer or unicode-string dtype; column
    names are unique and are either text or integers.
    """
    n_cols = draw(integers(min_value=1, max_value=20))
    # Both lists are drawn with exactly ``n_cols`` entries.
    exact_size = {"min_size": n_cols, "max_size": n_cols}
    dtype_strategy = one_of(
        np_strategies.floating_dtypes(),
        np_strategies.integer_dtypes(),
        np_strategies.unicode_string_dtypes(),
    )
    dtypes = draw(lists(dtype_strategy, **exact_size))
    colnames = draw(lists(text() | integers(), unique=True, **exact_size))
    column_specs = [
        column(name=colname, dtype=dtype)
        for colname, dtype in zip(colnames, dtypes)
    ]
    return draw(data_frames(columns=column_specs))
def dataframe_and_clusters(draw, length=None):
    """Draw a data frame together with one cluster label per row.

    The frame has 1 to 20 columns of float, int or str dtype with unique
    text-or-integer names; labels are integers in [0, 3].  ``length`` is
    accepted but not used by this function.

    :returns: ``(df, cluster_labels)``.
    """
    n_cols = draw(integers(min_value=1, max_value=20))
    exact_cols = {"min_size": n_cols, "max_size": n_cols}
    dtypes = draw(lists(sampled_from([float, int, str]), **exact_cols))
    colnames = draw(lists(text() | integers(), unique=True, **exact_cols))
    column_specs = [
        column(name=colname, dtype=dtype)
        for colname, dtype in zip(colnames, dtypes)
    ]
    df = draw(data_frames(columns=column_specs))

    # Exactly one label for every row of the drawn frame.
    n_rows = len(df)
    label_strategy = lists(
        integers(min_value=0, max_value=3),
        min_size=n_rows,
        max_size=n_rows,
    )
    cluster_labels = draw(label_strategy)
    return df, cluster_labels
def test_can_minimize_based_on_two_columns_independently(
    disable_fill, non_standard_index
):
    """The minimal frame with a true value in A, B and C is a single row."""
    def fill_for(name):
        # Filling is switched off only for the columns named in disable_fill.
        return st.nothing() if name in disable_fill else None

    columns = [
        pdst.column(name, dtype=bool, fill=fill_for(name))
        for name in ['A', 'B', 'C']
    ]
    index = pdst.indexes(dtype=int) if non_standard_index else None

    x = minimal(
        pdst.data_frames(columns, index=index),
        lambda x: x['A'].any() and x['B'].any() and x['C'].any(),
        random=Random(0),
    )

    assert len(x['A']) == 1
    for label in ('A', 'B', 'C'):
        assert x[label][0] == 1
# # END HEADER from __future__ import absolute_import, division, print_function import numpy as np import hypothesis.extra.numpy as npst import hypothesis.extra.pandas as pdst import hypothesis.strategies as st from hypothesis import HealthCheck, given, reject, settings from tests.common.debug import find_any from tests.pandas.helpers import supported_by_pandas @given(pdst.data_frames([pdst.column("a", dtype=int), pdst.column("b", dtype=float)])) def test_can_have_columns_of_distinct_types(df): assert df["a"].dtype == np.dtype(int) assert df["b"].dtype == np.dtype(float) @given( pdst.data_frames( [pdst.column(dtype=int)], index=pdst.range_indexes(min_size=1, max_size=5) ) ) def test_respects_size_bounds(df): assert 1 <= len(df) <= 5 @given(pdst.data_frames(pdst.columns(["A", "B"], dtype=float)))
import pytest from siuba import _, mutate, group_by, summarize, filter import siuba.sql.dply from siuba.dply import vector as v from datetime import timedelta from hypothesis import given, settings, example from hypothesis.strategies import text, floats, integers from hypothesis.extra.pandas import data_frames, column, indexes from .helpers import assert_equal_query, data_frame, backend_sql from pandas.testing import assert_frame_equal DATA_SPEC = data_frames([ column('x', elements=floats(width=32) | integers(), unique=True), column('g', dtype=str, elements=text(max_size=1)) ], index=indexes(elements=floats() | integers(), max_size=10)) OMNIBUS_VECTOR_FUNCS = [ #cumall, cumany, cummean, #desc, v.dense_rank(_.x, na_option="keep"), #v.percent_rank(_.x), v.min_rank(_.x, na_option="keep"), v.cume_dist(_.x, na_option="keep"), v.row_number(_.x), #ntile, v.between(_.x, 2, 5, default=False),
def generic_column_pos(name):
    """Return a float64 column called ``name`` of finite values >= 0."""
    non_negative = floats(
        min_value=0,
        allow_infinity=False,
        allow_nan=False,
    )
    return column(name, elements=non_negative, dtype=np.float64)
from __future__ import division, print_function, absolute_import import numpy as np import pytest import hypothesis.strategies as st import hypothesis.extra.numpy as npst import hypothesis.extra.pandas as pdst from hypothesis import HealthCheck, given, reject, settings from hypothesis.types import RandomWithSeed as Random from tests.common.debug import minimal, find_any from tests.pandas.helpers import supported_by_pandas @given(pdst.data_frames([ pdst.column('a', dtype=int), pdst.column('b', dtype=float), ])) def test_can_have_columns_of_distinct_types(df): assert df['a'].dtype == np.dtype(int) assert df['b'].dtype == np.dtype(float) @given(pdst.data_frames( [pdst.column(dtype=int)], index=pdst.range_indexes(min_size=1, max_size=5))) def test_respects_size_bounds(df): assert 1 <= len(df) <= 5 @given(pdst.data_frames(pdst.columns(['A', 'B'], dtype=float)))
class TestGPUPredict:
    """Tests that compare predictions made with ``gpu_predictor`` against
    predictions made with ``cpu_predictor`` (or against DMatrix-based
    prediction for the in-place tests)."""

    def test_predict(self):
        """Train on random data of several shapes and check that CPU and
        GPU margin predictions agree on train, validation and test sets."""
        iterations = 10
        # Seed the global NumPy RNG; the randn calls below consume it in order.
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact. but
        # for `hist` and `approx` the floating point error accumulates faster
        # and fails even tol is set to 1e-4.  For `hist`, the mismatching rate
        # with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param, dtrain, iterations, evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                # Retrain with the CPU predictor using the same parameters.
                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        """Return True when no element of ``L`` exceeds its predecessor by
        0.001 or more."""
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        """Predict twice with the GPU booster and once with the CPU booster
        and check that all three results agree."""
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        # NOTE(review): `rng` is not defined inside this method; presumably a
        # module-level name -- confirm it exists.
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        """Fit ``XGBRegressor`` with each predictor and check that train
        and test scores match."""
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        # `m` is rebound here from the row count to the fitted model.
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'

        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        """Check in-place prediction on CuPy arrays against DMatrix-based
        prediction, then repeat the comparison via
        ``run_threaded_predict``."""
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain, num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            # True when in-place and DMatrix predictions are identical.
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        """Check in-place prediction on a cuDF frame against DMatrix-based
        prediction, then repeat the comparison via
        ``run_threaded_predict``."""
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain, num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # True when in-place and DMatrix predictions are identical.
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

    @given(strategies.integers(1, 10),
           tm.dataset_strategy, shap_parameter_strategy)
    @settings(deadline=None)
    def test_shap(self, num_rounds, dataset, param):
        """Per-row contributions must sum to the margin prediction."""
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        # NOTE(review): empty datasets are rejected only after training and
        # prediction have already run.
        assume(len(dataset.y) > 0)
        # Sum over the last axis and compare with rtol=atol=1e-3.
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-3, 1e-3)

    @given(strategies.integers(1, 10),
           tm.dataset_strategy, shap_parameter_strategy)
    @settings(deadline=None, max_examples=20)
    def test_shap_interactions(self, num_rounds, dataset, param):
        """Per-row interaction values must sum to the margin prediction."""
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        # NOTE(review): empty datasets are rejected only after training and
        # prediction have already run.
        assume(len(dataset.y) > 0)
        # Sum over the last two axes and compare with rtol=atol=1e-3.
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-3, 1e-3)

    def test_predict_leaf_basic(self):
        """``run_predict_leaf`` must return equal results for both
        predictors."""
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        """Train one booster, then compare its leaf predictions under the
        CPU and GPU predictors.

        :param param: training parameters, passed through
            ``dataset.set_params``.
        :param num_rounds: number of boosting rounds.
        :param dataset: object providing ``set_params`` and ``get_dmat``.
        """
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param, dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)
        # Same trained booster, switching only the predictor between calls.
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_gbtree(self, param, dataset):
        """Leaf-prediction comparison with the ``gbtree`` booster."""
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_dart(self, param, dataset):
        """Leaf-prediction comparison with the ``dart`` booster."""
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ], index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None)
    def test_predict_categorical_split(self, df):
        """RMSE computed from predictions on categorical data must match the
        last RMSE recorded during training."""
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        # The target is a fixed linear function of the two columns.
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor',
            'max_depth': 3, 'learning_rate': 1.0, 'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params, dtrain, num_boost_round=5,
                        evals=[(dtrain, 'train')], verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        # squared=False makes mean_squared_error return the root of the MSE.
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(
            rmse, eval_history['train']['rmse'][-1], decimal=5)
# Basic building-block strategies.
strands = st.sampled_from("+ -".split())
single_strand = st.sampled_from(["+"])
# NOTE(review): the alphabet below has no "w" -- confirm this is intended.
names = st.text("abcdefghijklmnopqrstuvxyz", min_size=1)
scores = st.integers(min_value=0, max_value=256)
datatype = st.sampled_from([pd.Series, np.array, list])

# chr1..chr22 plus chrX, chrY and chrM.
chromosomes = st.sampled_from(
    ["chr{}".format(e) for e in [*range(1, 23), *"X Y M".split()]]
)
chromosomes_small = st.sampled_from(["chr1"])
cs = st.one_of(chromosomes, chromosomes_small)

runlengths = data_frames(
    index=indexes(dtype=np.int64, min_size=1, unique=True),
    columns=[
        column("Runs", st.integers(min_value=1, max_value=int(1e7))),
        # The values need a min/max because R S4vectors translates too-big
        # ones into inf, which is unequal to e.g. -1.79769e+308, so the
        # tests fail.
        column(
            "Values",
            st.integers(min_value=-int(1e7), max_value=int(1e7)),
        ),
    ],
)

better_dfs_no_min = data_frames(
    index=indexes(dtype=np.int64, min_size=0, unique=True, elements=lengths),
    columns=[
        column("Chromosome", cs),
        column("Start", elements=lengths),
        column("End", elements=small_lengths),
        # column("Name", elements=names),
        # column("Score", elements=scores),
        column("Strand", strands),
    ],
)
def alf_column(plane):
    """Return the float64 column ``"ALF" + plane`` of finite floats."""
    finite_values = floats(allow_infinity=False, allow_nan=False)
    column_name = "ALF" + plane
    return column(column_name, elements=finite_values, dtype=np.float64)
def bet_column(plane):
    """Return the float64 column ``"BET" + plane`` of finite floats that
    are at least 1e-7."""
    positive_values = floats(
        min_value=1e-7,
        allow_infinity=False,
        allow_nan=False,
    )
    column_name = "BET" + plane
    return column(column_name, elements=positive_values, dtype=np.float64)
text(), primitive_strategy) | lists(primitive_strategy) nested_strategy = recursive( container_strategy, lambda children: lists(children) | dictionaries(text(), children), ) numpy_strategy = arrays(guaranteed_dtypes, array_shapes()) pandas_series = series(dtype=int) | series(dtype=float) | series(dtype=str) pandas_dfs = (data_frames(columns(3, dtype=int)) | data_frames(columns(3, dtype=float)) | data_frames(columns(3, dtype=str)) | data_frames( [column(dtype=str), column(dtype=float), column(dtype=int)])) possible_input_data = one_of( lists(primitive_strategy), numpy_strategy, pandas_series, # pandas_dfs ) TEST_DF = pd.DataFrame(np.meshgrid(np.arange(20), np.arange(20))[0]) TEST_SERIES = pd.Series(np.arange(20)) TEST_ARRAY = np.arange(20)