def test_weekday(data):
    """Check that ``ei.day_of_week`` yields 'Weekday' for weekday inputs.

    Three frames with ``HOUR`` and ``DAY_WEEK`` columns are drawn, one per
    value range described in the inline comments, and every value produced
    by ``ei.day_of_week`` for each frame must equal ``'Weekday'``.
    """

    def draw_frame(hours, days):
        # Draw one frame whose two columns use the given element strategies.
        return data.draw(
            data_frames(columns=[
                column(name='HOUR', elements=hours, unique=False),
                column(name='DAY_WEEK', elements=days, unique=False),
            ]))

    frames = [
        # Weekday, day=3,4,5
        draw_frame(
            st.one_of(st.integers(min_value=0, max_value=24), st.just(99)),
            st.integers(min_value=3, max_value=5),
        ),
        # Weekday, day=6, hr=0-17, 24
        draw_frame(
            st.one_of(st.integers(min_value=0, max_value=17), st.just(24)),
            st.just(6),
        ),
        # Weekday, day=2, hr=6-23
        draw_frame(
            st.integers(min_value=6, max_value=23),
            st.just(2),
        ),
    ]

    for frame in frames:
        for label in ei.day_of_week(frame):
            assert label == 'Weekday'
def test_arbitrary_data_frames(data):
    """Draw a data frame from arbitrary column specs and check its shape.

    Verifies that the frame has one column per spec, that each column is
    named after its spec (or its position when the spec has no name), and
    that columns flagged ``unique`` contain no repeated values.
    """

    def uniqueness_key(spec):
        # Unnamed specs are keyed by a fresh NaN; presumably so that they
        # never count as duplicates of one another.
        return spec.name if spec.name is not None else float("nan")

    columns = data.draw(st.lists(column_strategy(), unique_by=uniqueness_key))

    try:
        df = data.draw(pdst.data_frames(columns))
    except Exception as err:
        if type(err).__name__ != "OutOfBoundsDatetime":
            raise
        # See https://github.com/HypothesisWorks/hypothesis-python/pull/826
        reject()

    actual_names = list(df)
    assert len(actual_names) == len(columns)

    for position, (spec, actual) in enumerate(zip(columns, df)):
        if spec.name is not None:
            assert spec.name == actual
        else:
            assert actual == position

    for spec, actual in zip(columns, actual_names):
        values = df[actual]
        if spec.unique:
            assert len(set(values)) == len(values)
def test_arbitrary_data_frames(data):
    """Draw a data frame from arbitrary column specs and check its shape.

    Verifies that the frame has one column per spec, that each column is
    named after its spec (or its position when the spec has no name), and
    that columns flagged ``unique`` contain no repeated values.
    """

    def uniqueness_key(spec):
        # Unnamed specs are keyed by a fresh NaN; presumably so that they
        # never count as duplicates of one another.
        return spec.name if spec.name is not None else float("nan")

    columns = data.draw(st.lists(column_strategy(), unique_by=uniqueness_key))

    try:
        # Use raw data to work around pandas bug in repr. See
        # https://github.com/pandas-dev/pandas/issues/27484
        df = data.conjecture_data.draw(pdst.data_frames(columns))
    except Exception as err:
        if type(err).__name__ != "OutOfBoundsDatetime":
            raise
        # See https://github.com/HypothesisWorks/hypothesis-python/pull/826
        reject()

    actual_names = list(df)
    assert len(actual_names) == len(columns)

    for position, (spec, actual) in enumerate(zip(columns, df)):
        if spec.name is not None:
            assert spec.name == actual
        else:
            assert actual == position

    for spec, actual in zip(columns, actual_names):
        values = df[actual]
        if spec.unique:
            assert len(set(values)) == len(values)
def test_arbitrary_data_frames(data):
    """Draw a data frame from arbitrary column specs and check its shape.

    Verifies that the frame has one column per spec, that each column is
    named after its spec (or its position when the spec has no name), and
    that columns flagged ``unique`` contain no repeated values.
    """

    def uniqueness_key(spec):
        # Unnamed specs are keyed by a fresh NaN; presumably so that they
        # never count as duplicates of one another.
        return spec.name if spec.name is not None else float('nan')

    columns = data.draw(st.lists(column_strategy(), unique_by=uniqueness_key))

    try:
        df = data.draw(pdst.data_frames(columns))
    except Exception as err:
        if type(err).__name__ != 'OutOfBoundsDatetime':
            raise
        # See https://github.com/HypothesisWorks/hypothesis-python/pull/826
        reject()

    actual_names = list(df)
    assert len(actual_names) == len(columns)

    for position, (spec, actual) in enumerate(zip(columns, df)):
        if spec.name is not None:
            assert spec.name == actual
        else:
            assert actual == position

    for spec, actual in zip(columns, actual_names):
        values = df[actual]
        if spec.unique:
            assert len(set(values)) == len(values)
def null_dataframe_masks(
    draw,
    strategy: Optional[SearchStrategy],
    nullable_columns: Dict[str, bool],
):
    """Strategy for masking values in a pandas DataFrame.

    :param draw: callable used to draw a value from a strategy.
    :param strategy: hypothesis strategy from which the dataframe to be
        masked is drawn. NOTE(review): annotated as optional, but it is
        drawn from unconditionally - confirm ``None`` is never passed.
    :param nullable_columns: dictionary where keys are column names and
        values indicate whether that column is nullable.
    :returns: the drawn dataframe after ``mask`` has been applied with a
        boolean mask frame that contains at least one ``True``.
    """
    val = draw(strategy)
    n_rows = val.shape[0]

    # Only nullable columns may draw True; every other column is all-False.
    mask_columns = [
        pdst.column(
            name=name,
            elements=st.booleans() if nullable else st.just(False),
            dtype=bool,
            fill=st.just(False),
        )
        for name, nullable in nullable_columns.items()
    ]
    null_mask = draw(
        pdst.data_frames(
            columns=mask_columns,
            index=pdst.range_indexes(min_size=n_rows, max_size=n_rows),
        )
    )

    # assume that there is at least one masked value
    hypothesis.assume(null_mask.any(axis=None))
    return val.mask(null_mask)
def hypot_df_generator():
    """Return a data frame strategy built from the values found in ``raw_()``.

    Each listed column samples its elements from the unique values of the
    same-named column in the frame returned by ``raw_()``.
    """
    source = raw_()
    question_names = ('Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q8', 'Q9')
    return data_frames(columns=[
        column(name, elements=strategies.sampled_from(source[name].unique()))
        for name in question_names
    ])
def categoricaldf_strategy():
    """Return a strategy for 1-20 row frames with "names" and "numbers" columns.

    "names" samples from the module-level ``names``; "numbers" samples from
    ``range(3)``.
    """
    name_column = column("names", st.sampled_from(names))
    number_column = column("numbers", st.sampled_from(range(3)))
    row_index = range_indexes(min_size=1, max_size=20)
    return data_frames(columns=[name_column, number_column], index=row_index)
def test_uniqueness_does_not_affect_other_rows_2():
    """A unique column 'B' must not stop column 'A' from repeating a value.

    Searches two-row frames for an example whose two 'A' values are equal.
    """
    column_specs = [
        pdst.column('A', dtype=int, unique=False),
        pdst.column('B', dtype=int, unique=True),
    ]
    row_values = st.tuples(st.integers(0, 10), st.integers(0, 10))
    two_row_frames = pdst.data_frames(
        column_specs,
        rows=row_values,
        index=pdst.range_indexes(2, 2),
    )

    def a_values_match(frame):
        return frame['A'][0] == frame['A'][1]

    find_any(two_row_frames, a_values_match)
def nulldf_strategy():
    """Return a strategy for 3-20 row frames with columns "1", "2" and "3".

    Column "1" draws arbitrary floats (NaN and infinities allowed); columns
    "2" and "3" only ever contain NaN.
    """
    float_column = column("1", st.floats(allow_nan=True, allow_infinity=True))
    nan_columns = [
        column("2", st.sampled_from([np.nan])),
        column("3", st.sampled_from([np.nan])),
    ]
    row_index = range_indexes(min_size=3, max_size=20)
    return data_frames(columns=[float_column] + nan_columns, index=row_index)
def test_dayofweek_unknown(data):
    """Check that ``ei.day_of_week`` yields 'Unknown' for the drawn inputs.

    Two frames are drawn: one with ``DAY_WEEK`` fixed to 9 and ``HOUR`` in
    0-24, and one with ``HOUR`` fixed to 99 and ``DAY_WEEK`` in {2, 6}.
    Every value produced for either frame must equal ``'Unknown'``.
    """

    def draw_frame(hours, days):
        # Draw one frame whose two columns use the given element strategies.
        return data.draw(
            data_frames(columns=[
                column(name='HOUR', elements=hours, unique=False),
                column(name='DAY_WEEK', elements=days, unique=False),
            ]))

    frames = [
        draw_frame(st.integers(min_value=0, max_value=24), st.just(9)),
        draw_frame(st.just(99), st.one_of(st.just(2), st.just(6))),
    ]

    for frame in frames:
        for label in ei.day_of_week(frame):
            assert label == 'Unknown'
def cmatrix_dataframes():
    """Return a data frame strategy with s, alf X/Y, bet X/Y and "R" columns.

    The index is a range index holding between 2 and ``MAX_NRES`` rows.
    """
    column_specs = [s_column()]
    column_specs += [alf_column(plane) for plane in ("X", "Y")]
    column_specs += [bet_column(plane) for plane in ("X", "Y")]
    column_specs.append(generic_column("R"))
    return data_frames(
        columns=column_specs,
        index=range_indexes(min_size=2, max_size=MAX_NRES),
    )
def test_uniqueness_does_not_affect_other_rows_2():
    """A unique column "B" must not stop column "A" from repeating a value.

    Searches two-row frames for an example whose two "A" values are equal.
    """
    column_specs = [
        pdst.column("A", dtype=bool, unique=False),
        pdst.column("B", dtype=int, unique=True),
    ]
    row_values = st.tuples(st.booleans(), st.integers(0, 10))
    two_row_frames = pdst.data_frames(
        column_specs,
        rows=row_values,
        index=pdst.range_indexes(2, 2),
    )

    def a_values_match(frame):
        return frame["A"][0] == frame["A"][1]

    find_any(two_row_frames, a_values_match)
def datasets(draw: Callable[[st.SearchStrategy], Any]) -> pd.DataFrame:
    """
    Generates datasets of MOER values.

    Draws a frame with a "timestamp" column of datetimes and a "MOER"
    column of floats, then replaces every timestamp with the result of
    ``padded_strftime``.

    :param draw: callable used to draw a value from a strategy.
    :returns: the drawn frame with its "timestamp" column converted.
    """
    frame = draw(
        hpd.data_frames(
            (hpd.column("timestamp", st.datetimes()),
             hpd.column("MOER", st.floats()))))
    # Replace the column by label instead of ``frame.iloc[:, 0] = ...``.
    # Since pandas 2.0 the ``iloc`` form first tries to write into the
    # existing datetime64 column in place, which can parse the formatted
    # strings straight back into timestamps and discard the conversion.
    frame["timestamp"] = frame["timestamp"].apply(padded_strftime)
    print(frame)
    return frame
class Test(unittest.TestCase):
    """Property-based tests that exercise ``Math`` and ``transforms``."""

    # Test Fibonacci sequence on the integers 0 through 10
    @given(ST.integers(0, 10))
    def test_00_fib_first_10(self, n):
        Math.fib(n)

    # Test Fibonacci sequence on negative numbers,
    # breaking one of the assumptions of the function
    # (no negative values)
    @given(ST.integers(max_value=-1))
    def test_01_fib_negative_values(self, n):
        Math.fib(n)

    # Test BetaCoefficient functionality
    @given(n=ST.integers(), Y=ST.lists(ST.floats()), X1=ST.lists(ST.floats()))
    def test_03_BetaCoefficient(self, n, Y, X1):
        # ``n`` is drawn but not used by the body.
        print(Math.GetBetaCoefficient(Y, X1))

    # Test unbound plane with 1000 different examples
    @settings(max_examples=1000)
    @given(coord1=ST.tuples(ST.floats(), ST.floats()),
           coord2=ST.tuples(ST.floats(), ST.floats()))
    def test_04_EuclideanDistance_unbound(self, coord1, coord2):
        Math.EuclideanDistance(coord1, coord2)

    # Test simple pandas transpose
    @given(data_frames([
        column('a', dtype=int),
        column('b', dtype=int),
    ]))
    def test_05_transpose(self, df):
        transforms.transpose(df)

    # Test the creation of a geographic distance matrix
    # Building on euclidean distance, let's test a higher order function.
    @given(
        data_frames([column('lat', dtype=float),
                     column('lon', dtype=float)]))
    def test_06_DistanceMatrixGeneration(self, df):
        # Give every row a sequential store id before building the matrix.
        df['store_id'] = list(range(len(df)))
        transforms.CreateDistanceMatrix(df)
def data_generator(draw):
    """Draw a ten-column float frame plus fitting options.

    :param draw: callable used to draw a value from a strategy.
    :returns: tuple ``(df, reg_l1, reg_l2, optimizer name, intercept flag)``.
    """
    feature_names = [str(i) for i in range(10)]
    bounded_floats = st.floats(allow_infinity=False, max_value=1e+30)
    frame_columns = columns(
        names_or_number=feature_names,
        dtype=float,
        elements=bounded_floats,
    )

    df = draw(data_frames(columns=frame_columns))
    reg_l1 = draw(st.floats())
    reg_l2 = draw(st.floats())
    solver = draw(st.sampled_from(['L-BFGS-B', 'BFGS']))
    fit_intercept = draw(st.booleans())
    return df, reg_l1, reg_l2, solver, fit_intercept
class TestMain(unittest.TestCase):
    """Tests for ``sum`` with ``pandas.read_csv`` replaced by a mock."""

    @given(df=data_frames(
        [column('1', dtype='float'),
         column('2', dtype='float')]))
    @patch("pandas.read_csv")
    def test_sum_basic(self, read_csv_mock: Mock, df):
        """``sum`` reads the CSV once and returns a frame with a float '3' column."""
        # The generated frame stands in for whatever read_csv would return.
        read_csv_mock.return_value = df

        results = sum()

        read_csv_mock.assert_called_once()
        column_count = len(results.columns)
        self.assertEqual(column_count, 3)
        self.assertEqual(results['3'].dtype, 'float')
def full_dataframes():
    """Return a data frame strategy with the full set of columns.

    Columns, in order: s, bet X/Y, mu X/Y, d X/Y, then the generic columns
    K0L, K0SL, K1L, K1SL, K2L, K2SL, K3L and K3SL. The index is a range
    index holding between 2 and ``MAX_NRES`` rows.
    """
    column_specs = [s_column()]
    for make_column in (bet_column, mu_column, d_column):
        column_specs.append(make_column("X"))
        column_specs.append(make_column("Y"))

    generic_names = (
        "K0L", "K0SL",
        "K1L", "K1SL",
        "K2L", "K2SL",
        "K3L", "K3SL",
    )
    column_specs.extend(generic_column(name) for name in generic_names)

    return data_frames(
        columns=column_specs,
        index=range_indexes(min_size=2, max_size=MAX_NRES),
    )
def test_expected_failure_from_omitted_object_dtype(dtype):
    """Columns of sets work with ``dtype=object`` and fail helpfully without it."""
    # See https://github.com/HypothesisWorks/hypothesis/issues/3133
    set_column = pdst.column(elements=st.sets(st.text(), min_size=1), dtype=dtype)

    @given(pdst.data_frames(columns=[set_column]))
    def works_with_object_dtype(df):
        pass

    if dtype is object:
        works_with_object_dtype()
        return

    # The only other expected parameter value is an omitted dtype.
    assert dtype is None
    with pytest.raises(ValueError, match="Maybe passing dtype=object would help"):
        works_with_object_dtype()
def test_pandas_vertex_creation_noproperty(self):
    """Create vertices from an example frame without any vertex properties."""
    src_column = column(name='src', elements=st.sampled_from(names), unique=True)
    age_column = column(
        name='age',
        elements=st.integers(min_value=20, max_value=30),
        unique=False,
    )
    dataframe = data_frames(columns=[src_column, age_column]).example()

    # Only checks that the call completes; the returned value is not inspected.
    TinkerFactory().addV_from_pandas(dataframe, src='src', v_properties=[])
def multiindex_strategy(
    pandera_dtype: Optional[DataType] = None,
    strategy: Optional[SearchStrategy] = None,
    *,
    indexes: Optional[List] = None,
    size: Optional[int] = None,
):
    """Strategy to generate a pandas MultiIndex object.

    :param pandera_dtype: :class:`pandera.dtypes.DataType` instance. Not
        read by this function.
    :param strategy: must be left unset; this is a base strategy and cannot
        be chained onto a parent strategy.
    :param indexes: a list of :class:`~pandera.schema_components.Index`
        objects.
    :param size: number of elements in the MultiIndex.
    :returns: ``hypothesis`` strategy.
    :raises BaseStrategyOnlyError: if ``strategy`` is specified.
    """
    # pylint: disable=unnecessary-lambda
    if strategy:
        raise BaseStrategyOnlyError(
            "The dataframe strategy is a base strategy. You cannot specify "
            "the strategy argument to chain it to a parent strategy."
        )
    indexes = [] if indexes is None else indexes
    # Unnamed indexes are keyed by their position in ``indexes``.
    index_dtypes = {
        index.name if index.name is not None else i: str(index.dtype)
        for i, index in enumerate(indexes)
    }
    nullable_index = {
        index.name if index.name is not None else i: index.nullable
        for i, index in enumerate(indexes)
    }
    strategy = pdst.data_frames(
        [index.strategy_component() for index in indexes],
        index=pdst.range_indexes(
            min_size=0 if size is None else size, max_size=size
        ),
    ).map(lambda x: x.astype(index_dtypes))

    def _stringify_column(column_name):
        """Build a mapper that converts one column's values to native str."""

        def _apply(df):
            # Work on a copy and assign by key so that integer (positional)
            # column names are handled as well as string names.
            converted = df.copy()
            converted[column_name] = converted[column_name].map(str)
            return converted

        return _apply

    # this is a hack to convert np.str_ data values into native python str.
    # The mappers run lazily, after this loop has finished, so each one is
    # built by a factory that binds its own column name; a lambda closing
    # over the loop variable would always see the last name iterated.
    for name, dtype in index_dtypes.items():
        if dtype in {"object", "str"} or dtype.startswith("string"):
            strategy = strategy.map(_stringify_column(name))

    if any(nullable_index.values()):
        strategy = null_dataframe_masks(strategy, nullable_index)
    return strategy.map(pd.MultiIndex.from_frame)
def dataframe(draw):
    """Draw a data frame with 1-20 uniquely named columns.

    Each column's dtype is sampled from ``float``, ``int`` and ``str``; each
    name is a text or integer value.

    :param draw: callable used to draw a value from a strategy.
    :returns: the drawn data frame.
    """
    n_cols = draw(integers(min_value=1, max_value=20))
    exact_size = dict(min_size=n_cols, max_size=n_cols)

    column_types = draw(lists(sampled_from([float, int, str]), **exact_size))
    column_names = draw(lists(text() | integers(), unique=True, **exact_size))

    specs = [
        column(name=col_name, dtype=col_type)
        for col_type, col_name in zip(column_types, column_names)
    ]
    return draw(data_frames(columns=specs))
def gen_columns_and_subset(draw, elements=names):
    """Draw a data frame together with a subset of its column names.

    :param draw: callable used to draw a value from a strategy.
    :param elements: strategy used both for the column names and for the
        cell values.
    :returns: tuple ``(data frame, set of column names to keep)``. Positions
        are drawn with replacement, so the set may hold fewer names than
        the number of draws.
    """
    column_names = draw(lists(elements, min_size=1, unique=True))
    num_columns_to_keep = draw(
        integers(min_value=1, max_value=len(column_names)))

    last_position = len(column_names) - 1
    columns_to_keep = set()
    for _ in range(num_columns_to_keep):
        position = draw(integers(min_value=0, max_value=last_position))
        columns_to_keep.add(column_names[position])

    # With column data and 'keeper' columns selected, utilize draw to return
    # a hypothesis DataFrame column strategies defined.
    frame = draw(
        hpd.data_frames(
            hpd.columns(column_names, elements=elements),
            index=hpd.range_indexes(min_size=5)))
    return frame, columns_to_keep
def test_pandas_vertex_creation_noproperty(self):
    """Create vertices from an example frame using 'age' as a vertex property."""
    names = ['andre','renan','diego','caio','victor','bruno']
    languages = ['python','R','java']

    frame_columns = [
        column(name='src', elements=st.sampled_from(names), unique=True),
        column(
            name='age',
            elements=st.integers(min_value=20, max_value=30),
            unique=False,
        ),
        column(name='lang', elements=st.sampled_from(languages), unique=False),
    ]
    dataframe = data_frames(columns=frame_columns).example()

    # Only checks that the call completes; the returned value is not inspected.
    TinkerFactory().addV_from_pandas(dataframe, src='src', v_properties=['age'])
def df_strategy(allow_nan=True, allow_infinity=True):
    """
    Return a strategy for frames with columns "item", "att1" and "att2".

    "item" and "att2" are float columns whose NaN and infinity handling
    follows ``allow_nan`` and ``allow_infinity``; "att1" is an object column
    of printable text at most five characters long.
    """

    def float_values():
        return st.floats(allow_nan=allow_nan, allow_infinity=allow_infinity)

    schema = [
        column(name="item", dtype=float),
        column(name="att1", dtype="object"),
        column(name="att2", dtype=float),
    ]
    row_values = st.tuples(
        float_values(),
        st.text(printable, max_size=5),
        float_values(),
    )
    return data_frames(columns=schema, rows=row_values)
def test_can_minimize_based_on_two_columns_independently(
        disable_fill, non_standard_index):
    """The minimal frame with a True in each of A, B and C has a single row."""
    column_names = ["A", "B", "C"]

    def fill_for(name):
        # Columns listed in ``disable_fill`` get an empty fill strategy.
        return st.nothing() if name in disable_fill else None

    columns = [
        pdst.column(name, dtype=bool, fill=fill_for(name))
        for name in column_names
    ]
    index = pdst.indexes(dtype=int) if non_standard_index else None

    def every_column_has_a_true(frame):
        return frame["A"].any() and frame["B"].any() and frame["C"].any()

    x = minimal(
        pdst.data_frames(columns, index=index),
        every_column_has_a_true,
        random=Random(0),
    )

    assert len(x["A"]) == 1
    for name in column_names:
        assert x[name][0] == 1
def multiindex_strategy(
    pandas_dtype: Optional[PandasDtype] = None,
    strategy: Optional[SearchStrategy] = None,
    *,
    indexes: Optional[List] = None,
    size: Optional[int] = None,
):
    """Strategy to generate a pandas MultiIndex object.

    :param pandas_dtype: :class:`pandera.dtypes.PandasDtype` instance. Not
        read by this function.
    :param strategy: must be left unset; this is a base strategy and cannot
        be chained onto a parent strategy.
    :param indexes: a list of :class:`~pandera.schema_components.Index`
        objects.
    :param size: number of elements in the MultiIndex.
    :returns: ``hypothesis`` strategy.
    :raises BaseStrategyOnlyError: if ``strategy`` is specified.
    """
    if strategy:
        raise BaseStrategyOnlyError(
            "The dataframe strategy is a base strategy. You cannot specify "
            "the strategy argument to chain it to a parent strategy."
        )
    if indexes is None:
        indexes = []

    # Unnamed indexes are keyed by their position in ``indexes``.
    index_dtypes = {}
    nullable_index = {}
    for position, index in enumerate(indexes):
        key = position if index.name is None else index.name
        index_dtypes[key] = index.dtype
        nullable_index[key] = index.nullable

    def cast_to_index_dtypes(frame):
        return frame.astype(index_dtypes)

    row_index = pdst.range_indexes(
        min_size=0 if size is None else size, max_size=size
    )
    components = [index.strategy_component() for index in indexes]
    strategy = pdst.data_frames(components, index=row_index).map(
        cast_to_index_dtypes
    )

    if any(nullable_index.values()):
        strategy = null_dataframe_masks(strategy, nullable_index)
    return strategy.map(pd.MultiIndex.from_frame)
def df_strategy():
    """
    Build a hypothesis strategy that generates 1-20 row dataframes.

    Should be treated like a fixture, but should not be passed as a fixture
    into a test function. Instead::

        @given(df=df_strategy())
        def test_function(df):
            # test goes here
    """
    frame_columns = [
        column("a", elements=st.integers()),
        column("Bell__Chart", elements=st.floats()),
        column("decorated-elephant", elements=st.integers()),
        column("animals@#$%^", elements=st.text()),
        column("cities", st.text()),
    ]
    row_index = range_indexes(min_size=1, max_size=20)
    return data_frames(columns=frame_columns, index=row_index)
def dataframe(draw):
    """Draw a data frame with 1-20 uniquely named columns.

    Each column's dtype is drawn from the floating, integer and unicode
    string dtype strategies in ``np_strategies``; each name is a text or
    integer value.

    :param draw: callable used to draw a value from a strategy.
    :returns: the drawn data frame.
    """
    n_cols = draw(integers(min_value=1, max_value=20))
    exact_size = dict(min_size=n_cols, max_size=n_cols)

    any_dtype = one_of(
        np_strategies.floating_dtypes(),
        np_strategies.integer_dtypes(),
        np_strategies.unicode_string_dtypes(),
    )
    column_types = draw(lists(any_dtype, **exact_size))
    column_names = draw(lists(text() | integers(), unique=True, **exact_size))

    specs = [
        column(name=col_name, dtype=col_type)
        for col_type, col_name in zip(column_types, column_names)
    ]
    return draw(data_frames(columns=specs))
def test_can_minimize_based_on_two_columns_independently(
    disable_fill, non_standard_index
):
    """The minimal frame with a True in each of A, B and C has a single row."""
    column_names = ['A', 'B', 'C']

    def fill_for(name):
        # Columns listed in ``disable_fill`` get an empty fill strategy.
        return st.nothing() if name in disable_fill else None

    columns = [
        pdst.column(name, dtype=bool, fill=fill_for(name))
        for name in column_names
    ]
    index = pdst.indexes(dtype=int) if non_standard_index else None

    def every_column_has_a_true(frame):
        return frame['A'].any() and frame['B'].any() and frame['C'].any()

    x = minimal(
        pdst.data_frames(columns, index=index),
        every_column_has_a_true,
        random=Random(0),
    )

    assert len(x['A']) == 1
    for name in column_names:
        assert x[name][0] == 1
def dataframe_and_clusters(draw, length=None):
    """Draw a data frame and one cluster label per row.

    :param draw: callable used to draw a value from a strategy.
    :param length: not read by this function.
    :returns: tuple ``(df, cluster_labels)`` where ``cluster_labels`` is a
        list of integers in 0-3 with the same length as ``df``.
    """
    n_cols = draw(integers(min_value=1, max_value=20))
    exact_size = dict(min_size=n_cols, max_size=n_cols)

    column_types = draw(lists(sampled_from([float, int, str]), **exact_size))
    column_names = draw(lists(text() | integers(), unique=True, **exact_size))

    specs = [
        column(name=col_name, dtype=col_type)
        for col_type, col_name in zip(column_types, column_names)
    ]
    df = draw(data_frames(columns=specs))

    n_rows = len(df)
    cluster_labels = draw(
        lists(
            integers(min_value=0, max_value=3),
            min_size=n_rows,
            max_size=n_rows,
        ))
    return df, cluster_labels
class TestExogenous:
    """Property-based tests for the ``Exogenous`` transformer."""

    @given(giotto_time_series(min_length=2))
    def test_exogenous_single_column(self, time_series: pd.DataFrame):
        """The transformed single-column frame equals the input, names aside."""
        result = Exogenous().fit_transform(time_series)
        result.columns = ["time_series"]

        assert_frame_equal(result, time_series, check_names=False)

    @given(data_frames([column("A", dtype=int), column("B", dtype=float)]))
    def test_multiple_columns(self, time_series: pd.DataFrame):
        """The transformed two-column frame equals the input, names aside."""
        result = Exogenous().fit_transform(time_series)
        result.columns = ["A", "B"]

        assert_frame_equal(result, time_series, check_names=False)

    @given(giotto_time_series(min_length=2))
    def test_naming(self, time_series: pd.DataFrame):
        """Every output column is the input column name plus '__Exogenous'."""
        result = Exogenous().fit_transform(time_series)

        expected_columns = [
            f"{column_name}__Exogenous" for column_name in time_series.columns
        ]
        assert expected_columns == list(result.columns)
from __future__ import division, print_function, absolute_import

import numpy as np
import pytest

import hypothesis.strategies as st
import hypothesis.extra.numpy as npst
import hypothesis.extra.pandas as pdst
from hypothesis import HealthCheck, given, reject, settings
from hypothesis.types import RandomWithSeed as Random
from tests.common.debug import minimal, find_any
from tests.pandas.helpers import supported_by_pandas


# Columns declared with different dtypes must keep those dtypes in the frame.
@given(pdst.data_frames([
    pdst.column('a', dtype=int),
    pdst.column('b', dtype=float),
]))
def test_can_have_columns_of_distinct_types(df):
    assert df['a'].dtype == np.dtype(int)
    assert df['b'].dtype == np.dtype(float)


# The index size bounds (1 to 5 rows) must carry through to the frame length.
@given(pdst.data_frames(
    [pdst.column(dtype=int)],
    index=pdst.range_indexes(min_size=1, max_size=5)))
def test_respects_size_bounds(df):
    assert 1 <= len(df) <= 5


@given(pdst.data_frames(pdst.columns(['A', 'B'], dtype=float)))
def test_can_specify_just_column_names(df):
variable_names.extend([ 'S2e_X', 'S2e_Y', 'S2e_Z', 'S2e_R', 'S2e_Phi', 'S2q_X', 'S2q_Y', 'S2q_Z', 'S2q_R', 'S2q_Phi', 'XY', 'S2e_XY', 'S2q_XY' ]) for k in out_bins: assert k in variable_names kdst_variables = [ 'nS2', 'S1w', 'S1h', 'S1e', 'S1t', 'S2w', 'S2h', 'S2e', 'S2q', 'S2t', 'Nsipm', 'DT', 'Z', 'X', 'Y', 'R', 'Phi', 'Zrms', 'Xrms', 'Yrms' ] @given( data_frames( columns=columns(kdst_variables, elements=floats(allow_nan=False)))) @settings(deadline=None) def test_fill_kdst_var_1d(kdst): var_dict = defaultdict(list) monf.fill_kdst_var_1d(kdst, var_dict) for var in var_dict: value = kdst[var].values if var in ['S1t', 'S2t', 'S1w']: value = value / units.mus assert np.allclose(value, var_dict[var]) @given( data_frames( columns=columns(kdst_variables, elements=floats(allow_nan=False))))
# # END HEADER from __future__ import absolute_import, division, print_function import numpy as np import hypothesis.extra.numpy as npst import hypothesis.extra.pandas as pdst import hypothesis.strategies as st from hypothesis import HealthCheck, given, reject, settings from tests.common.debug import find_any from tests.pandas.helpers import supported_by_pandas @given(pdst.data_frames([pdst.column("a", dtype=int), pdst.column("b", dtype=float)])) def test_can_have_columns_of_distinct_types(df): assert df["a"].dtype == np.dtype(int) assert df["b"].dtype == np.dtype(float) @given( pdst.data_frames( [pdst.column(dtype=int)], index=pdst.range_indexes(min_size=1, max_size=5) ) ) def test_respects_size_bounds(df): assert 1 <= len(df) <= 5 @given(pdst.data_frames(pdst.columns(["A", "B"], dtype=float)))