def test_stratified_sampling_fit_and_sample_n_samples_approx_limit(
        df_treatment, df_pool, col_name):
    """Sample ~40 rows from an unevenly spaced treatment set and check the size.

    The three fixture arguments are overwritten with locally built data
    before use, so the fixture values themselves are never read.

    NOTE(review): another function with this exact name is defined later in
    this file; if both live in the same module the later definition replaces
    this one and pytest will not collect it -- confirm and rename one of them.
    """
    sampler = StratifiedSampling()
    col_name = "col1"

    def as_records(values):
        # One record per value; the id is derived from the value's own text form.
        return [{"id": f"id_{v}", col_name: v} for v in values]

    # Treatment values are deliberately spaced at different densities per range.
    treatment_ranges = (
        np.arange(0, 2, 0.1),
        np.arange(2, 4, 0.5),
        np.arange(4, 6, 1),
        np.arange(6, 10, 0.2),
    )
    treatment_values = [v for chunk in treatment_ranges for v in chunk]
    df_treatment = pd.DataFrame(as_records(treatment_values))
    df_pool = pd.DataFrame(as_records(np.arange(0, 20, 0.01)))

    sampler.add_column(col_name)
    n_samples_approx = 40
    sampler.fit_and_sample(
        df_treatment,
        df_pool,
        n_samples_approx=n_samples_approx,
        random_seed=1,
        min_n_sampled_to_n_treatment_ratio=None,
    )

    output = sampler.data_sample.df
    assert output["_bin_label"].nunique() == 2

    # Result is unused; the call still exercises the diagnostics path.
    bins_df = sampler.diagnostics().count_bins()

    # The sample size may miss the requested size by at most one row.
    assert -1 <= len(output) - n_samples_approx <= 1
def diagnostics_obj(df_treatment, df_pool, col_name):
    """Fit a 4-bin sampler on ``col_name`` and return its diagnostics object.

    The requested sample size equals the number of treatment rows.
    NOTE(review): presumably used as a pytest fixture -- no decorator is
    visible here, confirm.
    """
    target_size = len(df_treatment)
    sampler = StratifiedSampling()
    sampler.add_column(col_name, n_bins=4)
    sampler.fit_and_sample(
        df_treatment,
        df_pool,
        n_samples_approx=target_size,
        random_seed=1,
    )
    diagnostics = sampler.diagnostics()
    return diagnostics
def test_stratified_sampling_fit_and_sample_too_many_bins(
        df_treatment, df_pool, col_name):
    """Expect ``ValueError`` when three columns are auto-fitted together.

    Two integer-derived columns are added in place to both input frames;
    "col3" is scaled differently in treatment (x2) and pool (/2).
    """
    treatment_int = df_treatment[col_name].astype(int)
    df_treatment["col2"] = treatment_int
    pool_int = df_pool[col_name].astype(int)
    df_pool["col2"] = pool_int
    df_treatment["col3"] = treatment_int * 2
    df_pool["col3"] = pool_int / 2

    sampler = StratifiedSampling()
    for column in (col_name, "col2", "col3"):
        sampler.add_column(column)

    # Neither bin counts nor a sample size are given, so both must be estimated.
    with pytest.raises(ValueError):
        sampler.fit_and_sample(df_treatment, df_pool, random_seed=1)
def test_stratified_sampling_fit_and_sample_n_samples_approx_limit(
        df_treatment, df_pool, col_name):
    """Request ~40 samples and check the per-bin counts imply that total.

    For every bin row, ``n_sampled / n_pct_sampled`` (rounded) is expected to
    equal the requested sample size.

    NOTE(review): an earlier function in this file has the same name; if both
    live in the same module this later definition replaces the earlier one at
    import time -- confirm and rename one of them.
    """
    stratified_sampling_obj = StratifiedSampling()
    stratified_sampling_obj.add_column(col_name)
    n_samples_approx = 40
    stratified_sampling_obj.fit_and_sample(
        df_treatment,
        df_pool,
        n_samples_approx=n_samples_approx,
        random_seed=1,
    )
    output = stratified_sampling_obj.data_sample.df
    assert output["_bin_label"].nunique() == 2

    bins_df = stratified_sampling_obj.diagnostics().count_bins()
    implied_totals = (bins_df["n_sampled"] / bins_df["n_pct_sampled"]).round()
    # The comparison is element-wise (assuming count_bins() returns a pandas
    # DataFrame). Asserting the resulting boolean Series directly raises
    # "truth value of a Series is ambiguous" once there is more than one bin
    # row, so reduce it explicitly: every bin must satisfy the equality.
    assert (implied_totals == n_samples_approx).all()
def test_stratified_sampling_fit_and_sample_dont_require_equivalence(
        df_treatment, df_pool, col_name):
    """Three-column auto-fit yields a sample when "col3" opts out of equivalence.

    Two integer-derived columns are added in place to both input frames;
    "col3" is registered with ``auto_bin_require_equivalence=False``.
    """
    treatment_int = df_treatment[col_name].astype(int)
    df_treatment["col2"] = treatment_int
    pool_int = df_pool[col_name].astype(int)
    df_pool["col2"] = pool_int
    df_treatment["col3"] = treatment_int * 2
    df_pool["col3"] = pool_int / 2

    sampler = StratifiedSampling()
    sampler.add_column(col_name)
    sampler.add_column("col2")
    sampler.add_column("col3", auto_bin_require_equivalence=False)

    # Neither bin counts nor a sample size are given, so both must be estimated.
    sampler.fit_and_sample(df_treatment, df_pool, random_seed=1)

    output = sampler.data_sample.df
    # Result is unused; the call still exercises the diagnostics path.
    bins_df = sampler.diagnostics().count_bins()
    assert not output.empty
def test_stratified_sampling_fit_and_sample_upper_limit_n_samples_approx(
        df_treatment, df_pool, col_name):
    """Requesting 1000 samples fails unless the constraint is relaxed.

    The first call must raise ``ModelSamplingException``; the same request
    with ``relax_n_samples_approx_constraint=True`` must produce a non-empty
    sample.
    """
    sampler = StratifiedSampling()
    sampler.add_column(col_name)

    request = {"random_seed": 1, "n_samples_approx": 1000}

    # Strict mode: the request is expected to be rejected.
    with pytest.raises(ModelSamplingException):
        sampler.fit_and_sample(df_treatment, df_pool, **request)

    # Relaxed mode: the same request is expected to go through.
    sampler.fit_and_sample(
        df_treatment,
        df_pool,
        relax_n_samples_approx_constraint=True,
        **request,
    )

    output = sampler.data_sample.df
    # Result is unused; the call still exercises the diagnostics path.
    bins_df = sampler.diagnostics().count_bins()
    assert not output.empty
def test_stratified_sampling_fit_and_sample():
    """Same seed gives the same sampled index set; a different seed does not."""
    sampler = StratifiedSampling()
    treatment = pd.DataFrame(
        [{"id": f"id_{i}", "col1": i} for i in range(0, 10)]
    )
    pool = pd.DataFrame(
        [{"id": f"id_{i}", "col1": i / 2.0} for i in range(0, 1000)]
    )
    sampler.add_column("col1")

    def sampled_index(seed):
        # Refit on the same data with the given seed and read back the
        # index labels of the resulting sample.
        sampler.fit_and_sample(
            treatment,
            pool,
            n_samples_approx=10,
            random_seed=seed,
            min_n_sampled_to_n_treatment_ratio=None,
        )
        return sampler.data_sample.df.index.values

    # Two runs with seed 1 must select the same rows.
    first_run = sampled_index(1)
    repeat_run = sampled_index(1)
    assert set(first_run) == set(repeat_run)

    # Seed 1 versus seed 5 must select different rows.
    baseline_run = sampled_index(1)
    reseeded_run = sampled_index(5)
    assert set(baseline_run) != set(reseeded_run)
def test_stratified_sampling_fit_and_sample_n_samples_approx_variations(
        df_treatment, df_pool, col_name):
    """Run the sampler under several bin-count / sample-size configurations.

    Each scenario uses a fresh ``StratifiedSampling`` with a single column
    and ``random_seed=1``.
    """

    def run_scenario(column_options, fit_options):
        # Fresh sampler per scenario; returns (sampled frame, bin counts).
        sampler = StratifiedSampling()
        sampler.add_column(col_name, **column_options)
        sampler.fit_and_sample(df_treatment, df_pool, **fit_options)
        sampled = sampler.data_sample.df
        bin_counts = sampler.diagnostics().count_bins()
        return sampled, bin_counts

    seed_only = {"random_seed": 1}

    # Both the bin count and the sample size are left to be estimated.
    output, bins_df = run_scenario({}, seed_only)
    assert len(bins_df) == 3

    # Forcing a single bin: only checked for running without error.
    output, bins_df = run_scenario({"n_bins": 1}, seed_only)

    # Forcing four bins.
    output, bins_df = run_scenario({"n_bins": 4}, seed_only)
    assert len(bins_df) == 4

    # Forcing n_samples_approx=40.
    output, bins_df = run_scenario(
        {},
        {
            "n_samples_approx": 40,
            "random_seed": 1,
            "min_n_sampled_to_n_treatment_ratio": None,
        },
    )
    # The sample size should be within 1 of n_samples_approx.
    assert -1 <= len(output) - 40 <= 1