def test_stratified_sampling_fit_and_sample_n_samples_approx_limit( df_treatment, df_pool, col_name): stratified_sampling_obj = StratifiedSampling() col_name = "col1" df_treatment = pd.DataFrame([{ "id": f"id_{x}", col_name: x } for x in (list(np.arange(0, 2, 0.1)) + list(np.arange(2, 4, 0.5)) + list(np.arange(4, 6, 1)) + list(np.arange(6, 10, 0.2)))]) df_pool = pd.DataFrame([{ "id": f"id_{x}", col_name: x } for x in np.arange(0, 20, 0.01)]) stratified_sampling_obj.add_column(col_name) n_samples_approx = 40 stratified_sampling_obj.fit_and_sample( df_treatment, df_pool, n_samples_approx=n_samples_approx, random_seed=1, min_n_sampled_to_n_treatment_ratio=None, ) output = stratified_sampling_obj.data_sample.df assert output["_bin_label"].nunique() == 2 bins_df = stratified_sampling_obj.diagnostics().count_bins() assert abs(len(output) - n_samples_approx) <= 1
def test_plot_records_based_equiv_average_chisquare( df_treatment, df_pool, col_name, equivalence_feature_ids, equivalence_feature_matrix): df_treatment["col2"] = df_treatment[col_name] * 2 df_treatment["col3"] = df_treatment[col_name] * 3 df_pool["col2"] = df_pool[col_name] * 2 df_pool["col3"] = df_pool[col_name] * 3 stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) stratified_sampling_obj.add_column("col2") stratified_sampling_obj.add_column("col3") bin_selection = StratifiedSamplingBinSelector( stratified_sampling_obj, df_treatment, df_pool, min_n_bins=2, max_n_bins=3, random_seed=1, equivalence_method='chisquare', equivalence_feature_ids=equivalence_feature_ids, equivalence_feature_matrix=equivalence_feature_matrix) bin_selection.plot_records_based_equiv_average(plot=False) results = bin_selection.results_as_json() assert 'bins_selected_str' in list(results['n_bin_results'][0].keys())
def test_plot_records_based_equiv_average(df_treatment, df_pool, col_name, equivalence_feature_ids, equivalence_feature_matrix): df_treatment["col2"] = df_treatment[col_name] * 2 df_treatment["col3"] = df_treatment[col_name] * 3 df_pool["col2"] = df_pool[col_name] * 2 df_pool["col3"] = df_pool[col_name] * 3 stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) stratified_sampling_obj.add_column("col2") stratified_sampling_obj.add_column("col3") bin_selection = StratifiedSamplingBinSelector( stratified_sampling_obj, df_treatment, df_pool, min_n_bins=2, max_n_bins=3, random_seed=1, equivalence_method='euclidean', equivalence_feature_ids=equivalence_feature_ids, equivalence_feature_matrix=equivalence_feature_matrix) bin_selection.plot_records_based_equiv_average(plot=False) bin_selection.results_as_json()
def diagnostics_obj(df_treatment, df_pool, col_name): stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name, n_bins=4) stratified_sampling_obj.fit_and_sample( df_treatment, df_pool, n_samples_approx=len(df_treatment), random_seed=1 ) return stratified_sampling_obj.diagnostics()
def test_stratified_sampling_fit_and_sample_random_seed_check(): # perturb was returning different values since it was writing over the # df rather than using a copy df_comparison = pd.DataFrame([{ "id": f"id-{x}", "baseline_annual_kwh": np.random.random() * 10000, "baseline_bd_pct_heating_load": np.random.random(), } for x in range(0, 200000)]) df_treatment = pd.DataFrame([{ "id": f"id-{x}", "baseline_annual_kwh": np.random.random() * 10000, "baseline_bd_pct_heating_load": np.random.random(), } for x in range(0, 500)]) n_samples_approx = 500 random_seed = 1 stratification_params = [ "baseline_annual_kwh", "baseline_bd_pct_heating_load" ] model = StratifiedSampling(treatment_label="treatment", pool_label="comparison", output_name="control") [model.add_column(col) for col in stratification_params] model.fit(df_treatment, min_n_treatment_per_bin=0) model.sample(df_comparison, n_samples_approx=n_samples_approx, random_seed=random_seed) for run_num in range(0, 10): model_temp = StratifiedSampling(treatment_label="treatment", pool_label="comparison", output_name="control") [model_temp.add_column(col) for col in stratification_params] model_temp.fit(df_treatment, min_n_treatment_per_bin=0) model_temp.sample(df_comparison, n_samples_approx=n_samples_approx, random_seed=random_seed) pd.testing.assert_frame_equal( model_temp.data_sample.df[stratification_params + ["id"]], model.data_sample.df[stratification_params + ["id"]], ) assert (len( set(model_temp.data_sample.df["id"].values) - set(model.data_sample.df["id"].values)) == 0)
def test_stratified_sampling_fit_and_sample(): stratified_sampling_obj = StratifiedSampling() df_treatment = pd.DataFrame([{ "id": f"id_{x}", "col1": x } for x in range(0, 10)]) df_pool = pd.DataFrame([{ "id": f"id_{x}", "col1": x / 2.0 } for x in range(0, 1000)]) stratified_sampling_obj.add_column("col1") stratified_sampling_obj.fit_and_sample( df_treatment, df_pool, n_samples_approx=10, random_seed=1, min_n_sampled_to_n_treatment_ratio=None, ) sample1 = stratified_sampling_obj.data_sample.df.index.values stratified_sampling_obj.fit_and_sample( df_treatment, df_pool, n_samples_approx=10, random_seed=1, min_n_sampled_to_n_treatment_ratio=None, ) sample2 = stratified_sampling_obj.data_sample.df.index.values assert set(sample1) == set(sample2) stratified_sampling_obj.fit_and_sample( df_treatment, df_pool, n_samples_approx=10, random_seed=1, min_n_sampled_to_n_treatment_ratio=None, ) sample1 = stratified_sampling_obj.data_sample.df.index.values stratified_sampling_obj.fit_and_sample( df_treatment, df_pool, n_samples_approx=10, random_seed=5, min_n_sampled_to_n_treatment_ratio=None, ) sample2 = stratified_sampling_obj.data_sample.df.index.values assert set(sample1) != set(sample2)
def test_stratified_sampling_fit_and_sample_too_many_bins( df_treatment, df_pool, col_name): df_treatment["col2"] = df_treatment[col_name].astype(int) df_pool["col2"] = df_pool[col_name].astype(int) df_treatment["col3"] = df_treatment[col_name].astype(int) * 2 df_pool["col3"] = df_pool[col_name].astype(int) / 2 stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) stratified_sampling_obj.add_column("col2") stratified_sampling_obj.add_column("col3") ## attempting to estimate both n_bins and n_samples with pytest.raises(ValueError): stratified_sampling_obj.fit_and_sample(df_treatment, df_pool, random_seed=1)
def test_stratified_sampling_fit_and_sample_n_samples_approx_limit( df_treatment, df_pool, col_name): stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) n_samples_approx = 40 stratified_sampling_obj.fit_and_sample(df_treatment, df_pool, n_samples_approx=n_samples_approx, random_seed=1) output = stratified_sampling_obj.data_sample.df assert output["_bin_label"].nunique() == 2 bins_df = stratified_sampling_obj.diagnostics().count_bins() assert (bins_df["n_sampled"] / bins_df["n_pct_sampled"]).round() == n_samples_approx
def test_stratified_sampling_fit_and_sample_records_equivalence_too_many_bins( df_treatment, df_pool, col_name, equivalence_feature_ids, equivalence_feature_matrix): stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) ## attempting to estimate both n_bins and n_samples with pytest.raises(ModelSamplingException): model_w_selected_bins = StratifiedSamplingBinSelector( stratified_sampling_obj, df_treatment, df_pool, min_n_bins=1000, max_n_bins=1002, random_seed=1, equivalence_method='chisquare', relax_n_samples_approx_constraint=False, equivalence_feature_ids=equivalence_feature_ids, equivalence_feature_matrix=equivalence_feature_matrix)
def test_stratified_sampling_fit_and_sample_upper_limit_n_samples_approx( df_treatment, df_pool, col_name): stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) ## attempting to estimate both n_bins and n_samples with pytest.raises(ModelSamplingException): stratified_sampling_obj.fit_and_sample(df_treatment, df_pool, random_seed=1, n_samples_approx=1000) stratified_sampling_obj.fit_and_sample( df_treatment, df_pool, random_seed=1, n_samples_approx=1000, relax_n_samples_approx_constraint=True, ) output = stratified_sampling_obj.data_sample.df bins_df = stratified_sampling_obj.diagnostics().count_bins() assert not output.empty
def test_stratified_sampling_fit_and_sample_records_equivalence( df_treatment, df_pool, col_name, equivalence_feature_ids, equivalence_feature_matrix): stratified_sampling_obj = StratifiedSampling() df_pool["col2"] = df_pool[col_name] df_treatment["col2"] = df_treatment[col_name] stratified_sampling_obj.add_column(col_name) stratified_sampling_obj.add_column("col2") ## attempting to estimate both n_bins and n_samples StratifiedSamplingBinSelector( stratified_sampling_obj, df_treatment, df_pool, min_n_bins=4, max_n_bins=6, random_seed=1, equivalence_method='chisquare', equivalence_feature_ids=equivalence_feature_ids, equivalence_feature_matrix=equivalence_feature_matrix) output = stratified_sampling_obj.data_sample.df bins_df = stratified_sampling_obj.diagnostics().count_bins()
def test_stratified_sampling_fit_and_sample_n_samples_approx_variations( df_treatment, df_pool, col_name): stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) ## attempting to estimate both n_bins and n_samples stratified_sampling_obj.fit_and_sample(df_treatment, df_pool, random_seed=1) output = stratified_sampling_obj.data_sample.df bins_df = stratified_sampling_obj.diagnostics().count_bins() assert len(bins_df) == 3 ## enforcing 1 bin stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name, n_bins=1) stratified_sampling_obj.fit_and_sample(df_treatment, df_pool, random_seed=1) output = stratified_sampling_obj.data_sample.df bins_df = stratified_sampling_obj.diagnostics().count_bins() ## enforcing 4 bins stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name, n_bins=4) stratified_sampling_obj.fit_and_sample(df_treatment, df_pool, random_seed=1) output = stratified_sampling_obj.data_sample.df bins_df = stratified_sampling_obj.diagnostics().count_bins() assert len(bins_df) == 4 ## enforcing n_samples_approx=40 stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) stratified_sampling_obj.fit_and_sample( df_treatment, df_pool, n_samples_approx=40, random_seed=1, min_n_sampled_to_n_treatment_ratio=None, ) output = stratified_sampling_obj.data_sample.df bins_df = stratified_sampling_obj.diagnostics().count_bins() # should be within 1 of n_samples_approx assert abs(len(output) - 40) <= 1
def test_stratified_sampling_fit_and_sample_records_equivalence_idempotent_check( df_treatment, df_pool, col_name, equivalence_feature_ids, equivalence_feature_matrix): df_treatment["col2"] = df_treatment[col_name] * 2 df_treatment["col3"] = df_treatment[col_name] * 3 df_pool["col2"] = df_pool[col_name] * 2 df_pool["col3"] = df_pool[col_name] * 3 stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) stratified_sampling_obj.add_column("col2") stratified_sampling_obj.add_column("col3") StratifiedSamplingBinSelector( stratified_sampling_obj, df_treatment, df_pool, min_n_bins=2, max_n_bins=3, random_seed=1, equivalence_method='chisquare', equivalence_feature_ids=equivalence_feature_ids, equivalence_feature_matrix=equivalence_feature_matrix) sample1 = stratified_sampling_obj.data_sample.df.index.values stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) stratified_sampling_obj.add_column("col2") stratified_sampling_obj.add_column("col3") StratifiedSamplingBinSelector( stratified_sampling_obj, df_treatment, df_pool, min_n_bins=2, max_n_bins=3, random_seed=1, equivalence_method='chisquare', equivalence_feature_ids=equivalence_feature_ids, equivalence_feature_matrix=equivalence_feature_matrix) sample2 = stratified_sampling_obj.data_sample.df.index.values assert set(sample1) == set(sample2)
def test_stratified_sampling_fit_and_sample_dont_require_equivalence( df_treatment, df_pool, col_name): df_treatment["col2"] = df_treatment[col_name].astype(int) df_pool["col2"] = df_pool[col_name].astype(int) df_treatment["col3"] = df_treatment[col_name].astype(int) * 2 df_pool["col3"] = df_pool[col_name].astype(int) / 2 stratified_sampling_obj = StratifiedSampling() stratified_sampling_obj.add_column(col_name) stratified_sampling_obj.add_column("col2") stratified_sampling_obj.add_column("col3", auto_bin_require_equivalence=False) ## attempting to estimate both n_bins and n_samples stratified_sampling_obj.fit_and_sample(df_treatment, df_pool, random_seed=1) output = stratified_sampling_obj.data_sample.df bins_df = stratified_sampling_obj.diagnostics().count_bins() assert not output.empty