def test_plot_records_based_equiv_average_chisquare(
        df_treatment, df_pool, col_name, equivalence_feature_ids,
        equivalence_feature_matrix):
    """Exercise ``plot_records_based_equiv_average`` with the chi-square method.

    Adds ``col2`` (2x) and ``col3`` (3x) derived from ``col_name`` to both
    frames, stratifies on all three columns, runs the bin selector with
    2 to 3 bins, and checks that the first ``n_bin_results`` entry of the
    JSON results has a ``bins_selected_str`` key.
    """
    # Treatment first, then pool -- each frame gets the same derived columns.
    for frame in (df_treatment, df_pool):
        frame["col2"] = frame[col_name] * 2
        frame["col3"] = frame[col_name] * 3

    sampler = StratifiedSampling()
    for column in (col_name, "col2", "col3"):
        sampler.add_column(column)

    selector = StratifiedSamplingBinSelector(
        sampler,
        df_treatment,
        df_pool,
        min_n_bins=2,
        max_n_bins=3,
        random_seed=1,
        equivalence_method='chisquare',
        equivalence_feature_ids=equivalence_feature_ids,
        equivalence_feature_matrix=equivalence_feature_matrix,
    )
    selector.plot_records_based_equiv_average(plot=False)

    first_result = selector.results_as_json()['n_bin_results'][0]
    assert 'bins_selected_str' in first_result.keys()
def test_stratified_sampling_fit_and_sample_records_equivalence_idempotent_check(
        df_treatment, df_pool, col_name, equivalence_feature_ids,
        equivalence_feature_matrix):
    """Check that two identically configured runs select the same records.

    Runs the chi-square bin selector twice, each time on a fresh
    ``StratifiedSampling`` object with the same inputs and ``random_seed=1``,
    and asserts that the sets of sampled index values are equal.
    """
    for frame in (df_treatment, df_pool):
        frame["col2"] = frame[col_name] * 2
        frame["col3"] = frame[col_name] * 3

    def sampled_index_values():
        """Run one selection on a new sampler and return its sample index."""
        sampler = StratifiedSampling()
        for column in (col_name, "col2", "col3"):
            sampler.add_column(column)

        # The selector itself is discarded; the test only reads the
        # sampler's ``data_sample`` after the selector has been constructed.
        StratifiedSamplingBinSelector(
            sampler,
            df_treatment,
            df_pool,
            min_n_bins=2,
            max_n_bins=3,
            random_seed=1,
            equivalence_method='chisquare',
            equivalence_feature_ids=equivalence_feature_ids,
            equivalence_feature_matrix=equivalence_feature_matrix,
        )
        return sampler.data_sample.df.index.values

    first_run = sampled_index_values()
    second_run = sampled_index_values()

    # Compared as sets, so row order is not part of the check.
    assert set(first_run) == set(second_run)
def test_plot_records_based_equiv_average(df_treatment, df_pool, col_name,
                                          equivalence_feature_ids,
                                          equivalence_feature_matrix):
    """Smoke-test ``plot_records_based_equiv_average`` with the euclidean method.

    Adds ``col2`` (2x) and ``col3`` (3x) derived from ``col_name`` to both
    frames, stratifies on all three columns, runs the bin selector with
    2 to 3 bins, then calls the plotting method and ``results_as_json``.
    Nothing is asserted: the test passes if none of these calls raises.
    """
    for frame in (df_treatment, df_pool):
        frame["col2"] = frame[col_name] * 2
        frame["col3"] = frame[col_name] * 3

    sampler = StratifiedSampling()
    for column in (col_name, "col2", "col3"):
        sampler.add_column(column)

    selector = StratifiedSamplingBinSelector(
        sampler,
        df_treatment,
        df_pool,
        min_n_bins=2,
        max_n_bins=3,
        random_seed=1,
        equivalence_method='euclidean',
        equivalence_feature_ids=equivalence_feature_ids,
        equivalence_feature_matrix=equivalence_feature_matrix,
    )
    selector.plot_records_based_equiv_average(plot=False)
    # Return value intentionally unused; only the call itself is exercised.
    selector.results_as_json()
def test_stratified_sampling_fit_and_sample_records_equivalence_too_many_bins(
        df_treatment, df_pool, col_name, equivalence_feature_ids,
        equivalence_feature_matrix):
    """Expect ``ModelSamplingException`` for a very large bin-count range.

    Stratifies on ``col_name`` only and constructs the chi-square bin
    selector with ``min_n_bins=1000``, ``max_n_bins=1002`` and
    ``relax_n_samples_approx_constraint=False``. The construction itself
    must raise ``ModelSamplingException``.
    """
    stratified_sampling_obj = StratifiedSampling()
    stratified_sampling_obj.add_column(col_name)

    # The bin range is presumably far larger than the fixture data can
    # support (per the test name) -- the constructor is expected to fail.
    # The result is not bound to a name: it is never produced on the
    # expected path and was never read.
    with pytest.raises(ModelSamplingException):
        StratifiedSamplingBinSelector(
            stratified_sampling_obj,
            df_treatment,
            df_pool,
            min_n_bins=1000,
            max_n_bins=1002,
            random_seed=1,
            equivalence_method='chisquare',
            relax_n_samples_approx_constraint=False,
            equivalence_feature_ids=equivalence_feature_ids,
            equivalence_feature_matrix=equivalence_feature_matrix,
        )
def test_stratified_sampling_fit_and_sample_records_equivalence(
        df_treatment, df_pool, col_name, equivalence_feature_ids,
        equivalence_feature_matrix):
    """Run chi-square bin selection over two stratification columns.

    ``col2`` is added to both frames as a plain copy of ``col_name``; the
    sampler stratifies on both columns and the selector runs with 4 to 6
    bins. The resulting sample frame and the diagnostics bin counts are
    then fetched from the sampler.
    """
    # Pool first, then treatment; col2 duplicates col_name unchanged.
    for frame in (df_pool, df_treatment):
        frame["col2"] = frame[col_name]

    stratified_sampling_obj = StratifiedSampling()
    for column in (col_name, "col2"):
        stratified_sampling_obj.add_column(column)

    # attempting to estimate both n_bins and n_samples
    StratifiedSamplingBinSelector(
        stratified_sampling_obj,
        df_treatment,
        df_pool,
        min_n_bins=4,
        max_n_bins=6,
        random_seed=1,
        equivalence_method='chisquare',
        equivalence_feature_ids=equivalence_feature_ids,
        equivalence_feature_matrix=equivalence_feature_matrix,
    )

    # NOTE(review): no assertions on these two values are visible in this
    # block -- confirm whether checks on them were intended.
    output = stratified_sampling_obj.data_sample.df
    bins_df = stratified_sampling_obj.diagnostics().count_bins()