def test__trend__index_levels(self):
    """ Check if trend() returns the proper index levels """
    np.random.seed(0)
    metrics, metadata = generate_random_data()
    metrics['time_since_treatment'] = metrics['treatment_start_time']
    exp = Experiment('B', metrics, metadata, [4, 6])

    # Perform trend() analysis
    result = exp.trend()

    # Expected index levels
    index_levels = [
        pd.Index([u'normal_same', u'normal_shifted',
                  u'normal_shifted_by_feature', u'normal_unequal_variance'],
                 dtype='object', name=u'metric'),
        pd.Index([u'-'], dtype='object', name=u'subgroup_metric'),
        pd.Index([str(x) for x in np.arange(10.)], dtype='object', name=u'time'),
        pd.Float64Index([], dtype='float64', name=u'subgroup'),
        pd.Index([u'sample_size', u'uplift', u'uplift_pctile', u'variant_mean'],
                 dtype='object', name=u'statistic'),
        pd.Float64Index([2.5, 97.5], dtype='float64', name=u'pctile')
    ]
    result_levels = list(result.df.index.levels)

    # Check that all index levels match expectation
    for expected, actual in zip(index_levels, result_levels):
        np.testing.assert_array_equal(expected, actual)
def test_get_weights_hardcoded_data(self):
    ndecimals = 5
    exp = Experiment(self.metadata)
    self.derived_kpi.make_derived_kpi(self.data_dummy_df)
    res = exp._get_weights(self.data_dummy_df, self.test_derived_kpi, 'B')
    self.assertAlmostEqual(res.iloc[0], 1.33333, ndecimals)
    self.assertAlmostEqual(res.iloc[1], 0.66667, ndecimals)
def test__trend__index_levels(self):
    """ Check if trend() returns the proper index levels """
    np.random.seed(0)
    metrics, metadata = generate_random_data()
    metrics['time_since_treatment'] = metrics['treatment_start_time']
    exp = Experiment('B', metrics, metadata, [4, 6])

    # Perform trend() analysis
    result = exp.trend()

    # Expected index levels
    index_levels = [
        pd.Index([u'normal_same', u'normal_shifted',
                  u'normal_shifted_by_feature', u'normal_unequal_variance'],
                 dtype='object', name=u'metric'),
        pd.Index([u'-'], dtype='object', name=u'subgroup_metric'),
        pd.Index(range(10), dtype='object', name=u'time'),
        pd.Float64Index([], dtype='float64', name=u'subgroup'),
        pd.Index([u'sample_size', u'uplift', u'uplift_pctile', u'variant_mean'],
                 dtype='object', name=u'statistic'),
        pd.Float64Index([2.5, 97.5], dtype='float64', name=u'pctile')
    ]
    result_levels = list(result.df.index.levels)

    # Check that all index levels match expectation
    for expected, actual in zip(index_levels, result_levels):
        np.testing.assert_array_equal(expected, actual)
def test__trend__computation(self):
    """ Check if trend() functions properly """
    np.random.seed(0)
    metrics, metadata = generate_random_data()
    metrics['time_since_treatment'] = metrics['treatment_start_time']
    exp = Experiment('B', metrics, metadata, [4, 6])

    # Perform trend() analysis
    result = exp.trend()

    # check uplift
    df = result.statistic('trend', 'uplift', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, ('value', 'A')],
        np.array([-1.009421, -0.847400, -1.119885, -1.042597, -0.868819,
                  -1.091165, -0.952307, -1.028234, -0.978774, -0.985696]),
        decimal=5)

    # check pctile
    df = result.statistic('trend', 'uplift_pctile', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, ('value', 'A')],
        np.array([-1.137482, -0.881360, -0.970678, -0.724122, -1.245795,
                  -0.993975, -1.178494, -0.906699, -0.993683, -0.743954,
                  -1.225361, -0.956969, -1.082180, -0.822435, -1.151715,
                  -0.904753, -1.095209, -0.862340, -1.109407, -0.861985]),
        decimal=5)

    # check sample size
    df = result.statistic('trend', 'sample_size', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, 'value'],
        np.column_stack(([649, 595, 600, 590, 625, 602, 607, 608, 616, 616],
                         [405, 401, 378, 362, 377, 369, 406, 392, 414, 388])),
        decimal=5)

    # check variant_mean
    df = result.statistic('trend', 'variant_mean', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, 'value'],
        np.column_stack(([0.005761, 0.057487, -0.067107, 0.001125, 0.093085,
                          -0.067894, -0.030500, -0.060996, 0.016257, -0.006091],
                         [1.015182, 0.904887, 1.052778, 1.043721, 0.961904,
                          1.023271, 0.921807, 0.967238, 0.995031, 0.979605])),
        decimal=5)
def get_two_multiple_test_suite_results():
    """ Returns two multiple test suite results
    (for testing purposes of the merge_with class method).

    :return: two multiple test suite results
    :rtype: (MultipleTestSuiteResult, MultipleTestSuiteResult)
    """
    data, metadata = generate_random_data()
    exp = Experiment(metadata)

    kpi = KPI('normal_same')
    variants = Variants('variant', 'B', 'A')
    test_normal_same = StatisticalTest(data, kpi, [], variants)

    derived_kpi = DerivedKPI('derived_kpi_one', 'normal_same', 'normal_shifted')
    test_derived_kpi = StatisticalTest(data, derived_kpi, [], variants)

    suite_with_normal_same = StatisticalTestSuite([test_normal_same], CorrectionMethod.BONFERRONI)
    suite_with_derived_kpi = StatisticalTestSuite([test_derived_kpi], CorrectionMethod.BH)

    mtsr_1 = exp.analyze_statistical_test_suite(suite_with_normal_same, test_method='fixed_horizon')
    mtsr_2 = exp.analyze_statistical_test_suite(suite_with_derived_kpi, test_method='fixed_horizon')
    return mtsr_1, mtsr_2
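# Hypothetical follow-up to the helper above: combining the two suite results.
# The docstring states that merge_with is a class method of
# MultipleTestSuiteResult; the import path and exact call signature below are
# assumptions, not confirmed ExpAn API.
from expan.core.results import MultipleTestSuiteResult  # assumed module path

mtsr_1, mtsr_2 = get_two_multiple_test_suite_results()
merged = MultipleTestSuiteResult.merge_with(mtsr_1, mtsr_2)  # assumed signature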
def run_analysis(features_file, kpis_file, metadata_file):
    """ Load KPIs and features from file and pass them to ExpAn
    to perform delta and subgroup analyses.

    Args:
        features_file: features file path
        kpis_file: kpis file path
        metadata_file: metadata file path

    Returns:
        delta analysis results and subgroup analysis results as a tuple
    """
    kpis = pd.read_csv(kpis_file)
    if features_file:
        features = pd.read_csv(features_file)
    else:
        features = 'default'
    print(features)
    metadata = parse_metadata(metadata_file)
    exp_data = ExperimentData(metrics=kpis, metadata=metadata, features=features)
    exp = Experiment(baseline_variant=metadata['baseline_variant'],
                     metrics_or_kpis=kpis, metadata=metadata, features=features)
    return (exp.delta(), exp.sga())
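# Hypothetical invocation of run_analysis above; the file names are
# placeholders, not files shipped with the library.
delta_result, sga_result = run_analysis('features.csv', 'kpis.csv', 'metadata.json')
print(delta_result)
print(sga_result)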
def test_quantile_filtering_two_sided():
    exp = Experiment({})
    df = pd.DataFrame.from_dict({'earnings': list(range(10))})
    flags = exp._quantile_filtering(df, ['earnings'], {'earnings': ('two-sided', 80.0)})
    results = flags.tolist()
    assert results == [True] + [False] * 8 + [True]
def test_quantile_filtering_lower_old():
    exp = Experiment({})
    # 0 / 0 produces NaN entries, which must never be flagged
    data = np.array([0, 0, 1, 2]) / np.array([0, 0, 1, 1])
    df = pd.DataFrame.from_dict({'earnings': data})
    flags = exp._quantile_filtering(df, ['earnings'], {'earnings': ('lower', 10.)})
    assert flags.tolist() == [False, False, True, False]
def test_quantile_filtering_upper():
    exp = Experiment({})
    # two NaN entries (0 / 0) followed by the values 0..9
    data = np.array([0.0] * 2 + list(range(10))) / np.array([0.0] * 2 + [1.0] * 10)
    df = pd.DataFrame.from_dict({'earnings': data})
    flags = exp._quantile_filtering(df, ['earnings'], {'earnings': ('upper', 90.0)})
    assert flags.tolist() == [False] * 11 + [True]
def test_quantile_filtering():
    exp = Experiment({})
    df = pd.DataFrame.from_dict({
        'earnings': np.array([0, 0, 1, 2]) / np.array([0, 0, 1, 1])
    })
    flags = exp._quantile_filtering(df, ['earnings'], 90, 'upper')
    assert flags.tolist() == [False, False, False, True]
    flags = exp._quantile_filtering(df, ['earnings'], 10, 'lower')
    assert flags.tolist() == [False, False, True, False]
def test_quantile_filtering_two_sided_asym():
    exp = Experiment({})
    data = list(range(-8, 0)) + list(range(16))
    df = pd.DataFrame.from_dict({'earnings': data})
    flags = exp._quantile_filtering(df, ['earnings'], {'earnings': ('two-sided-asym', 50.0)})
    results = flags.tolist()
    assert results == [True] * 2 + [False] * 18 + [True] * 4
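# The tests above exercise ExpAn's internal Experiment._quantile_filtering.
# As a rough illustration of the semantics those tests assume (NOT ExpAn's
# actual implementation), here is a minimal standalone sketch covering the
# 'lower', 'upper' and 'two-sided' modes; the 'two-sided-asym' variant is
# omitted. The function name quantile_filter is hypothetical.
import numpy as np
import pandas as pd


def quantile_filter(series, mode, threshold_pct):
    """Return a boolean mask flagging outliers; NaN entries are never flagged."""
    if mode == 'upper':
        # flag everything strictly above the threshold_pct percentile
        return series > np.nanpercentile(series, threshold_pct)
    if mode == 'lower':
        # flag everything strictly below the threshold_pct percentile
        return series < np.nanpercentile(series, threshold_pct)
    if mode == 'two-sided':
        # keep the central threshold_pct mass, flag both tails
        tail = (100.0 - threshold_pct) / 2.0
        lo, hi = np.nanpercentile(series, [tail, 100.0 - tail])
        return (series < lo) | (series > hi)
    raise ValueError('unknown mode: %s' % mode)


# Reproduces the expectation of test_quantile_filtering_two_sided above:
mask = quantile_filter(pd.Series(range(10)), 'two-sided', 80.0)
assert mask.tolist() == [True] + [False] * 8 + [True]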
def fixed_horizon(eid):
    dat = load_experiment(eid)
    snapshot = dat[dat.time_since_start < 100]
    # kpi = snapshot.groupby(['entity', 'variant']).converted.sum().reset_index()
    kpi = snapshot.groupby(['entity', 'variant']).converted.mean().reset_index()
    exp = Experiment(params[eid]['baseline'], kpi, metadata)
    res = exp.delta(kpi_subset=['converted'])
    return res
def get_data(folder_path):
    """ Expects as input a folder containing the following files:
    - one .csv or .csv.gz with 'data' in the filename
    - one .json with 'metadata' in the filename

    Opens the files and uses them to create an Experiment object,
    which it then returns.

    :param folder_path: path to the Experiment data
    :type folder_path: str
    :return: Experiment object with data
    :rtype: Experiment
    """
    files = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]
    try:
        assert 'data' in '-'.join(files)
        assert 'metadata' in '-'.join(files)
        data = metadata = None
        for f in files:
            if 'metadata' in f:
                with open(join(folder_path, f), 'r') as input_json:
                    metadata = json.load(input_json)
            elif 'data' in f:
                data = pd.read_csv(join(folder_path, f))
        return Experiment(data, metadata)
    except AssertionError as e:
        logger.error("An error occurred when fetching data from csv file.")
        raise e
def early_stopping(eid, method, day_index):
    dat = load_experiment(eid)
    max_sample_size = float(len(np.unique(dat.entity)))
    print(max_sample_size)
    metadata['estimatedSampleSize'] = max_sample_size

    # daily peeking: only look at data observed up to day_index
    # for day in np.arange(1, np.ceil(max(dat.time_since_start)) + 1):
    snapshot = dat[dat.time_since_start < day_index]

    # kpi = snapshot.groupby(['entity', 'variant']).converted.sum().reset_index()
    kpi = snapshot.groupby(['entity', 'variant']).converted.mean().reset_index()
    current_sample_size = kpi.shape[0]

    exp = Experiment(params[eid]['baseline'], kpi, metadata)
    if 'bayes' in method:
        res = exp.delta(method=method, kpi_subset=['converted'], distribution='normal')
    elif method == 'group_sequential':
        res = exp.delta(method='group_sequential', kpi_subset=['converted'],
                        information_fraction=current_sample_size / max_sample_size)
    else:
        raise NotImplementedError

    return (day_index,
            res.statistic('delta', 'stop', 'converted').loc[:, ('value', params[eid]['variant'])].values[0],
            res.statistic('delta', 'uplift', 'converted').loc[:, ('value', params[eid]['variant'])].values[0])
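# Hypothetical driver for the early_stopping snippet above: peek once per day
# and report when the group-sequential test first signals a stop. `eid` and
# the globals `params`/`metadata` are assumed to be set up as in the snippet.
for day in range(1, 31):
    day_index, stop, uplift = early_stopping(eid, 'group_sequential', day)
    if stop:
        print('stopped on day %d with uplift %f' % (day_index, uplift))
        break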
def setUp(self):
    """ Load the needed datasets for all TestCases and set the random
    seed so that randomized algorithms show deterministic behaviour.
    """
    np.random.seed(0)
    self.data = Experiment('B', *generate_random_data())

    # Create time column. TODO: Do this nicer
    self.data.kpis['time_since_treatment'] = self.data.features['treatment_start_time']
    # Make time part of the index
    self.data.kpis.set_index('time_since_treatment', append=True, inplace=True)
def setUp(self):
    """ Load the needed datasets for all StatisticsTestCases and set the
    random seed so that randomized algorithms show deterministic behaviour.
    """
    np.random.seed(0)
    self.data = Experiment('B', *generate_random_data(), dbg=Dbg(dbg_lvl=5))

    # Create time column. TODO: Do this nicer
    self.data.kpis['time_since_treatment'] = self.data.features['treatment_start_time']
    # Make time part of the index
    self.data.kpis.set_index('time_since_treatment', append=True, inplace=True)

    # Metadata as generated by generate_random_data(), for later checks
    self.testmetadata = {'primary_KPI': 'normal_shifted',
                         'source': 'simulated',
                         'experiment': 'random_data_generation'}
def get_data(controlVariantName, folder_path):
    """ Expects as input a folder containing the following files:
    - one .csv or .csv.gz with 'metrics' in the filename
    - one .txt with 'metadata' in the filename

    Opens the files and uses them to create an Experiment object,
    which it then returns.

    Args:
        controlVariantName: name of the control variant
        folder_path: path to the folder containing the experiment files

    Returns:
        Experiment: Experiment object with loaded csv data
    """
    files = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]
    try:
        assert 'metrics' in '-'.join(files)
        assert 'metadata' in '-'.join(files)
        metrics = metadata = None
        for f in files:
            if 'metrics' in f:
                metrics = pd.read_csv(join(folder_path, f))
            elif 'metadata' in f:
                with open(join(folder_path, f), 'r') as input_json:
                    metadata = json.load(input_json)
        return Experiment(controlVariantName, metrics, metadata)
    except AssertionError as e:
        logger.error("An error occurred when fetching data from csv file.")
        raise e
from expan.core.util import generate_random_data
from expan.core.experiment import Experiment
from expan.core.statistical_test import KPI, Variants, StatisticalTest

data, metadata = generate_random_data()
print(data.head())
print(metadata)

kpi = KPI('normal_same')
variants = Variants(variant_column_name='variant', control_name='B', treatment_name='A')
test = StatisticalTest(data=data, kpi=kpi, features=[], variants=variants)

exp = Experiment(metadata=metadata)
result = exp.analyze_statistical_test(test)
print(result)
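# A variant of the example above using a derived (ratio) KPI. The DerivedKPI
# constructor arguments mirror the other snippets in this file; importing it
# from expan.core.statistical_test is an assumption.
from expan.core.statistical_test import DerivedKPI

derived_kpi = DerivedKPI('derived_kpi_one', 'normal_same', 'normal_shifted')
# materialize the derived column on the data frame, as done in
# test_get_weights_hardcoded_data elsewhere in this file
derived_kpi.make_derived_kpi(data)
derived_test = StatisticalTest(data=data, kpi=derived_kpi, features=[], variants=variants)
print(exp.analyze_statistical_test(derived_test))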
def getExperiment(self):
    return Experiment(self.metadata)
    res = self.data.delta()
    df = res.relative_uplift('delta', 'normal_same')
    np.testing.assert_almost_equal(df, np.array([[-4.219601, 0]]), decimal=5)

def test_prob_uplift_over_zero_single_metric(self):
    """ Check if the conversion from confidence intervals to
    probability is correct for one metric.
    """
    res = self.data.delta(kpi_subset=['normal_same'])
    # df = prob_uplift_over_zero_single_metric(res.df, self.data.baseline_variant)
    np.testing.assert_almost_equal(
        res.df.loc[pd.IndexSlice[:, :, :, 'prob_uplift_over_0'], 'value'],
        np.array([[0.946519, np.nan]]), decimal=5)

def test_prob_uplift_over_zero_multiple_metric(self):
    """ Check if the conversion from confidence intervals to
    probability is correct for multiple metrics.
    """
    res = self.data.delta(kpi_subset=['normal_same', 'normal_shifted'])
    # res.calculate_prob_uplift_over_zero()
    np.testing.assert_almost_equal(
        res.df.loc[pd.IndexSlice[:, :, :, 'prob_uplift_over_0'], 'value'],
        np.array([[0.946519, np.nan], [0, np.nan]]), decimal=5)


if __name__ == '__main__':
    # unittest.main()
    np.random.seed(0)
    exp = Experiment('B', *generate_random_data())
    res = exp.delta(['normal_shifted'])
def getExperiment(self, report_kpi_names=None, derived_kpis=[]):
    return Experiment('B', self.data, self.metadata, report_kpi_names, derived_kpis)
import unittest

import numpy as np
import pytest

import expan.core.early_stopping as es
import expan.core.statistics as statx
from expan.core.experiment import Experiment

worker_table = Experiment(None).worker_table


def tuple_approx(*elements):
    assert isinstance(elements, tuple)

    def helper_for__tuple_approx(x):
        if not hasattr(x, '__len__'):
            return pytest.approx(x)
        else:
            assert isinstance(x, tuple)
            return tuple(helper_for__tuple_approx(element) for element in x)

    return helper_for__tuple_approx(elements)


def deltastats_to_friendly_tuple(ds):
    from collections import namedtuple
    Flat_delta_stats = namedtuple(
        'Flat_delta_stats',
        'delta p power stop control_stats treatment_stats c_i')
    tup = Flat_delta_stats(
        ds.delta,
        ds.p,
        ds.statistical_power,
        ds.stop if hasattr(ds, 'stop') else None,
        (ds.control_statistics.mean,
         ds.control_statistics.sample_size,
         ds.control_statistics.variance),
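# Hypothetical usage of tuple_approx above: approximate comparison of
# (possibly nested) numeric tuples via pytest.approx.
assert (1.0000001, (2.0, 3.0)) == tuple_approx(1.0, (2.0, 3.0))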
def getExperiment(self):
    return Experiment(self.metadata, error=0.)
    })
    df.set_index(Results.mandatory_index_levels, inplace=True)
    # df = df.unstack('variant')
    # df.columns = df.columns.swaplevel(0, 1)
    return df


if __name__ == '__main__':
    np.random.seed(0)
    from tests.tests_core.test_data import generate_random_data
    from expan.core.experiment import Experiment

    data = Experiment('B', *generate_random_data())
    res = data.delta(kpi_subset=['normal_same', 'normal_shifted'])
    # df = res.calculate_prob_uplift_over_zero()

    # from test_core.test_results import load_example_results
    # aa = load_example_results()
    # order_means = aa.means('orders').iloc[0]
    # net_sales_var = aa.statistic('var', 'net_sales')

    # import numpy as np
    # res = Results(None)
    # res.append_delta('dummy', 'A', *(0.1, {'2.5': 0.01, '97.5': 0.2}, 1000, 1000))
    # res.append_delta('dummy', 'B', *(0, {'2.5': np.nan, '97.5': np.nan}, 1000, 1000))
    # from expan.core.experiment import Experiment
# query = query_file.read()
# bq_df = pd.read_gbq(query=query, project_id="team-octopus", verbose=False)

df_before = pd.read_csv('size_drop_down.csv')
df_before = df_before.loc[df_before['variant'].isin(
    ['ControlSingleWithToggle', 'SingleNoToggle'])]

print("BEFORE")
print("data size:", len(df_before))
print("number of variants:", len(df_before['variant'].unique()))
print("number of client ids:", len(df_before['entity'].unique()))

exp = Experiment(control_variant_name='ControlSingleWithToggle',
                 data=df_before,
                 metadata={},
                 report_kpi_names=['conversion_rate'],
                 derived_kpis=[{'name': 'conversion_rate',
                                'formula': 'orders/sessions'}])
print(exp.delta())

# user_id = df_before['entity'].value_counts().idxmax()
# freq = df_before['entity'].value_counts().max()
# print("user " + user_id + " has appeared: " + str(freq) + " times.")
# print(df_before[df_before['entity'] == '403044ca-c5bc-40ad-96cc-a41af47063ee'])

df_after = pd.read_csv('size_drop_down_after_removal.csv')
df_after = df_after.loc[df_after['variant'].isin(
    ['ControlSingleWithToggle', 'SingleNoToggle'])]

print("AFTER")
print("data size:", len(df_after))
def test__trend__computation(self):
    """ Check if trend() functions properly """
    np.random.seed(0)
    metrics, metadata = generate_random_data()
    metrics['time_since_treatment'] = metrics['treatment_start_time']
    exp = Experiment('B', metrics, metadata, [4, 6])

    # Perform trend() with non-cumulative results
    result = exp.trend(cumulative=False)

    # check uplift
    df = result.statistic('trend', 'uplift', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, ('value', 'A')],
        np.array([-1.009421, -0.847400, -1.119885, -1.042597, -0.868819,
                  -1.091165, -0.952307, -1.028234, -0.978774, -0.985696]),
        decimal=5)

    # check pctile
    df = result.statistic('trend', 'uplift_pctile', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, ('value', 'A')],
        np.array([-1.137482, -0.881360, -0.970678, -0.724122, -1.245795,
                  -0.993975, -1.178494, -0.906699, -0.993683, -0.743954,
                  -1.225361, -0.956969, -1.082180, -0.822435, -1.151715,
                  -0.904753, -1.095209, -0.862340, -1.109407, -0.861985]),
        decimal=5)

    # check sample size
    df = result.statistic('trend', 'sample_size', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, 'value'],
        np.column_stack(([649, 595, 600, 590, 625, 602, 607, 608, 616, 616],
                         [405, 401, 378, 362, 377, 369, 406, 392, 414, 388])),
        decimal=5)

    # check variant_mean
    df = result.statistic('trend', 'variant_mean', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, 'value'],
        np.column_stack(([0.005761, 0.057487, -0.067107, 0.001125, 0.093085,
                          -0.067894, -0.030500, -0.060996, 0.016257, -0.006091],
                         [1.015182, 0.904887, 1.052778, 1.043721, 0.961904,
                          1.023271, 0.921807, 0.967238, 0.995031, 0.979605])),
        decimal=5)

    # Perform trend() with cumulative results (the default)
    result = exp.trend()

    # check uplift
    df = result.statistic('trend', 'uplift', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, ('value', 'A')],
        np.array([-1.009421, -0.929807, -0.991088, -1.003129, -0.976023,
                  -0.994857, -0.988167, -0.993119, -0.991571, -0.990986]),
        decimal=5)

    # check pctile
    df = result.statistic('trend', 'uplift_pctile', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, ('value', 'A')],
        np.array([-1.137482, -0.881360, -1.018794, -0.840820, -1.063820,
                  -0.918356, -1.067283, -0.938976, -1.033110, -0.918936,
                  -1.047413, -0.942302, -1.036888, -0.939446, -1.038455,
                  -0.947784, -1.033861, -0.949280, -1.031002, -0.950970]),
        decimal=5)

    # check sample size
    df = result.statistic('trend', 'sample_size', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, 'value'],
        np.column_stack(([649, 1244, 1844, 2434, 3059, 3661, 4268, 4876, 5492, 6108],
                         [405, 806, 1184, 1546, 1923, 2292, 2698, 3090, 3504, 3892])),
        decimal=5)

    # check variant_mean
    df = result.statistic('trend', 'variant_mean', 'normal_shifted')
    np.testing.assert_almost_equal(
        df.loc[:, 'value'],
        np.column_stack(([0.005761, 0.030501, -0.001258, -0.000681, 0.018477,
                          0.004274, -0.000671, -0.008193, -0.005451, -0.005515],
                         [1.015182, 0.960308, 0.989830, 1.002449, 0.994500,
                          0.999132, 0.987496, 0.984926, 0.986120, 0.985470])),
        decimal=5)

    # check that the metadata is preserved
    np.testing.assert_equal(
        True,
        all(item in result.metadata.items() for item in self.testmetadata.items()))
import numpy as np
import pandas as pd

from expan.core.experiment import Experiment

###################################################
df_before = pd.read_csv('nextgen_header.csv')

print("BEFORE")
print("data size:", len(df_before))
print("number of variants:", len(df_before['variant'].unique()))
print("number of client ids:", len(df_before['entity'].unique()))

exp = Experiment(control_variant_name='control',
                 data=df_before,
                 metadata={},
                 report_kpi_names=['revenue_per_user'],
                 derived_kpis=[{'name': 'revenue_per_user',
                                'formula': 'revenue/users'}])
print(exp.delta())

###################################################
df_after = pd.read_csv('nextgen_header_after_removal.csv')

print("AFTER")
print("data size:", len(df_after))
print("number of variants:", len(df_after['variant'].unique()))
print("number of client ids:", len(df_after['entity'].unique()))

exp = Experiment(control_variant_name='control',
                 data=df_after,