Example #1
	def test__trend__index_levels(self):
		"""
	    Check if trend() returns the proper index levels
	    """
		np.random.seed(0)
		metrics, metadata = generate_random_data()
		metrics['time_since_treatment'] = metrics['treatment_start_time']
		exp = Experiment('B', metrics, metadata, [4, 6])
		# Perform trend()
		result = exp.trend()
		# Check if all index levels are present
		index_levels = [
			pd.Index([u'normal_same', u'normal_shifted', u'normal_shifted_by_feature', u'normal_unequal_variance'],
					 dtype='object', name=u'metric'),
			pd.Index([u'-'], dtype='object', name=u'subgroup_metric'),
			pd.Index([str(x) for x in np.arange(10.)], dtype='object', name=u'time'),
			pd.Float64Index([], dtype='float64', name=u'subgroup'),
			pd.Index([u'sample_size', u'uplift', u'uplift_pctile', u'variant_mean'], dtype='object', name=u'statistic'),
			pd.Float64Index([2.5, 97.5], dtype='float64', name=u'pctile')
		]
		result_levels = list(result.df.index.levels)
		# Check if all index levels match expectation
		for expected, actual in zip(index_levels, result_levels):
			np.testing.assert_array_equal(expected, actual)
Example #2
    def test_get_weights_hardcoded_data(self):
        ndecimals = 5
        exp = Experiment(self.metadata)
        self.derived_kpi.make_derived_kpi(self.data_dummy_df)
        res = exp._get_weights(self.data_dummy_df, self.test_derived_kpi, 'B')
        self.assertAlmostEqual(res.iloc[0], 1.33333, ndecimals)
        self.assertAlmostEqual(res.iloc[1], 0.66667, ndecimals)
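
The hardcoded expectations are consistent with per-entity weights proportional to the derived KPI's denominator, normalized to average 1 — an inference from the numbers in this test, not a statement of ExpAn internals. A minimal sketch, assuming hypothetical denominator values of 2 and 1:

import numpy as np

# Inference only: weights proportional to the denominator, scaled to mean 1.
denominators = np.array([2.0, 1.0])  # hypothetical per-entity denominator values
weights = len(denominators) * denominators / denominators.sum()
print(weights)  # [1.33333333 0.66666667]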
Example #3
	def test__trend__index_levels(self):
		"""
    Check if trend() returns the proper index levels
    """
		np.random.seed(0)
		metrics, metadata = generate_random_data()
		metrics['time_since_treatment'] = metrics['treatment_start_time']
		exp = Experiment('B', metrics, metadata, [4, 6])
		# Perform trend()
		result = exp.trend()
		# Check if all index levels are present
		index_levels = [
			pd.Index([u'normal_same', u'normal_shifted', u'normal_shifted_by_feature', u'normal_unequal_variance'],
					 dtype='object', name=u'metric'),
			pd.Index([u'-'], dtype='object', name=u'subgroup_metric'),
			pd.Index(range(10), dtype='object', name=u'time'),
			pd.Float64Index([], dtype='float64', name=u'subgroup'),
			pd.Index([u'sample_size', u'uplift', u'uplift_pctile', u'variant_mean'], dtype='object', name=u'statistic'),
			pd.Float64Index([2.5, 97.5], dtype='float64', name=u'pctile')
		]
		result_levels = list(result.df.index.levels)
		# Check if all index levels match expectation
		for expected, actual in zip(index_levels, result_levels):
			np.testing.assert_array_equal(expected, actual)
Example #4
	def test__trend__computation(self):
		"""
    Check if trend() functions properly
    """
		np.random.seed(0)
		metrics, metadata = generate_random_data()
		metrics['time_since_treatment'] = metrics['treatment_start_time']
		exp = Experiment('B', metrics, metadata, [4, 6])
		# Perform trend()
		result = exp.trend()

		# check uplift
		df = result.statistic('trend', 'uplift', 'normal_shifted')
		np.testing.assert_almost_equal(df.loc[:, ('value', 'A')],
									   np.array([-1.009421, -0.847400, -1.119885, -1.042597, -0.868819,
												 -1.091165, -0.952307, -1.028234, -0.978774, -0.985696]), decimal=5)
		# check pctile
		df = result.statistic('trend', 'uplift_pctile', 'normal_shifted')
		np.testing.assert_almost_equal(df.loc[:, ('value', 'A')],
									   np.array([-1.137482, -0.881360, -0.970678, -0.724122, -1.245795,
												 -0.993975, -1.178494, -0.906699, -0.993683, -0.743954, -1.225361,
												 -0.956969, -1.082180, -0.822435, -1.151715, -0.904753, -1.095209,
												 -0.862340, -1.109407, -0.861985]), decimal=5)
		# check sample_size
		df = result.statistic('trend', 'sample_size', 'normal_shifted')
		np.testing.assert_almost_equal(df.loc[:, 'value'],
									   np.column_stack(([649, 595, 600, 590, 625, 602, 607, 608, 616, 616],
														[405, 401, 378, 362, 377, 369, 406, 392, 414, 388])), decimal=5)
		# check variant_mean
		df = result.statistic('trend', 'variant_mean', 'normal_shifted')
		np.testing.assert_almost_equal(df.loc[:, 'value'],
									   np.column_stack(([0.005761, 0.057487, -0.067107, 0.001125, 0.093085,
														 -0.067894, -0.030500, -0.060996, 0.016257, -0.006091],
														[1.015182, 0.904887, 1.052778, 1.043721, 0.961904, 1.023271,
														 0.921807, 0.967238, 0.995031, 0.979605])), decimal=5)
Example #5
def get_two_multiple_test_suite_results():
    """ Returns two multiple test suite results (for testing purposes of merge_with class method)
    
    :return two multiple test suite results
    :rtype  MultipleTestSuiteResult, MultipleTestSuiteResult
    """
    data, metadata = generate_random_data()
    exp = Experiment(metadata)

    kpi = KPI('normal_same')
    variants = Variants('variant', 'B', 'A')
    test_normal_same = StatisticalTest(data, kpi, [], variants)
    derived_kpi = DerivedKPI('derived_kpi_one', 'normal_same',
                             'normal_shifted')
    test_derived_kpi = StatisticalTest(data, derived_kpi, [], variants)

    suite_with_normal_same = StatisticalTestSuite([test_normal_same],
                                                  CorrectionMethod.BONFERRONI)
    suite_with_derived_kpi = StatisticalTestSuite([test_derived_kpi],
                                                  CorrectionMethod.BH)

    mtsr_1 = exp.analyze_statistical_test_suite(suite_with_normal_same,
                                                test_method='fixed_horizon')
    mtsr_2 = exp.analyze_statistical_test_suite(suite_with_derived_kpi,
                                                test_method='fixed_horizon')

    return mtsr_1, mtsr_2
Example #6
def run_analysis(features_file, kpis_file, metadata_file):
    """
		Load kpis and features from file and pass them to expan to perform delta and subgroup analyses

		Args:
			features_file: features file path
			kpis_file: kpis file path
			metadata_file: metadata file path

		Returns:
			delta analysis results and subgroup analysis results as a tuple
	"""
    kpis = pd.read_csv(kpis_file)
    if features_file:
        features = pd.read_csv(features_file)
    else:
        features = 'default'
    print(features)
    metadata = parse_metadata(metadata_file)

    exp_data = ExperimentData(metrics=kpis,
                              metadata=metadata,
                              features=features)
    exp = Experiment(baseline_variant=metadata['baseline_variant'],
                     metrics_or_kpis=kpis,
                     metadata=metadata,
                     features=features)

    return (exp.delta(), exp.sga())
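
A hypothetical invocation — the file names below are placeholders, not part of the original snippet:

# File paths are illustrative placeholders.
delta_results, sga_results = run_analysis('features.csv', 'kpis.csv', 'metadata.json')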
Example #7
def test_quantile_filtering_two_sided():
    exp = Experiment({})
    df = pd.DataFrame.from_dict({'earnings': list(range(10))})

    flags = exp._quantile_filtering(df, ['earnings'],
                                    {'earnings': ('two-sided', 80.0)})
    results = flags.tolist()
    assert results == [True] + [False] * 8 + [True]
Example #8
def test_quantile_filtering_lower_old():
    exp = Experiment({})
    data = np.array([0, 0, 1, 2]) / np.array([0, 0, 1, 1])
    df = pd.DataFrame.from_dict({'earnings': data})

    flags = exp._quantile_filtering(df, ['earnings'],
                                    {'earnings': ('lower', 10.)})
    assert flags.tolist() == [False, False, True, False]
Example #9
def test_quantile_filtering_upper():
    exp = Experiment({})
    data = np.array([0.0] * 2 + list(range(10))) / np.array([0.0] * 2 +
                                                            [1.0] * 10)
    df = pd.DataFrame.from_dict({'earnings': data})

    flags = exp._quantile_filtering(df, ['earnings'],
                                    {'earnings': ('upper', 90.0)})
    assert flags.tolist() == [False] * 11 + [True]
Example #10
def test_quantile_filtering():
    exp = Experiment({})
    df = pd.DataFrame.from_dict({'earnings': np.array([0, 0, 1, 2]) / np.array([0, 0, 1, 1])})

    flags = exp._quantile_filtering(df, ['earnings'], 90, 'upper')
    assert flags.tolist() == [False, False, False, True]

    flags = exp._quantile_filtering(df, ['earnings'], 10, 'lower')
    assert flags.tolist() == [False, False, True, False]
Example #11
def test_quantile_filtering_two_sided_asym():
    exp = Experiment({})
    data = list(range(-8, 0)) + list(range(16))
    df = pd.DataFrame.from_dict({'earnings': data})

    flags = exp._quantile_filtering(df, ['earnings'],
                                    {'earnings': ('two-sided-asym', 50.0)})
    results = flags.tolist()
    assert results == [True] * 2 + [False] * 18 + [True] * 4
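
Taken together, Examples #7–#11 pin down the _quantile_filtering contract: it returns a boolean Series that is True for rows to be filtered out, with per-column modes 'upper', 'lower', 'two-sided', and 'two-sided-asym' (Example #10 shows an older positional signature of the same method). A minimal sketch of the two-sided rule as these tests imply it — an illustration built on np.percentile, not ExpAn's actual implementation:

import numpy as np
import pandas as pd

def two_sided_flags(values, percentile):
    # Flag values outside the central `percentile` percent (illustrative only).
    tail = (100.0 - percentile) / 2.0
    lower, upper = np.percentile(values, [tail, 100.0 - tail])
    return pd.Series((values < lower) | (values > upper))

# Mirrors test_quantile_filtering_two_sided: the central 80% of 0..9 is kept.
flags = two_sided_flags(np.arange(10), 80.0)
assert flags.tolist() == [True] + [False] * 8 + [True]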
Example #12
def fixed_horizon(eid):
    dat = load_experiment(eid)
    snapshot = dat[dat.time_since_start < 100]
    #kpi = snapshot.groupby(['entity','variant']).converted.sum().reset_index()
    kpi = snapshot.groupby(['entity',
                            'variant']).converted.mean().reset_index()
    exp = Experiment(params[eid]['baseline'], kpi, metadata)
    res = exp.delta(kpi_subset=['converted'])

    return res
Example #13
    def test__trend__computation(self):
        """
    Check if trend() functions properly
    """
        np.random.seed(0)
        metrics, metadata = generate_random_data()
        metrics['time_since_treatment'] = metrics['treatment_start_time']
        exp = Experiment('B', metrics, metadata, [4, 6])
        # Perform trend()
        result = exp.trend()

        # check uplift
        df = result.statistic('trend', 'uplift', 'normal_shifted')
        np.testing.assert_almost_equal(df.loc[:, ('value', 'A')],
                                       np.array([
                                           -1.009421, -0.847400, -1.119885,
                                           -1.042597, -0.868819, -1.091165,
                                           -0.952307, -1.028234, -0.978774,
                                           -0.985696
                                       ]),
                                       decimal=5)
        # check pctile
        df = result.statistic('trend', 'uplift_pctile', 'normal_shifted')
        np.testing.assert_almost_equal(
            df.loc[:, ('value', 'A')],
            np.array([
                -1.137482, -0.881360, -0.970678, -0.724122, -1.245795,
                -0.993975, -1.178494, -0.906699, -0.993683, -0.743954,
                -1.225361, -0.956969, -1.082180, -0.822435, -1.151715,
                -0.904753, -1.095209, -0.862340, -1.109407, -0.861985
            ]),
            decimal=5)
        # check sample_size
        df = result.statistic('trend', 'sample_size', 'normal_shifted')
        np.testing.assert_almost_equal(
            df.loc[:, 'value'],
            np.column_stack(
                ([649, 595, 600, 590, 625, 602, 607, 608, 616,
                  616], [405, 401, 378, 362, 377, 369, 406, 392, 414, 388])),
            decimal=5)
        # check variant_mean
        df = result.statistic('trend', 'variant_mean', 'normal_shifted')
        np.testing.assert_almost_equal(df.loc[:, 'value'],
                                       np.column_stack(([
                                           0.005761, 0.057487, -0.067107,
                                           0.001125, 0.093085, -0.067894,
                                           -0.030500, -0.060996, 0.016257,
                                           -0.006091
                                       ], [
                                           1.015182, 0.904887, 1.052778,
                                           1.043721, 0.961904, 1.023271,
                                           0.921807, 0.967238, 0.995031,
                                           0.979605
                                       ])),
                                       decimal=5)
Example #14
def get_data(folder_path):
    """ Expects as input a folder containing the following files:

    - one .csv or .csv.gz with 'data' in the filename
    - one .json containing 'metadata' in the filename

    Opens the files and uses them to create an Experiment object which it then returns.

    :param folder_path: path to the Experiment data
    :type  folder_path: str
    :return: Experiment object with data
    :rtype:  Experiment
    """
    files = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]

    try:
        assert ('data' in '-'.join(files))
        assert ('metadata' in '-'.join(files))
        data = metadata = None
        for f in files:
            if 'metadata' in f:
                with open(join(folder_path, f), 'r') as input_json:
                    metadata = json.load(input_json)
            elif 'data' in f:
                data = pd.read_csv(join(folder_path, f))
        return Experiment(data, metadata)

    except AssertionError as e:
        logger.error("An error occurred when fetching data from csv file.")
        raise e
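
A sketch of the folder layout get_data() expects, with hypothetical names:

# experiment_folder/        (hypothetical path)
#     data.csv.gz           ('data' in the filename)
#     metadata.json         ('metadata' in the filename)
exp = get_data('experiment_folder')

Note the elif ordering inside the loop: 'metadata' is matched first because a metadata filename also contains the substring 'data'.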
Example #15
def run_analysis(features_file, kpis_file, metadata_file):
    kpis = pd.read_csv(kpis_file)
    if features_file:
        features = pd.read_csv(features_file)
    else:
        features = 'default'
    print(features)
    metadata = parse_metadata(metadata_file)

    exp_data = ExperimentData(metrics=kpis,
                              metadata=metadata,
                              features=features)
    exp = Experiment(baseline_variant=metadata['baseline_variant'],
                     metrics_or_kpis=kpis,
                     metadata=metadata,
                     features=features)

    return (exp.delta(), exp.sga())
Example #16
def early_stopping(eid, method, day_index):
    dat = load_experiment(eid)
    max_sample_size = float(len(np.unique(dat.entity)))
    print(max_sample_size)
    metadata['estimatedSampleSize'] = max_sample_size
    # daily peeking
    #for day in np.arange(1,np.ceil(max(dat.time_since_start))+1):
    snapshot = dat[dat.time_since_start < day_index]

    # sum
    #kpi = snapshot.groupby(['entity','variant']).converted.sum().reset_index()
    # mean
    kpi = snapshot.groupby(['entity',
                            'variant']).converted.mean().reset_index()

    current_sample_size = kpi.shape[0]
    exp = Experiment(params[eid]['baseline'], kpi, metadata)
    #res = exp.delta(method='group_sequential', kpi_subset=['converted'],
    #	information_fraction=current_sample_size/max_sample_size)
    if 'bayes' in method:
        res = exp.delta(method=method,
                        kpi_subset=['converted'],
                        distribution='normal')
    elif method == 'group_sequential':
        res = exp.delta(method='group_sequential',
                        kpi_subset=['converted'],
                        information_fraction=current_sample_size /
                        max_sample_size)
    else:
        raise NotImplementedError

    return (day_index,
            res.statistic('delta', 'stop', 'converted')
               .loc[:, ('value', params[eid]['variant'])].values[0],
            res.statistic('delta', 'uplift', 'converted')
               .loc[:, ('value', params[eid]['variant'])].values[0])
Example #17
    def setUp(self):
        """
	    Load the needed datasets for all TestCases and set the random
	    seed so that randomized algorithms show deterministic behaviour.
	    """
        np.random.seed(0)
        self.data = Experiment('B', *generate_random_data())
        # Create time column. TODO: Do this nicer
        self.data.kpis['time_since_treatment'] = \
            self.data.features['treatment_start_time']
        # Make time part of index
        self.data.kpis.set_index('time_since_treatment',
                                 append=True,
                                 inplace=True)
Example #18
File: util.py Project: zalando/expan
def get_two_multiple_test_suite_results():
    """ Returns two multiple test suite results (for testing purposes of merge_with class method)
    
    :return two multiple test suite results
    :rtype  MultipleTestSuiteResult, MultipleTestSuiteResult
    """
    data, metadata = generate_random_data()
    exp = Experiment(metadata)

    kpi = KPI('normal_same')
    variants = Variants('variant', 'B', 'A')
    test_normal_same = StatisticalTest(data, kpi, [], variants)
    derived_kpi = DerivedKPI('derived_kpi_one', 'normal_same', 'normal_shifted')
    test_derived_kpi = StatisticalTest(data, derived_kpi, [], variants)

    suite_with_normal_same = StatisticalTestSuite([test_normal_same],
                                                  CorrectionMethod.BONFERRONI)
    suite_with_derived_kpi = StatisticalTestSuite([test_derived_kpi], CorrectionMethod.BH)

    mtsr_1 = exp.analyze_statistical_test_suite(suite_with_normal_same, test_method='fixed_horizon')
    mtsr_2 = exp.analyze_statistical_test_suite(suite_with_derived_kpi, test_method='fixed_horizon')

    return mtsr_1, mtsr_2
Example #19
	def setUp(self):
		"""
	    Load the needed datasets for all StatisticsTestCases and set the random
	    seed so that randomized algorithms show deterministic behaviour.
	    """
		np.random.seed(0)
		self.data = Experiment('B', *generate_random_data(), dbg=Dbg(dbg_lvl=5))
		# Create time column. TODO: Do this nicer
		self.data.kpis['time_since_treatment'] = \
			self.data.features['treatment_start_time']
		# Make time part of index
		self.data.kpis.set_index('time_since_treatment', append=True, inplace=True)
		# Metadata as generated by generate_random_data() for later checks
		self.testmetadata = {'primary_KPI': 'normal_shifted',
		                     'source': 'simulated',
		                     'experiment': 'random_data_generation'}
Example #20
File: csv_fetcher.py Project: vdt/expan
def get_data(controlVariantName, folder_path):
    """
    Expects as input a folder containing the following files:
     - one .csv or .csv.gz with 'metrics' in the filename
     - one .txt containing 'metadata' in the filename

    Opens the files and uses them to create an Experiment object which it then returns.

    Args:
        folder_path:

    Returns:
        Experiment: Experiment object with loaded csv data

    """
    files = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]

    try:
        assert ('metrics' in '-'.join(files))
        assert ('metadata' in '-'.join(files))

        metrics = metadata = None

        for f in files:

            if 'metrics' in f:
                metrics = pd.read_csv(join(folder_path, f))

            elif 'metadata' in f:
                with open(join(folder_path, f), 'r') as input_json:
                    metadata = json.load(input_json)

        return Experiment(controlVariantName, metrics, metadata)

    except AssertionError as e:
        logger.error("An error occured when fetching data from csv file.")
        raise e
Example #21
from expan.core.util import generate_random_data
from expan.core.experiment import Experiment
from expan.core.statistical_test import KPI, Variants, StatisticalTest

data, metadata = generate_random_data()

print(data.head())
print(metadata)

kpi = KPI('normal_same')
variants = Variants(variant_column_name='variant',
                    control_name='B',
                    treatment_name='A')
test = StatisticalTest(data=data, kpi=kpi, features=[], variants=variants)
exp = Experiment(metadata=metadata)

result = exp.analyze_statistical_test(test)

print(result)
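
The same flow covers ratio metrics: as Example #5 shows, a DerivedKPI built from a numerator and a denominator KPI can be wrapped in a StatisticalTest the same way. A minimal sketch continuing the script above, reusing data, variants, and exp (the constructor arguments follow Example #5; the import path assumes DerivedKPI lives alongside KPI):

from expan.core.statistical_test import DerivedKPI

# DerivedKPI(name, numerator_kpi, denominator_kpi), as in Example #5.
derived_kpi = DerivedKPI('derived_kpi_one', 'normal_same', 'normal_shifted')
derived_test = StatisticalTest(data=data, kpi=derived_kpi, features=[],
                               variants=variants)

print(exp.analyze_statistical_test(derived_test))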
Example #22
    def getExperiment(self):
        return Experiment(self.metadata)
Example #23
        res = self.data.delta()
        df = res.relative_uplift('delta', 'normal_same')
        np.testing.assert_almost_equal(df,
                                       np.array([[-4.219601, 0]]),
                                       decimal=5)

    def test_prob_uplift_over_zero_single_metric(self):
        """Check if the conversion from confidence intervals to probability is correct for one metric."""
        res = self.data.delta(kpi_subset=['normal_same'])
        #df = prob_uplift_over_zero_single_metric(res.df, self.data.baseline_variant)
        np.testing.assert_almost_equal(
            res.df.loc[pd.IndexSlice[:, :, :, 'prob_uplift_over_0'], 'value'],
            np.array([[0.946519, np.nan]]),
            decimal=5)

    def test_prob_uplift_over_zero_multiple_metric(self):
        """Check if the conversion from confidence intervals to probability is correct for multiple metrics."""
        res = self.data.delta(kpi_subset=['normal_same', 'normal_shifted'])
        #res.calculate_prob_uplift_over_zero()
        np.testing.assert_almost_equal(
            res.df.loc[pd.IndexSlice[:, :, :, 'prob_uplift_over_0'], 'value'],
            np.array([[0.946519, np.nan], [0, np.nan]]),
            decimal=5)


if __name__ == '__main__':
    #unittest.main()
    np.random.seed(0)
    exp = Experiment('B', *generate_random_data())
    res = exp.delta(['normal_shifted'])
Example #24
    def getExperiment(self, report_kpi_names=None, derived_kpis=[]):
        return Experiment('B', self.data, self.metadata, report_kpi_names,
                          derived_kpis)
Example #25
import expan.core.early_stopping as es
import expan.core.statistics as statx
from expan.core.experiment import Experiment

import pytest
import unittest

import numpy as np

worker_table = Experiment(None).worker_table

def tuple_approx(*elements):
    assert isinstance(elements, tuple)

    def helper_for__tuple_approx(x):
        if not hasattr(x, '__len__'):
            return pytest.approx(x)
        else:
            assert isinstance(x, tuple)
            return tuple(helper_for__tuple_approx(element) for element in x)

    return helper_for__tuple_approx(elements)

def deltastats_to_friendly_tuple(ds):
    from collections import namedtuple
    Flat_delta_stats = namedtuple('Flat_delta_stats',
                                  'delta p power stop control_stats treatment_stats c_i')
    tup = Flat_delta_stats(ds.delta,
                           ds.p,
                           ds.statistical_power,
                           ds.stop if hasattr(ds, 'stop') else None,
                           (ds.control_statistics.mean, ds.control_statistics.sample_size, ds.control_statistics.variance),
Example #26
    def getExperiment(self):
        return Experiment(self.metadata, error=0.)
Example #27
        })

    df.set_index(Results.mandatory_index_levels, inplace=True)
    # df = df.unstack('variant')
    # df.columns = df.columns.swaplevel(0,1)

    return df


if __name__ == '__main__':
    #pass

    np.random.seed(0)
    from tests.tests_core.test_data import generate_random_data
    from expan.core.experiment import Experiment
    data = Experiment('B', *generate_random_data())
    res = data.delta(kpi_subset=['normal_same', 'normal_shifted'])
    # df = res.calculate_prob_uplift_over_zero()

    # from test_core.test_results import load_example_results
    # aa = load_example_results()
    # order_means = aa.means('orders').iloc[0]
    # net_sales_var = aa.statistic('var', 'net_sales')

    # import numpy as np
    # res = Results(None)
    # res.append_delta('dummy', 'A', *(0.1,{'2.5':0.01,'97.5':0.2},1000,1000))
    # res.append_delta('dummy', 'B', *(0,{'2.5':np.nan,'97.5':np.nan},1000,1000))

    # from expan.core.experiment import Experiment
    #
Example #28
#     query = query_file.read()

# bq_df = pd.read_gbq(query=query, project_id="team-octopus", verbose=False)

df_before = pd.read_csv('size_drop_down.csv')
df_before = df_before.loc[df_before['variant'].isin(
    ['ControlSingleWithToggle', 'SingleNoToggle'])]
print("BEFORE")
print("data size:", len(df_before))
print("number of variants:", len(df_before['variant'].unique()))
print("number of client ids:", len(df_before['entity'].unique()))

exp = Experiment(control_variant_name='ControlSingleWithToggle',
                 data=df_before,
                 metadata={},
                 report_kpi_names=['conversion_rate'],
                 derived_kpis=[{
                     'name': 'conversion_rate',
                     'formula': 'orders/sessions'
                 }])
print(exp.delta())

# user_id = df_before['entity'].value_counts().idxmax()
# freq = df_before['entity'].value_counts().max()
# print("user " + user_id + " has appeared: " + str(freq) + " times.")
# print(df_before[df_before['entity']=='403044ca-c5bc-40ad-96cc-a41af47063ee'])

df_after = pd.read_csv('size_drop_down_after_removal.csv')
df_after = df_after.loc[df_after['variant'].isin(
    ['ControlSingleWithToggle', 'SingleNoToggle'])]
print("AFTER")
print("data size:", len(df_after))
Example #29
    def test__trend__computation(self):
        """
	    Check if trend() functions properly
	    """
        np.random.seed(0)
        metrics, metadata = generate_random_data()
        metrics['time_since_treatment'] = metrics['treatment_start_time']
        exp = Experiment('B', metrics, metadata, [4, 6])
        # Perform trend() with non-cumulative results
        result = exp.trend(cumulative=False)

        # check uplift
        df = result.statistic('trend', 'uplift', 'normal_shifted')
        np.testing.assert_almost_equal(df.loc[:, ('value', 'A')],
                                       np.array([
                                           -1.009421, -0.847400, -1.119885,
                                           -1.042597, -0.868819, -1.091165,
                                           -0.952307, -1.028234, -0.978774,
                                           -0.985696
                                       ]),
                                       decimal=5)
        # check pctile
        df = result.statistic('trend', 'uplift_pctile', 'normal_shifted')
        np.testing.assert_almost_equal(
            df.loc[:, ('value', 'A')],
            np.array([
                -1.137482, -0.881360, -0.970678, -0.724122, -1.245795,
                -0.993975, -1.178494, -0.906699, -0.993683, -0.743954,
                -1.225361, -0.956969, -1.082180, -0.822435, -1.151715,
                -0.904753, -1.095209, -0.862340, -1.109407, -0.861985
            ]),
            decimal=5)
        # check sample_size
        df = result.statistic('trend', 'sample_size', 'normal_shifted')
        np.testing.assert_almost_equal(
            df.loc[:, 'value'],
            np.column_stack(
                ([649, 595, 600, 590, 625, 602, 607, 608, 616,
                  616], [405, 401, 378, 362, 377, 369, 406, 392, 414, 388])),
            decimal=5)
        # check variant_mean
        df = result.statistic('trend', 'variant_mean', 'normal_shifted')
        np.testing.assert_almost_equal(df.loc[:, 'value'],
                                       np.column_stack(([
                                           0.005761, 0.057487, -0.067107,
                                           0.001125, 0.093085, -0.067894,
                                           -0.030500, -0.060996, 0.016257,
                                           -0.006091
                                       ], [
                                           1.015182, 0.904887, 1.052778,
                                           1.043721, 0.961904, 1.023271,
                                           0.921807, 0.967238, 0.995031,
                                           0.979605
                                       ])),
                                       decimal=5)

        # Perform trend() with cumulative results
        result = exp.trend()
        # check uplift
        df = result.statistic('trend', 'uplift', 'normal_shifted')

        np.testing.assert_almost_equal(df.loc[:, ('value', 'A')],
                                       np.array([
                                           -1.009421, -0.929807, -0.991088,
                                           -1.003129, -0.976023, -0.994857,
                                           -0.988167, -0.993119, -0.991571,
                                           -0.990986
                                       ]),
                                       decimal=5)
        # check pctile
        df = result.statistic('trend', 'uplift_pctile', 'normal_shifted')
        np.testing.assert_almost_equal(
            df.loc[:, ('value', 'A')],
            np.array([
                -1.137482, -0.881360, -1.018794, -0.840820, -1.063820,
                -0.918356, -1.067283, -0.938976, -1.033110, -0.918936,
                -1.047413, -0.942302, -1.036888, -0.939446, -1.038455,
                -0.947784, -1.033861, -0.949280, -1.031002, -0.950970
            ]),
            decimal=5)
        # check sample_size
        df = result.statistic('trend', 'sample_size', 'normal_shifted')
        np.testing.assert_almost_equal(
            df.loc[:, 'value'],
            np.column_stack(
                ([649, 1244, 1844, 2434, 3059, 3661, 4268, 4876, 5492, 6108],
                 [405, 806, 1184, 1546, 1923, 2292, 2698, 3090, 3504, 3892])),
            decimal=5)
        # check variant_mean
        df = result.statistic('trend', 'variant_mean', 'normal_shifted')
        np.testing.assert_almost_equal(df.loc[:, 'value'],
                                       np.column_stack(([
                                           0.005761, 0.030501, -0.001258,
                                           -0.000681, 0.018477, 0.004274,
                                           -0.000671, -0.008193, -0.005451,
                                           -0.005515
                                       ], [
                                           1.015182, 0.960308, 0.989830,
                                           1.002449, 0.994500, 0.999132,
                                           0.987496, 0.984926, 0.986120,
                                           0.985470
                                       ])),
                                       decimal=5)

        # check metadata is preserved
        np.testing.assert_equal(
            True,
            all(item in result.metadata.items()
                for item in self.testmetadata.items()))
Example #30
import numpy as np
import pandas as pd
from expan.core.experiment import Experiment

###################################################

df_before = pd.read_csv('nextgen_header.csv')
print("BEFORE")
print("data size:", len(df_before))
print("number of variants:", len(df_before['variant'].unique()))
print("number of client ids:", len(df_before['entity'].unique()))

exp = Experiment(control_variant_name='control',
                 data=df_before,
                 metadata={},
                 report_kpi_names=['revenue_per_user'],
                 derived_kpis=[{
                     'name': 'revenue_per_user',
                     'formula': 'revenue/users'
                 }])
print(exp.delta())

###################################################

df_after = pd.read_csv('nextgen_header_after_removal.csv')
print("AFTER")
print("data size:", len(df_after))
print("number of variants:", len(df_after['variant'].unique()))
print("number of client ids:", len(df_after['entity'].unique()))

exp = Experiment(control_variant_name='control',
                 data=df_after,