def test_drop_nan(self):
    """Check that util.drop_nan strips NaN entries from 1-D and 2-D arrays.

    Arrays are compared with np.array_equal so that a wrong shape or a
    wrong value yields a plain assertion failure instead of a broadcasting
    or truth-value error raised by ``all(a == b)``.
    """
    nan = float('nan')

    # 1-D input: the two NaN entries are expected to disappear.
    array_with_nan_1d = np.array([nan, nan, 1, 2, 3])
    returned_array_1d = util.drop_nan(array_with_nan_1d)
    self.assertTrue(np.array_equal(returned_array_1d, np.array([1, 2, 3])))

    # 2-D column input: the rows holding NaN are expected to disappear.
    array_with_nan_2d = np.array([[nan], [nan], [1], [2], [3]])
    returned_array_2d = util.drop_nan(array_with_nan_2d)
    self.assertTrue(np.array_equal(returned_array_2d, np.array([[1], [2], [3]])))

    # Input without NaN must come back unchanged.
    array_without_nan_1d = np.array([1, 2])
    returned_array = util.drop_nan(array_without_nan_1d)
    self.assertTrue(np.array_equal(returned_array, np.array([1, 2])))
def _bayes_sampling(x, y, distribution='normal', num_iters=25000, inference="sampling"): """ Helper function for bayesian sampling. :param x: sample of a treatment group :type x: pd.Series or list (array-like) :param y: sample of a control group :type y: pd.Series or list (array-like) :param distribution: name of the KPI distribution model, which assumes a Stan model file with the same name exists :type distribution: str :param num_iters: number of iterations of sampling :type num_iters: int :param inference: 'sampling' for MCMC sampling method or 'variational' for variational inference :type inference: str :return: the posterior samples, sample size of x, sample size of y, absolute mean of x, absolute mean of y :rtype: tuple[array-like, array-like, array-like, float, float] """ # Checking if data was provided and it has correct format if x is None or y is None: raise ValueError('Please provide two non-empty samples.') if not isinstance(x, pd.Series) and not isinstance( x, np.ndarray) and not isinstance(x, list): raise TypeError('Please provide samples of type Series or list.') if type(x) != type(y): raise TypeError('Please provide samples of the same type.') logger.info( "Started running bayesian inference with {} procedure, treatment group of size {}, " "control group of size {}, {} distribution.".format( inference, len(x), len(y), distribution, inference)) # Coercing missing values to right format _x = np.array(x, dtype=float) _y = np.array(y, dtype=float) _x = drop_nan(_x) _y = drop_nan(_y) key = (str(_x), str(_y), num_iters, inference) if cache_sampling_results and key in sampling_results: return sampling_results[key] mu_x = np.nanmean(_x) mu_y = np.nanmean(_y) n_x = statx.sample_size(_x) n_y = statx.sample_size(_y) if distribution == 'normal': fit_data = {'Nc': n_y, 'Nt': n_x, 'x': _x, 'y': _y} elif distribution == 'poisson': fit_data = { 'Nc': n_y, 'Nt': n_x, 'x': _x.astype(int), 'y': _y.astype(int) } else: raise NotImplementedError model_file = __location__ 
+ '/../models/' + distribution + '_kpi.stan' sm = get_or_compile_stan_model(model_file, distribution) if inference == "sampling": fit = sm.sampling(data=fit_data, iter=num_iters, chains=4, n_jobs=1, seed=1, control={ 'stepsize': 0.01, 'adapt_delta': 0.99 }) traces = fit.extract() elif inference == "variational": results_dict = sm.vb(data=fit_data, iter=10000) traces = {} for i in range(len(results_dict['sampler_param_names'])): para_name = results_dict['sampler_param_names'][i] para_values = np.array(results_dict['sampler_params'][i]) traces[para_name] = para_values if cache_sampling_results: sampling_results[key] = (traces, n_x, n_y, mu_x, mu_y) logger.info( "Finished running bayesian inference with {} procedure, treatment group of size {}, " "control group of size {}, {} distribution.".format( inference, len(x), len(y), distribution)) return traces, n_x, n_y, mu_x, mu_y
def _bayes_sampling(x, y, distribution='normal', num_iters=25000, inference="sampling"): """ Helper function for bayesian sampling. :param x: sample of a treatment group :type x: pd.Series or list (array-like) :param y: sample of a control group :type y: pd.Series or list (array-like) :param distribution: name of the KPI distribution model, which assumes a Stan model file with the same name exists :type distribution: str :param num_iters: number of iterations of sampling :type num_iters: int :param inference: 'sampling' for MCMC sampling method or 'variational' for variational inference :type inference: str :return: the posterior samples, sample size of x, sample size of y, absolute mean of x, absolute mean of y :rtype: tuple[array-like, array-like, array-like, float, float] """ # Checking if data was provided and it has correct format if x is None or y is None: raise ValueError('Please provide two non-empty samples.') if not isinstance(x, pd.Series) and not isinstance(x, np.ndarray) and not isinstance(x, list): raise TypeError('Please provide samples of type Series or list.') if type(x) != type(y): raise TypeError('Please provide samples of the same type.') logger.info("Started running bayesian inference with {} procedure, treatment group of size {}, " "control group of size {}, {} distribution.".format(inference, len(x), len(y), distribution, inference)) # Coercing missing values to right format _x = np.array(x, dtype=float) _y = np.array(y, dtype=float) _x = drop_nan(_x) _y = drop_nan(_y) key = (str(_x), str(_y), num_iters, inference) if cache_sampling_results and key in sampling_results: return sampling_results[key] mu_x = np.nanmean(_x) mu_y = np.nanmean(_y) n_x = statx.sample_size(_x) n_y = statx.sample_size(_y) if distribution == 'normal': fit_data = {'Nc': n_y, 'Nt': n_x, 'x': _x, 'y': _y} elif distribution == 'poisson': fit_data = {'Nc': n_y, 'Nt': n_x, 'x': _x.astype(int), 'y': _y.astype(int)} else: raise NotImplementedError model_file = __location__ + 
'/../models/' + distribution + '_kpi.stan' sm = get_or_compile_stan_model(model_file, distribution) if inference == "sampling": fit = sm.sampling(data=fit_data, iter=num_iters, chains=4, n_jobs=1, seed=1, control={'stepsize': 0.01, 'adapt_delta': 0.99}) traces = fit.extract() elif inference == "variational": results_dict = sm.vb(data=fit_data, iter=10000) traces = {} for i in range(len(results_dict['sampler_param_names'])): para_name = results_dict['sampler_param_names'][i] para_values = np.array(results_dict['sampler_params'][i]) traces[para_name] = para_values if cache_sampling_results: sampling_results[key] = (traces, n_x, n_y, mu_x, mu_y) logger.info("Finished running bayesian inference with {} procedure, treatment group of size {}, " "control group of size {}, {} distribution.".format(inference, len(x), len(y), distribution)) return traces, n_x, n_y, mu_x, mu_y
def _bayes_sampling(x, y, distribution='normal', num_iters=25000, inference="sampling"): """ Helper function. Args: x (array_like): sample of a treatment group y (array_like): sample of a control group distribution: name of the KPI distribution model, which assumes a Stan model file with the same name exists num_iters: number of iterations of sampling Returns: tuple: - the posterior samples - sample size of x - sample size of y - absolute mean of x - absolute mean of y """ # Checking if data was provided if x is None or y is None: raise ValueError('Please provide two non-None samples.') # Coercing missing values to right format _x = np.array(x, dtype=float) _y = np.array(y, dtype=float) _x = drop_nan(_x) _y = drop_nan(_y) key = (str(_x), str(_y), num_iters, inference) if cache_sampling_results and key in sampling_results: return sampling_results[key] mu_x = np.nanmean(_x) mu_y = np.nanmean(_y) n_x = statx.sample_size(_x) n_y = statx.sample_size(_y) if distribution == 'normal': fit_data = {'Nc': n_y, 'Nt': n_x, 'x': _x, 'y': _y} elif distribution == 'poisson': fit_data = { 'Nc': n_y, 'Nt': n_x, 'x': _x.astype(int), 'y': _y.astype(int) } else: raise NotImplementedError model_file = __location__ + '/../models/' + distribution + '_kpi.stan' sm = get_or_compile_stan_model(model_file, distribution) if inference == "sampling": fit = sm.sampling(data=fit_data, iter=num_iters, chains=4, n_jobs=1, seed=1, control={ 'stepsize': 0.01, 'adapt_delta': 0.99 }) traces = fit.extract() elif inference == "variational": results_dict = sm.vb(data=fit_data, iter=10000) traces = {} for i in range(len(results_dict['sampler_param_names'])): para_name = results_dict['sampler_param_names'][i] para_values = np.array(results_dict['sampler_params'][i]) traces[para_name] = para_values if cache_sampling_results: sampling_results[key] = (traces, n_x, n_y, mu_x, mu_y) return traces, n_x, n_y, mu_x, mu_y