示例#1
0
    def test_drop_nan(self):
        """Check that util.drop_nan strips NaN entries from 1-D and 2-D arrays.

        Results are compared with np.array_equal, which returns False on a
        shape mismatch. Comparing with the builtin all() over ``a == b``
        depends on broadcasting and on the truthiness of each row, so a wrong
        shape shows up as a TypeError/ValueError instead of a test failure.
        """
        nan = float('nan')  # NaN

        # 1-D input: the NaN entries are removed, the remaining order is kept.
        array_with_nan_1d = np.array([nan, nan, 1, 2, 3])
        returned_array_1d = util.drop_nan(array_with_nan_1d)
        self.assertTrue(np.array_equal(returned_array_1d, np.array([1, 2, 3])))

        # 2-D column input: rows holding NaN are removed, the column shape is kept.
        array_with_nan_2d = np.array([[nan], [nan], [1], [2], [3]])
        returned_array_2d = util.drop_nan(array_with_nan_2d)
        self.assertTrue(
            np.array_equal(returned_array_2d, np.array([[1], [2], [3]])))

        # Input without NaN must come back unchanged.
        array_without_nan_1d = np.array([1, 2])
        returned_array = util.drop_nan(array_without_nan_1d)
        self.assertTrue(np.array_equal(returned_array, np.array([1, 2])))
示例#2
0
    def test_drop_nan(self):
        """Check that util.drop_nan strips NaN entries from 1-D and 2-D arrays.

        Results are compared with np.array_equal, which returns False on a
        shape mismatch. Comparing with the builtin all() over ``a == b``
        depends on broadcasting and on the truthiness of each row, so a wrong
        shape shows up as a TypeError/ValueError instead of a test failure.
        """
        nan = float('nan')   # NaN

        # 1-D input: the NaN entries are removed, the remaining order is kept.
        array_with_nan_1d = np.array([nan, nan, 1, 2, 3])
        returned_array_1d = util.drop_nan(array_with_nan_1d)
        self.assertTrue(np.array_equal(returned_array_1d, np.array([1, 2, 3])))

        # 2-D column input: rows holding NaN are removed, the column shape is kept.
        array_with_nan_2d = np.array([[nan], [nan], [1], [2], [3]])
        returned_array_2d = util.drop_nan(array_with_nan_2d)
        self.assertTrue(np.array_equal(returned_array_2d, np.array([[1], [2], [3]])))

        # Input without NaN must come back unchanged.
        array_without_nan_1d = np.array([1, 2])
        returned_array = util.drop_nan(array_without_nan_1d)
        self.assertTrue(np.array_equal(returned_array, np.array([1, 2])))
示例#3
0
def _bayes_sampling(x,
                    y,
                    distribution='normal',
                    num_iters=25000,
                    inference="sampling"):
    """ Helper function for bayesian sampling.

    Drops NaN values from both samples, fits the Stan model of the chosen
    distribution and returns the posterior draws together with the sample
    sizes and means. When the module-level flag ``cache_sampling_results`` is
    set, results are memoized in ``sampling_results``.

    :param x: sample of a treatment group
    :type  x: pd.Series, np.ndarray or list
    :param y: sample of a control group, must be of the same type as x
    :type  y: pd.Series, np.ndarray or list
    :param distribution: name of the KPI distribution model ('normal' or 'poisson'), which assumes a Stan model
                         file with the same name exists
    :type  distribution: str
    :param num_iters: number of iterations of sampling; only used by the 'sampling' method, the 'variational'
                      method always runs with iter=10000
    :type  num_iters: int
    :param inference: 'sampling' for MCMC sampling method or 'variational' for variational inference
    :type  inference: str

    :return: the posterior samples (keyed by parameter name for 'variational'; the result of ``fit.extract()``
             for 'sampling'), sample size of x, sample size of y, mean of x, mean of y (NaN values excluded)
    :rtype:  tuple

    :raises ValueError: if x or y is None, or if inference is not 'sampling' or 'variational'
    :raises TypeError: if x is not a Series, ndarray or list, or if x and y have different types
    :raises NotImplementedError: if distribution is not 'normal' or 'poisson'
    """
    # Checking if data was provided and it has correct format
    if x is None or y is None:
        raise ValueError('Please provide two non-empty samples.')
    if not isinstance(x, (pd.Series, np.ndarray, list)):
        raise TypeError('Please provide samples of type Series or list.')
    if type(x) is not type(y):
        raise TypeError('Please provide samples of the same type.')

    # Fail fast on unsupported options, before touching the cache or
    # compiling a model. An unknown inference method used to surface only at
    # the very end as an unbound local `traces`.
    if distribution not in ('normal', 'poisson'):
        raise NotImplementedError(
            'Unsupported distribution: {!r}.'.format(distribution))
    if inference not in ('sampling', 'variational'):
        raise ValueError(
            "Unsupported inference method: {!r}; expected 'sampling' or "
            "'variational'.".format(inference))

    logger.info(
        "Started running bayesian inference with {} procedure, treatment group of size {}, "
        "control group of size {}, {} distribution.".format(
            inference, len(x), len(y), distribution))

    # Coercing missing values to right format
    _x = np.array(x, dtype=float)
    _y = np.array(y, dtype=float)
    _x = drop_nan(_x)
    _y = drop_nan(_y)

    # The key uses the raw bytes of the data: str(ndarray) is abbreviated
    # with "..." for large arrays, so different samples could collide. The
    # distribution is part of the key so that results fitted with one model
    # are never returned for another.
    key = (_x.shape, _x.tobytes(), _y.shape, _y.tobytes(), distribution,
           num_iters, inference)

    if cache_sampling_results and key in sampling_results:
        return sampling_results[key]

    mu_x = np.nanmean(_x)
    mu_y = np.nanmean(_y)
    n_x = statx.sample_size(_x)
    n_y = statx.sample_size(_y)

    if distribution == 'poisson':
        # The samples are cast to int for the poisson model.
        data_x, data_y = _x.astype(int), _y.astype(int)
    else:
        data_x, data_y = _x, _y
    fit_data = {'Nc': n_y, 'Nt': n_x, 'x': data_x, 'y': data_y}

    model_file = __location__ + '/../models/' + distribution + '_kpi.stan'

    sm = get_or_compile_stan_model(model_file, distribution)

    if inference == "sampling":
        fit = sm.sampling(data=fit_data,
                          iter=num_iters,
                          chains=4,
                          n_jobs=1,
                          seed=1,
                          control={
                              'stepsize': 0.01,
                              'adapt_delta': 0.99
                          })
        traces = fit.extract()
    else:  # "variational", guaranteed by the validation above
        results_dict = sm.vb(data=fit_data, iter=10000)
        # Pair every parameter name with the values stored at the same index.
        traces = {
            para_name: np.array(results_dict['sampler_params'][i])
            for i, para_name in enumerate(
                results_dict['sampler_param_names'])
        }

    if cache_sampling_results:
        sampling_results[key] = (traces, n_x, n_y, mu_x, mu_y)

    logger.info(
        "Finished running bayesian inference with {} procedure, treatment group of size {}, "
        "control group of size {}, {} distribution.".format(
            inference, len(x), len(y), distribution))
    return traces, n_x, n_y, mu_x, mu_y
示例#4
0
def _bayes_sampling(x, y, distribution='normal', num_iters=25000, inference="sampling"):
    """ Helper function for bayesian sampling.

    Drops NaN values from both samples, fits the Stan model of the chosen distribution and returns the
    posterior draws together with the sample sizes and means. When the module-level flag
    ``cache_sampling_results`` is set, results are memoized in ``sampling_results``.

    :param x: sample of a treatment group
    :type  x: pd.Series, np.ndarray or list
    :param y: sample of a control group, must be of the same type as x
    :type  y: pd.Series, np.ndarray or list
    :param distribution: name of the KPI distribution model ('normal' or 'poisson'), which assumes a Stan model
                         file with the same name exists
    :type  distribution: str
    :param num_iters: number of iterations of sampling; only used by the 'sampling' method, the 'variational'
                      method always runs with iter=10000
    :type  num_iters: int
    :param inference: 'sampling' for MCMC sampling method or 'variational' for variational inference
    :type  inference: str

    :return: the posterior samples (keyed by parameter name for 'variational'; the result of ``fit.extract()``
             for 'sampling'), sample size of x, sample size of y, mean of x, mean of y (NaN values excluded)
    :rtype:  tuple

    :raises ValueError: if x or y is None, or if inference is not 'sampling' or 'variational'
    :raises TypeError: if x is not a Series, ndarray or list, or if x and y have different types
    :raises NotImplementedError: if distribution is not 'normal' or 'poisson'
    """
    # Checking if data was provided and it has correct format
    if x is None or y is None:
        raise ValueError('Please provide two non-empty samples.')
    if not isinstance(x, (pd.Series, np.ndarray, list)):
        raise TypeError('Please provide samples of type Series or list.')
    if type(x) is not type(y):
        raise TypeError('Please provide samples of the same type.')

    # Fail fast on unsupported options, before touching the cache or compiling a model. An unknown inference
    # method used to surface only at the very end as an unbound local `traces`.
    if distribution not in ('normal', 'poisson'):
        raise NotImplementedError('Unsupported distribution: {!r}.'.format(distribution))
    if inference not in ('sampling', 'variational'):
        raise ValueError("Unsupported inference method: {!r}; expected 'sampling' or "
                         "'variational'.".format(inference))

    logger.info("Started running bayesian inference with {} procedure, treatment group of size {}, "
                "control group of size {}, {} distribution.".format(inference, len(x), len(y), distribution))

    # Coercing missing values to right format
    _x = np.array(x, dtype=float)
    _y = np.array(y, dtype=float)
    _x = drop_nan(_x)
    _y = drop_nan(_y)

    # The key uses the raw bytes of the data: str(ndarray) is abbreviated with "..." for large arrays, so
    # different samples could collide. The distribution is part of the key so that results fitted with one
    # model are never returned for another.
    key = (_x.shape, _x.tobytes(), _y.shape, _y.tobytes(), distribution, num_iters, inference)

    if cache_sampling_results and key in sampling_results:
        return sampling_results[key]

    mu_x = np.nanmean(_x)
    mu_y = np.nanmean(_y)
    n_x = statx.sample_size(_x)
    n_y = statx.sample_size(_y)

    if distribution == 'poisson':
        # The samples are cast to int for the poisson model.
        data_x, data_y = _x.astype(int), _y.astype(int)
    else:
        data_x, data_y = _x, _y
    fit_data = {'Nc': n_y,
                'Nt': n_x,
                'x': data_x,
                'y': data_y}

    model_file = __location__ + '/../models/' + distribution + '_kpi.stan'

    sm = get_or_compile_stan_model(model_file, distribution)

    if inference == "sampling":
        fit = sm.sampling(data=fit_data, iter=num_iters, chains=4, n_jobs=1, seed=1,
                          control={'stepsize': 0.01, 'adapt_delta': 0.99})
        traces = fit.extract()
    else:  # "variational", guaranteed by the validation above
        results_dict = sm.vb(data=fit_data, iter=10000)
        # Pair every parameter name with the values stored at the same index.
        traces = {para_name: np.array(results_dict['sampler_params'][i])
                  for i, para_name in enumerate(results_dict['sampler_param_names'])}

    if cache_sampling_results:
        sampling_results[key] = (traces, n_x, n_y, mu_x, mu_y)

    logger.info("Finished running bayesian inference with {} procedure, treatment group of size {}, "
                "control group of size {}, {} distribution.".format(inference, len(x), len(y), distribution))
    return traces, n_x, n_y, mu_x, mu_y
示例#5
0
def _bayes_sampling(x,
                    y,
                    distribution='normal',
                    num_iters=25000,
                    inference="sampling"):
    """
    Helper function for bayesian sampling.

    Drops NaN values from both samples, fits the Stan model of the chosen
    distribution and returns the posterior draws together with the sample
    sizes and means. When the module-level flag ``cache_sampling_results`` is
    set, results are memoized in ``sampling_results``.

    Args:
        x (array_like): sample of a treatment group
        y (array_like): sample of a control group
        distribution: name of the KPI distribution model ('normal' or
            'poisson'), which assumes a Stan model file with the same name
            exists
        num_iters: number of iterations of sampling; only used by the
            'sampling' method, the 'variational' method always runs with
            iter=10000
        inference: 'sampling' for MCMC sampling or 'variational' for
            variational inference

    Returns:
        tuple:
            - the posterior samples (keyed by parameter name for
              'variational'; the result of ``fit.extract()`` for 'sampling')
            - sample size of x
            - sample size of y
            - mean of x (NaN values excluded)
            - mean of y (NaN values excluded)

    Raises:
        ValueError: if x or y is None, or if inference is not 'sampling' or
            'variational'
        NotImplementedError: if distribution is not 'normal' or 'poisson'
    """
    # Checking if data was provided
    if x is None or y is None:
        raise ValueError('Please provide two non-None samples.')

    # Fail fast on unsupported options, before touching the cache or
    # compiling a model. An unknown inference method used to surface only at
    # the very end as an unbound local `traces`.
    if distribution not in ('normal', 'poisson'):
        raise NotImplementedError(
            'Unsupported distribution: {!r}.'.format(distribution))
    if inference not in ('sampling', 'variational'):
        raise ValueError(
            "Unsupported inference method: {!r}; expected 'sampling' or "
            "'variational'.".format(inference))

    # Coercing missing values to right format
    _x = np.array(x, dtype=float)
    _y = np.array(y, dtype=float)
    _x = drop_nan(_x)
    _y = drop_nan(_y)

    # The key uses the raw bytes of the data: str(ndarray) is abbreviated
    # with "..." for large arrays, so different samples could collide. The
    # distribution is part of the key so that results fitted with one model
    # are never returned for another.
    key = (_x.shape, _x.tobytes(), _y.shape, _y.tobytes(), distribution,
           num_iters, inference)

    if cache_sampling_results and key in sampling_results:
        return sampling_results[key]

    mu_x = np.nanmean(_x)
    mu_y = np.nanmean(_y)
    n_x = statx.sample_size(_x)
    n_y = statx.sample_size(_y)

    if distribution == 'poisson':
        # The samples are cast to int for the poisson model.
        data_x, data_y = _x.astype(int), _y.astype(int)
    else:
        data_x, data_y = _x, _y
    fit_data = {'Nc': n_y, 'Nt': n_x, 'x': data_x, 'y': data_y}

    model_file = __location__ + '/../models/' + distribution + '_kpi.stan'

    sm = get_or_compile_stan_model(model_file, distribution)

    if inference == "sampling":
        fit = sm.sampling(data=fit_data,
                          iter=num_iters,
                          chains=4,
                          n_jobs=1,
                          seed=1,
                          control={
                              'stepsize': 0.01,
                              'adapt_delta': 0.99
                          })
        traces = fit.extract()
    else:  # "variational", guaranteed by the validation above
        results_dict = sm.vb(data=fit_data, iter=10000)
        # Pair every parameter name with the values stored at the same index.
        traces = {
            para_name: np.array(results_dict['sampler_params'][i])
            for i, para_name in enumerate(
                results_dict['sampler_param_names'])
        }

    if cache_sampling_results:
        sampling_results[key] = (traces, n_x, n_y, mu_x, mu_y)

    return traces, n_x, n_y, mu_x, mu_y