Exemplo n.º 1
0
def test_BGBB_fitting_time():
    """Benchmark BGBB fitting wall-clock time on compressed datasets of
    increasing size, printing the fitted params, timings, and data lengths."""
    T = 100
    sizes = [10, 100, 1000, 10000]
    params = {'alpha': 1.2, 'beta': 0.7, 'gamma': 0.6, 'delta': 2.7}

    # Pre-generate and compress one dataset per size.
    compressed_data = {
        size: compress_data(
            gen.bgbb_model(T,
                           params['alpha'],
                           params['beta'],
                           params['gamma'],
                           params['delta'],
                           size=size))
        for size in sizes
    }

    times, lengths = {}, {}
    for size in sizes:
        fitter = est.BGBBFitter()
        start_time = timeit.default_timer()
        fitter.fit(compressed_data[size]['frequency'],
                   compressed_data[size]['recency'],
                   compressed_data[size]['T'],
                   N=compressed_data[size]['N'],
                   initial_params=params.values(),
                   jac=False)
        times[size] = timeit.default_timer() - start_time
        lengths[size] = len(compressed_data[size])

    print(params)
    print(fitter.params_)
    print(times)
    print(lengths)
Exemplo n.º 2
0
def test_BGBB_fitting_compressed_or_not():
    """Fitting on raw data and on its compressed form must give (nearly)
    identical parameter estimates."""
    T = 10
    size = 1000
    params = {'alpha': 1.2, 'beta': 0.7, 'gamma': 0.6, 'delta': 2.7}

    data = gen.bgbb_model(T,
                          params['alpha'],
                          params['beta'],
                          params['gamma'],
                          params['delta'],
                          size=size)
    compressed_data = compress_data(data)

    fitter = est.BGBBFitter()
    fitter_compressed = est.BGBBFitter()

    fitter.fit(data['frequency'],
               data['recency'],
               data['T'],
               initial_params=params.values())
    fitter_compressed.fit(compressed_data['frequency'],
                          compressed_data['recency'],
                          compressed_data['T'],
                          N=compressed_data['N'],
                          initial_params=params.values())

    print(params)
    print(fitter.params_)
    print(fitter_compressed.params_)

    for par_name in params:
        diff = fitter.params_[par_name] - fitter_compressed.params_[par_name]
        assert math.fabs(diff) < 0.00001
Exemplo n.º 3
0
def test_BGBB_integration_in_models_with_uncertainties():
    """End-to-end BGBBModel fit with bootstrap; uncertain expectations must be
    non-negative and P(X(t)=n) must form a valid distribution."""
    T = 10
    size = 100
    params = {'alpha': 1.2, 'beta': 0.7, 'gamma': 0.6, 'delta': 2.7}

    data = compress_data(
        gen.bgbb_model(T,
                       params['alpha'],
                       params['beta'],
                       params['gamma'],
                       params['delta'],
                       size=size))

    model = models.BGBBModel()
    model.fit(data['frequency'],
              data['recency'],
              data['T'],
              bootstrap_size=10,
              N=data['N'],
              initial_params=params.values())

    print("Generation params")
    print(params)

    print("Fitted params")
    print(model.params)
    print(model.params_C)

    print("Uncertain parameters")
    print(model.uparams)

    print("E[X(t)] as a function of t")
    for t in (0, 1, 10, 100, 1000, 10000):
        uEx = model.expected_number_of_purchases_up_to_time(t)
        print(t, uEx)
        # Both nominal value and std-dev are allowed a tiny numerical slack.
        assert uEx.n >= -0.0001
        assert uEx.s >= -0.0001

    t = 10
    print("E[X(t) = n] as a function of n, t = " + str(t))
    tot_prob = 0.0
    for n in range(t + 1):
        prob = model.fitter.probability_of_n_purchases_up_to_time(t, n)
        print(n, prob)
        tot_prob += prob
        assert 0 <= prob <= 1

        uprob = model.probability_of_n_purchases_up_to_time(t, n)
        print(uprob)
        assert is_almost_equal(uprob.n, prob)

    assert math.fabs(tot_prob - 1.0) < 0.00001
Exemplo n.º 4
0
def test_BGBB_fitting_with_different_T_windows():
    """Pool BGBB cohorts over observation lengths 1..100, then re-fit on
    [T1, T1+delta] sub-windows and sanity-check the estimates.

    Fixes vs. original:
    - the cohort loop started at index 2, silently skipping the T[1]=2 cohort;
      all cohorts are now generated;
    - uses ``pd.concat`` (``DataFrame.append`` was removed in pandas 2.0) with
      ``ignore_index=True``, which replaces the manual index reset.
    """
    size = 10
    params = {'alpha': 1.2, 'beta': 0.7, 'gamma': 0.6, 'delta': 2.7}

    est_params = {}

    T = range(1, 100 + 1)
    # One cohort per observation length, concatenated with a fresh index.
    data = pd.concat(
        [gen.bgbb_model(Ti,
                        params['alpha'],
                        params['beta'],
                        params['gamma'],
                        params['delta'],
                        size=size) for Ti in T],
        ignore_index=True)

    data = compress_data(data)

    fitter = est.BGBBFitter()

    T1s = [1, 15, 30, 45, 60, 75]
    deltas = [30, 50, 60]
    for T1 in T1s:
        est_params[T1] = {}
        for delta in deltas:
            T2 = T1 + delta
            filtered_data = filter_data_by_T(data, T1, T2)
            fitter.fit(filtered_data['frequency'],
                       filtered_data['recency'],
                       filtered_data['T'],
                       N=filtered_data['N'])
            est_params[T1][delta] = fitter.params_

    print(est_params)

    for T1 in T1s:
        for delta in deltas:
            current_params = est_params[T1][delta]
            for par_name in ('alpha', 'beta', 'gamma', 'delta'):
                assert par_name in current_params

            # Very loose sanity band: within 6x of the true alpha.
            assert math.fabs(current_params['alpha'] -
                             params['alpha']) < 6 * params['alpha']
Exemplo n.º 5
0
def test_BGBB_additional_functions():
    """Exercise BGBBFitter extras: E[X(t)], its error propagation, and
    P(X(t)=n); probabilities must form a valid distribution.

    Fix vs. original: the covariance matrix fed to
    ``expected_number_of_purchases_up_to_time_error`` is loop-invariant but
    was rebuilt on every iteration — it is now built once before the loop.
    """
    T = 10
    size = 100
    params = {'alpha': 1.2, 'beta': 0.7, 'gamma': 0.6, 'delta': 2.7}

    data = gen.bgbb_model(T,
                          params['alpha'],
                          params['beta'],
                          params['gamma'],
                          params['delta'],
                          size=size)

    data = compress_data(data)

    fitter = est.BGBBFitter()

    fitter.fit(data['frequency'],
               data['recency'],
               data['T'],
               N=data['N'],
               initial_params=params.values())

    print("Generation params")
    print(params)

    print("Fitted params")
    print(fitter.params_)

    # Sample covariance of the 4x4 identity rows — hoisted out of the loop.
    covariance_matrix = np.cov(
        np.vstack([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]))

    print("E[X(t)] as a function of t")
    for t in [0, 1, 10, 100, 1000, 10000]:
        Ex = fitter.expected_number_of_purchases_up_to_time(t)
        Ex_err = fitter.expected_number_of_purchases_up_to_time_error(
            t, covariance_matrix)
        print(t, Ex, Ex_err)
        assert Ex >= 0
        assert Ex_err >= 0

    t = 10
    print("E[X(t) = n] as a function of n, t = " + str(t))
    tot_prob = 0.0
    for n in range(t + 1):
        prob = fitter.probability_of_n_purchases_up_to_time(t, n)
        print(n, prob)
        tot_prob += prob
        assert 1 >= prob >= 0

    assert math.fabs(tot_prob - 1.0) < 0.00001
Exemplo n.º 6
0
def test_goodness_of_test_BGBB():
    """Goodness-of-fit must accept resampled data from the generating model
    and reject data generated with clearly wrong parameters."""
    params = {'alpha': 0.32, 'beta': 0.85, 'gamma': 5, 'delta': 3}

    gen_data = compress_data(gen.bgbb_model(T=sample_T, size=100, **params))
    # Build a test set by resampling the counts multinomially.
    test_data = gen_data.copy(deep=True)
    test_data['N'] = multinomial_sample(gen_data['N'])
    assert goodness_of_test(gen_data,
                            fitter_class=est.BGBBFitter,
                            verbose=True,
                            test_data=test_data)

    wrong_params = {'alpha': 3, 'beta': 0.1, 'gamma': 5, 'delta': 0.2}
    wrong_test_data = gen.bgbb_model(T=sample_T,
                                     size=100,
                                     compressed=True,
                                     **wrong_params)
    assert not goodness_of_test(gen_data,
                                fitter_class=est.BGBBFitter,
                                verbose=True,
                                test_data=wrong_test_data)
Exemplo n.º 7
0
def test_BGBB_correlations_preserved():
    """Correlations between uncertain parameters must survive arithmetic on
    them and propagate into derived quantities."""
    T = 10
    size = 100
    params = {'alpha': 1.2, 'beta': 0.7, 'gamma': 0.6, 'delta': 2.7}

    data = compress_data(
        gen.bgbb_model(T,
                       params['alpha'],
                       params['beta'],
                       params['gamma'],
                       params['delta'],
                       size=size))

    model = models.BGBBModel()
    model.fit(data['frequency'],
              data['recency'],
              data['T'],
              bootstrap_size=10,
              N=data['N'],
              initial_params=params.values())

    print("Generation params")
    print(params)

    print("Fitted params")
    print(model.params)
    print(model.params_C)

    print("Uncertain parameters")
    print(model.uparams)

    alpha_u = model.uparams['alpha']
    # A variable is perfectly correlated with itself...
    assert is_almost_equal(correlation_matrix([alpha_u, alpha_u])[0, 1], 1.0)
    # ...and adding independent noise keeps the correlation in (0, 1).
    assert 1.0 > correlation_matrix([alpha_u + ufloat(1, 1),
                                     alpha_u])[0, 1] > 0.0

    # stub of profile
    p1 = model.expected_number_of_purchases_up_to_time(1)
    p2 = model.expected_number_of_purchases_up_to_time(2)

    assert 1.0 > correlation_matrix([p1, p2])[0, 1] > 0.0
Exemplo n.º 8
0
def test_BGBB_likelihood_compressed(T, size):
    """Compare (and time) the C and pure-Python BGBB negative log-likelihoods
    on the same compressed dataset.

    Fixes vs. original:
    - the parameter ``T`` was shadowed by a ctypes array mid-function — the
      ctypes buffers now use distinct names;
    - ``int_n_size_array`` misleadingly named a ``c_float`` array — renamed;
    - the Python likelihood is fed ``data['N']`` directly, consistent with the
      other pandas columns (the original passed it the ctypes ``N`` buffer).

    Parameters
    ----------
    T, size : forwarded to the data generator (looks like a parametrized
        helper rather than a plain test — pytest would need a fixture here).
    """
    params = {'alpha': 0.56, 'beta': 1.17, 'gamma': 0.38, 'delta': 1.13}

    data = gen.bgbb_model(T,
                          params['alpha'],
                          params['beta'],
                          params['gamma'],
                          params['delta'],
                          size=size)

    data = compress_data(data)

    model = models.BGBBModel()

    n = len(data)
    n_samples = ct.c_int(n)
    float_n_array = ct.c_float * n

    # ctypes buffers for the C implementation.
    x = float_n_array(*data['frequency'])
    tx = float_n_array(*data['recency'])
    T_arr = float_n_array(*data['T'])
    N_arr = float_n_array(*data['N'])

    start_c = timeit.default_timer()
    likelihood_c = model.fitter._c_negative_log_likelihood(
        params.values(), x, tx, T_arr, N_arr, n_samples)
    c_time = timeit.default_timer() - start_c
    print("C_time: " + str(c_time))
    print("Likelihood: " + str(likelihood_c))

    start_py = timeit.default_timer()
    likelihood_py = model.fitter._negative_log_likelihood(params.values(),
                                                          data['frequency'],
                                                          data['recency'],
                                                          data['T'],
                                                          penalizer_coef=0,
                                                          N=data['N'])
    py_time = timeit.default_timer() - start_py
    print("Py_time: " + str(py_time))
    print("Likelihood: " + str(likelihood_py))
Exemplo n.º 9
0
def test_BGBB_fitting_with_jacobian():
    """Fit the same compressed dataset twice — without and with the analytic
    jacobian — and print both parameter sets for comparison."""
    T = 50
    size = 5000
    params = {'alpha': 1.2, 'beta': 0.7, 'gamma': 0.6, 'delta': 2.7}

    compressed_data = compress_data(
        gen.bgbb_model(T,
                       params['alpha'],
                       params['beta'],
                       params['gamma'],
                       params['delta'],
                       size=size))

    fitter = est.BGBBFitter()

    for use_jac, label in ((False, "Without jacobian:"),
                           (True, "With jacobian:")):
        fitter.fit(compressed_data['frequency'],
                   compressed_data['recency'],
                   compressed_data['T'],
                   N=compressed_data['N'],
                   initial_params=params.values(),
                   jac=use_jac)
        print(params)
        print(label)
        print(fitter.params_)
Exemplo n.º 10
0
def test_BGBBBGExt_fitting_on_simulated_quite_real_looking_data():
    """Simulation study: generate multi-cohort conversion + ARPPU data, fit
    BGBBBGExt / BGBB models, derive lifetime/conversion/ARPD estimates with
    uncertainties, and dump per-run and per-size summaries to CSV.

    Fixes vs. original:
    - ``size`` arguments were computed with true division (floats in
      Python 3); floor division is used so generators get integer sizes;
    - the success check called ``math.isnan(last_arpd)`` before testing for
      ``None``, which raises TypeError when the value IS None — the None
      check now comes first.

    NOTE(review): writes to a hard-coded user home directory, and references
    ``true_mv`` which is not defined in this function — presumably a
    module-level constant; confirm before running in isolation.
    """
    T = 10
    T_lagged = 0
    T0 = 52

    sizes_installs = [500, 1000, 5000, 10000, 25000, 50000, 100000]
    n_sim = 10
    iterative_fitting = 0
    penalizer_coef = 0.1

    # True generation parameters — constant across all simulations, so they
    # are set up once instead of inside the per-simulation loop.
    a, b, g, d, e, z, c0, a2, b2, g2, d2 = 1.13, 0.32, 0.63, 3.2, 0.05, 6.78, 0.04, 0.33, 1.75, 1.88, 7.98
    params_conversion = {'alpha': a, 'beta': b, 'gamma': g, 'delta': d, 'epsilon': e, 'zeta': z, 'c0': c0}
    params_arppu = {'alpha': a2, 'beta': b2, 'gamma': g2, 'delta': d2}

    success_fit_ratio = {}
    success_fit = {}
    arpd_estimates = {}
    lifetime_estimates = {}
    conversion_estimates = {}
    appd_estimates = {}
    apppu_estimates = {}
    arppu_estimates = {}

    for size_installs in sizes_installs:

        success_fit[size_installs] = 0
        arpd_estimates[size_installs] = []
        lifetime_estimates[size_installs] = []
        conversion_estimates[size_installs] = []
        appd_estimates[size_installs] = []
        apppu_estimates[size_installs] = []
        arppu_estimates[size_installs] = []

        for n in range(n_sim):

            # Integer sizes via floor division (Python 3 `/` yields floats,
            # which the generators reject as a sample size).
            size_purchasers = size_installs // 20
            n_cohorts = T - T_lagged + 1
            cohort_installs = size_installs // n_cohorts
            cohort_purchasers = size_purchasers // n_cohorts

            # First cohort observed for the full window T.
            data_conversion = gen.bgbbbgext_model(T, params_conversion['alpha'], params_conversion['beta'],
                                                  params_conversion['gamma'], params_conversion['delta'],
                                                  params_conversion['epsilon'],
                                                  params_conversion['zeta'], params_conversion['c0'],
                                                  size=cohort_installs,
                                                  time_first_purchase=True)

            data_arppu = gen.bgbb_model(T, params_arppu['alpha'], params_arppu['beta'],
                                        params_arppu['gamma'], params_arppu['delta'],
                                        size=cohort_purchasers)

            # Lagged cohorts with progressively shorter observation windows.
            for Ti in range(T_lagged, T):
                data_conversion_new = gen.bgbbbgext_model(Ti, params_conversion['alpha'], params_conversion['beta'],
                                                          params_conversion['gamma'], params_conversion['delta'],
                                                          params_conversion['epsilon'],
                                                          params_conversion['zeta'], params_conversion['c0'],
                                                          size=cohort_installs,
                                                          time_first_purchase=True)
                data_arppu_new = gen.bgbb_model(Ti, params_arppu['alpha'], params_arppu['beta'],
                                                params_arppu['gamma'], params_arppu['delta'],
                                                size=cohort_purchasers)
                data_conversion = pd.concat([data_conversion, data_conversion_new])
                data_arppu = pd.concat([data_arppu, data_arppu_new])

            mv_values = gen.sample_monetary_values(size_purchasers)

            compressed_data_conversion = compress_session_session_before_conversion_data(data_conversion)
            compressed_data_arppu = compress_data(data_arppu)

            model_conversion = mod.BGBBBGExtModel(penalizer_coef)
            model_conversion.fit(frequency=compressed_data_conversion['frequency'],
                                 recency=compressed_data_conversion['recency'], T=compressed_data_conversion['T'],
                                 frequency_before_conversion=compressed_data_conversion['frequency_before_conversion'],
                                 N=compressed_data_conversion['N'], initial_params=params_conversion.values(),
                                 iterative_fitting=iterative_fitting)

            model_arppu = mod.BGBBModel(penalizer_coef)
            model_arppu.fit(frequency=compressed_data_arppu['frequency'], recency=compressed_data_arppu['recency'],
                            T=compressed_data_arppu['T'],
                            N=compressed_data_arppu['N'], initial_params=params_arppu.values(),
                            iterative_fitting=iterative_fitting)

            # Mean monetary value with the standard error of the mean.
            mv = ufloat(np.mean(mv_values), np.std(mv_values) / math.sqrt(len(mv_values)))

            print("Conversion parameters")
            print(params_conversion)
            print(model_conversion.params)
            print(model_conversion.uparams)

            print("Arppu parameters")
            print(params_arppu)
            print(model_arppu.params)
            print(model_arppu.uparams)

            print("Monetary values")
            print(mv)

            # Derived per-time-step quantities as (nominal, error) tuples.
            ts = range(T0)
            lifetime = [ufloat_to_tuple(model_conversion.expected_number_of_sessions_up_to_time(t)) for t in ts]
            conversion_diff = [ufloat_to_tuple(model_conversion.expected_probability_of_converting_at_time(t)) for t in ts]
            conversion = [ufloat_to_tuple(model_conversion.expected_probability_of_converting_within_time(t)) for t in ts]
            apppu = [ufloat_to_tuple(model_arppu.expected_number_of_purchases_up_to_time(t)) for t in ts]
            arppu = [ufloat_to_tuple((1.0 + apppu[i][0]) * mv) for i in range(len(apppu))]
            appd = [ufloat_to_tuple(get_arpd_retention(model_conversion, model_arppu, t)) for t in ts]
            arpd = [ufloat_to_tuple(appd[i][0] * mv) for i in range(len(appd))]

            print(ts)
            print(lifetime)
            print(conversion_diff)
            print(conversion)
            print(apppu)
            print(arppu)
            print(appd)
            print(arpd)

            summary_df = pd.DataFrame({
                'lifetime': [v[0] + 1 for v in lifetime],
                'lifetime_err': [v[1] for v in lifetime],
                'conversion_diff': [v[0] for v in conversion_diff],
                'conversion_diff_err': [v[1] for v in conversion_diff],
                'conversion': [v[0] for v in conversion],
                'conversion_err': [v[1] for v in conversion],
                'apppu': [v[0] + 1 for v in apppu],
                'apppu_err': [v[1] for v in apppu],
                'arppu': [v[0] for v in arppu],
                'arppu_err': [v[1] for v in arppu],
                'appd': [v[0] for v in appd],
                'appd_err': [v[1] for v in appd],
                'arpd': [v[0] for v in arpd],
                'arpd_err': [v[1] for v in arpd],
                'true_lifetime': [get_true_lifetime(a, b, g, d, t) for t in ts],
                'true_conversion': [get_true_conversion(a, b, g, d, e, z, c0, t) for t in ts],
                'true_apppu': [get_true_apppu(a2, b2, g2, d2, t) for t in ts],
                'true_arppu': [get_true_arppu(a2, b2, g2, d2, true_mv, t) for t in ts],
                'true_appd': [get_true_appd(a, b, g, d, e, z, c0, a2, b2, g2, d2, t) for t in ts],
                'true_arpd': [get_true_arpd(a, b, g, d, e, z, c0, a2, b2, g2, d2, true_mv, t) for t in ts],
            })

            with open("/Users/marcomeneghelli/Desktop/arpd_simulations/" + str(size_installs) + "/pars_simdata_" + str(
                    size_installs) + "_" + str(
                T) + "iterative_fitting" + str(iterative_fitting) + "_" + str(n) + ".txt", "w") as text_file:
                text_file.write(str(model_conversion.params))
                text_file.write(str(model_arppu.params))
                text_file.write(str((mv)))

            summary_df.to_csv(
                "/Users/marcomeneghelli/Desktop/arpd_simulations/" + str(size_installs) + "/arpd_simdata_" + str(
                    size_installs) + "_" + str(
                    T) + "iterative_fitting" + str(iterative_fitting) + "_" + str(n) + ".csv")

            last_arpd = arpd[-1][0]
            last_lifetime = lifetime[-1][0]
            last_conversion = conversion[-1][0]
            last_appd = appd[-1][0]
            last_apppu = apppu[-1][0]
            last_arppu = arppu[-1][0]

            # None must be checked BEFORE isnan: math.isnan(None) raises
            # TypeError, so the original order could crash instead of skipping.
            if last_arpd is not None and not math.isnan(last_arpd):
                success_fit[size_installs] += 1

            arpd_estimates[size_installs].append(last_arpd)
            lifetime_estimates[size_installs].append(last_lifetime)
            conversion_estimates[size_installs].append(last_conversion)
            appd_estimates[size_installs].append(last_appd)
            apppu_estimates[size_installs].append(last_apppu)
            arppu_estimates[size_installs].append(last_arppu)

        success_fit_ratio[size_installs] = float(success_fit[size_installs]) / n_sim

        summary_size_installs_df = pd.DataFrame({
            'success_fit_ratio': success_fit_ratio[size_installs],
            'arpd_estimates': arpd_estimates[size_installs],
            'lifetime_estimates': lifetime_estimates[size_installs],
            'conversion_estimates': conversion_estimates[size_installs],
            'appd_estimates': appd_estimates[size_installs],
            'apppu_estimates': apppu_estimates[size_installs],
            'arppu_estimates': arppu_estimates[size_installs],
        })

        summary_size_installs_df.to_csv(
            "/Users/marcomeneghelli/Desktop/arpd_simulations/arpd_simdata_last_measurements_" + str(
                size_installs) + "_" + str(
                T) + "iterative_fitting" + str(iterative_fitting) + ".csv")