示例#1
0
def test_calibration_and_holdout_data_throws_better_error_if_observation_period_end_is_too_early(
        large_transaction_level_data):
    """Expect a descriptive ValueError when both period ends are set to early-2014 dates."""
    # max date is 2015-02-02
    observation_end = "2014-02-07"
    cal_end = "2014-02-01"
    expected_message = "There is no data available"

    with pytest.raises(ValueError, match=expected_message):
        utils.calibration_and_holdout_data(
            large_transaction_level_data,
            "id",
            "date",
            cal_end,
            observation_period_end=observation_end,
        )
示例#2
0
    def test_plot_calibration_purchases_vs_holdout_purchases_time_since_last_purchase(self, transaction_data, bgf):
        """Draw the calibration-vs-holdout plot keyed on time since last purchase.

        Returns the current matplotlib figure (presumably for an image
        comparison plugin -- confirm against the test configuration).
        """
        cal_holdout = utils.calibration_and_holdout_data(
            transaction_data, 'id', 'date', '2014-09-01', '2014-12-31')
        bgf.fit(
            cal_holdout['frequency_cal'],
            cal_holdout['recency_cal'],
            cal_holdout['T_cal'],
        )

        # Open a fresh figure so the plot does not land on a previous test's axes.
        plt.figure()
        plotting.plot_calibration_purchases_vs_holdout_purchases(
            bgf, cal_holdout, kind='time_since_last_purchase')
        current_figure = plt.gcf()
        return current_figure
示例#3
0
    def test_plot_calibration_purchases_vs_holdout_purchases_time_since_last_purchase(
            self, transaction_data, bgf):
        """Check line data, legend, title and axis labels for kind='time_since_last_purchase'."""
        expected_holdout = [3.954, 3.431, 3.482, 3.484, 2.75, 2.289, 1.968]
        expected_predictions = [
            4.345, 2.993, 3.236, 2.677, 2.240, 2.608, 2.430
        ]
        expected_legend = ['frequency_holdout', 'model_predictions']

        cal_holdout = utils.calibration_and_holdout_data(
            transaction_data, 'id', 'date', '2014-09-01', '2014-12-31')
        bgf.fit(
            cal_holdout['frequency_cal'],
            cal_holdout['recency_cal'],
            cal_holdout['T_cal'],
        )

        ax = plotting.plot_calibration_purchases_vs_holdout_purchases(
            bgf, cal_holdout, kind='time_since_last_purchase')

        # get_data() returns (x, y); index 1 selects the y-values of each line.
        drawn_lines = ax.lines
        holdout_y = drawn_lines[0].get_data()[1]
        predicted_y = drawn_lines[1].get_data()[1]

        assert_allclose(holdout_y, expected_holdout, atol=0.01)
        assert_allclose(predicted_y, expected_predictions, atol=0.01)

        legend_texts = [entry.get_text() for entry in ax.legend_.get_texts()]
        assert_array_equal(legend_texts, expected_legend)

        title_text = ax.title.get_text()
        assert_equal(
            title_text,
            "Actual Purchases in Holdout Period vs Predicted Purchases")

        x_label_text = ax.xaxis.get_label().get_text()
        assert_equal(x_label_text, "Time since user made last purchase")

        y_label_text = ax.yaxis.get_label().get_text()
        assert_equal(y_label_text, "Average of Purchases in Holdout Period")

        plt.close()
示例#4
0
    def test_plot_calibration_purchases_vs_holdout_purchases(
            self, transaction_data, bgf):
        """Check line data, legend, title and axis labels for the default plot kind."""
        expected_holdout = [0.161, 0.233, 0.348, 0.544, 0.710, 0.704, 1.606]
        expected_predictions = [
            0.270, 0.294, 0.402, 0.422, 0.706, 0.809, 1.019
        ]
        expected_legend = ['frequency_holdout', 'model_predictions']

        cal_holdout = utils.calibration_and_holdout_data(
            transaction_data, 'id', 'date', '2014-09-01', '2014-12-31')
        bgf.fit(
            cal_holdout['frequency_cal'],
            cal_holdout['recency_cal'],
            cal_holdout['T_cal'],
        )

        ax = plotting.plot_calibration_purchases_vs_holdout_purchases(
            bgf, cal_holdout)

        # get_data() returns (x, y); index 1 selects the y-values of each line.
        drawn_lines = ax.lines
        holdout_y = drawn_lines[0].get_data()[1]
        predicted_y = drawn_lines[1].get_data()[1]

        assert_allclose(holdout_y, expected_holdout, atol=0.01)
        assert_allclose(predicted_y, expected_predictions, atol=0.01)

        legend_texts = [entry.get_text() for entry in ax.legend_.get_texts()]
        assert_array_equal(legend_texts, expected_legend)

        title_text = ax.title.get_text()
        assert_equal(
            title_text,
            "Actual Purchases in Holdout Period vs Predicted Purchases")

        x_label_text = ax.xaxis.get_label().get_text()
        assert_equal(x_label_text, "Purchases in calibration period")

        y_label_text = ax.yaxis.get_label().get_text()
        assert_equal(y_label_text, "Average of Purchases in Holdout Period")

        plt.close()
示例#5
0
def test_calibration_and_holdout_data_gives_correct_date_boundaries():
    """Purchases dated after observation_period_end must not count towards the holdout frequency."""
    rows = [
        [1, "2015-01-01"],
        [1, "2015-02-06"],  # excluded from both holdout and calibration
        [2, "2015-01-01"],
        [3, "2015-01-01"],
        [3, "2015-01-02"],
        [3, "2015-01-05"],
        [4, "2015-01-16"],
        [4, "2015-02-02"],
        [4, "2015-02-05"],  # excluded from both holdout and calibration
        [5, "2015-01-16"],
        [5, "2015-01-17"],
        [5, "2015-01-18"],
        [6, "2015-02-02"],
    ]
    frame = pd.DataFrame(rows, columns=["id", "date"])

    result = utils.calibration_and_holdout_data(
        frame,
        "id",
        "date",
        calibration_period_end="2015-02-01",
        observation_period_end="2015-02-04",
    )

    holdout_counts = result["frequency_holdout"]
    assert holdout_counts.loc[1] == 0
    assert holdout_counts.loc[4] == 1
示例#6
0
def test_calibration_and_holdout_data_works_with_specific_frequency(
        large_transaction_level_data):
    """With freq="W" the result must equal the hard-coded expected table below."""
    observation_end = "2015-02-07"
    cal_end = "2015-02-01"

    result = utils.calibration_and_holdout_data(
        large_transaction_level_data,
        "id",
        "date",
        cal_end,
        observation_period_end=observation_end,
        freq="W",
    )

    column_names = [
        "id",
        "frequency_cal",
        "recency_cal",
        "T_cal",
        "frequency_holdout",
        "duration_holdout",
    ]
    # One row per customer id, in the column order given above.
    expected_rows = [
        [1, 0.0, 0.0, 4.0, 1, 1],
        [2, 0.0, 0.0, 4.0, 0, 1],
        [3, 1.0, 1.0, 4.0, 0, 1],
        [4, 0.0, 0.0, 2.0, 1, 1],
        [5, 0.0, 0.0, 2.0, 0, 1],
    ]
    expected_frame = pd.DataFrame(expected_rows, columns=column_names)
    expected_frame = expected_frame.set_index("id")

    assert_frame_equal(result, expected_frame, check_dtype=False)
示例#7
0
def test_calibration_and_holdout_data_gives_correct_date_boundaries():
    """Purchases dated after observation_period_end must not count towards the holdout frequency.

    Uses label-based ``.loc`` lookups; the ``.ix`` indexer used previously
    was removed in pandas 1.0 and raises AttributeError there.
    """
    d = [
        [1, '2015-01-01'],
        [1, '2015-02-06'],  # excluded from both holdout and calibration
        [2, '2015-01-01'],
        [3, '2015-01-01'],
        [3, '2015-01-02'],
        [3, '2015-01-05'],
        [4, '2015-01-16'],
        [4, '2015-02-02'],
        [4, '2015-02-05'],  # excluded from both holdout and calibration
        [5, '2015-01-16'],
        [5, '2015-01-17'],
        [5, '2015-01-18'],
        [6, '2015-02-02'],
    ]
    transactions = pd.DataFrame(d, columns=['id', 'date'])
    actual = utils.calibration_and_holdout_data(
        transactions,
        'id',
        'date',
        calibration_period_end='2015-02-01',
        observation_period_end='2015-02-04')
    # The lookups are by customer id label, so .loc is the right indexer.
    assert actual['frequency_holdout'].loc[1] == 0
    assert actual['frequency_holdout'].loc[4] == 1
示例#8
0
    def test_plot_calibration_purchases_vs_holdout_purchases_time_since_last_purchase(self):
        """Draw the calibration-vs-holdout plot keyed on time since last purchase.

        Loads its own transaction data, fits ``bgf`` (a name defined outside
        this method) and returns the current matplotlib figure.
        """
        raw_transactions = load_transaction_data()
        cal_holdout = utils.calibration_and_holdout_data(
            raw_transactions, "id", "date", "2014-09-01", "2014-12-31")
        bgf.fit(
            cal_holdout["frequency_cal"],
            cal_holdout["recency_cal"],
            cal_holdout["T_cal"],
        )

        # Open a fresh figure so the plot does not land on a previous test's axes.
        plt.figure()
        plotting.plot_calibration_purchases_vs_holdout_purchases(
            bgf, cal_holdout, kind="time_since_last_purchase")
        current_figure = plt.gcf()
        return current_figure
示例#9
0
    def test_plot_calibration_purchases_vs_holdout_purchases_time_since_last_purchase(self):
        """Draw the calibration-vs-holdout plot keyed on time since last purchase.

        Loads its own transaction data, fits ``bgf`` (a name defined outside
        this method) and returns the current matplotlib figure.
        """
        raw_transactions = load_transaction_data()
        cal_holdout = utils.calibration_and_holdout_data(
            raw_transactions, 'id', 'date', '2014-09-01', '2014-12-31')
        bgf.fit(
            cal_holdout['frequency_cal'],
            cal_holdout['recency_cal'],
            cal_holdout['T_cal'],
        )

        # Open a fresh figure so the plot does not land on a previous test's axes.
        plt.figure()
        plotting.plot_calibration_purchases_vs_holdout_purchases(
            bgf, cal_holdout, kind='time_since_last_purchase')
        current_figure = plt.gcf()
        return current_figure
示例#10
0
    def test_plot_calibration_purchases_vs_holdout_purchases(self):
        """Draw the default calibration-vs-holdout plot and return the current figure.

        Loads its own transaction data and fits ``bgf`` (a name defined
        outside this method) on the calibration columns first.
        """
        raw_transactions = load_transaction_data()
        cal_holdout = utils.calibration_and_holdout_data(
            raw_transactions, 'id', 'date', '2014-09-01', '2014-12-31')
        bgf.fit(
            cal_holdout['frequency_cal'],
            cal_holdout['recency_cal'],
            cal_holdout['T_cal'],
        )

        # Open a fresh figure so the plot does not land on a previous test's axes.
        plt.figure()
        plotting.plot_calibration_purchases_vs_holdout_purchases(bgf, cal_holdout)
        current_figure = plt.gcf()
        return current_figure
示例#11
0
def test_calibration_and_holdout_data(large_transaction_level_data):
    """Check holdout frequencies for ids 1 and 2 and that id 6 is absent from the result.

    Uses label-based ``.loc`` lookups; the ``.ix`` indexer used previously
    was removed in pandas 1.0 and raises AttributeError there.
    """
    today = '2015-02-07'
    calibration_end = '2015-02-01'
    actual = utils.calibration_and_holdout_data(large_transaction_level_data, 'id', 'date', calibration_end, observation_period_end=today)
    assert actual.loc[1]['frequency_holdout'] == 1
    assert actual.loc[2]['frequency_holdout'] == 0

    # .loc raises KeyError for a missing label, which is what this block expects.
    with pytest.raises(KeyError):
        actual.loc[6]
示例#12
0
def test_calibration_and_holdout_data(large_transaction_level_data):
    """Check holdout frequencies for ids 1 and 2 and that id 6 is absent from the result.

    Uses label-based ``.loc`` lookups; the ``.ix`` indexer used previously
    was removed in pandas 1.0 and raises AttributeError there.
    """
    today = '2015-02-07'
    calibration_end = '2015-02-01'
    actual = utils.calibration_and_holdout_data(large_transaction_level_data, 'id', 'date', calibration_end, observation_period_end=today)
    assert actual.loc[1]['frequency_holdout'] == 1
    assert actual.loc[2]['frequency_holdout'] == 0

    # .loc raises KeyError for a missing label, which is what this block expects.
    with pytest.raises(KeyError):
        actual.loc[6]
    def test_plot_calibration_purchases_vs_holdout_purchases(self):
        """Fit ``bgf`` on calibration data and display the calibration-vs-holdout plot.

        Makes no assertions; it ends with ``plt.show()``.
        """
        from matplotlib import pyplot as plt

        raw_transactions = load_transaction_data()
        cal_holdout = utils.calibration_and_holdout_data(
            raw_transactions, 'id', 'date', '2014-09-01', '2014-12-31')
        bgf.fit(
            cal_holdout['frequency_cal'],
            cal_holdout['recency_cal'],
            cal_holdout['T_cal'],
        )

        # Open a fresh figure so the plot does not land on existing axes.
        plt.figure()
        plotting.plot_calibration_purchases_vs_holdout_purchases(bgf, cal_holdout)
        plt.show()
示例#14
0
def test_calibration_and_holdout_data_with_monetary_value(large_transaction_level_data_with_monetary_value):
    """Passing monetary_value_col must yield the expected calibration and holdout monetary columns."""
    observation_end = '2015-02-07'
    cal_end = '2015-02-01'
    expected_cal_values = [0, 0, 3, 0, 4.5]
    expected_holdout_values = [2, 0, 0, 3, 0]

    result = utils.calibration_and_holdout_data(
        large_transaction_level_data_with_monetary_value,
        'id',
        'date',
        cal_end,
        observation_period_end=observation_end,
        monetary_value_col='monetary_value',
    )

    # Element-wise comparison against the expected values, in row order.
    cal_matches = result['monetary_value_cal'] == expected_cal_values
    assert cal_matches.all()
    holdout_matches = result['monetary_value_holdout'] == expected_holdout_values
    assert holdout_matches.all()
示例#15
0
def test_calibration_and_holdout_data_works_with_specific_frequency(large_transaction_level_data):
    """With freq='W' the result must equal the hard-coded expected table below."""
    observation_end = '2015-02-07'
    cal_end = '2015-02-01'

    result = utils.calibration_and_holdout_data(
        large_transaction_level_data,
        'id',
        'date',
        cal_end,
        observation_period_end=observation_end,
        freq='W',
    )

    column_names = [
        'id',
        'frequency_cal',
        'recency_cal',
        'T_cal',
        'frequency_holdout',
        'duration_holdout',
    ]
    # One row per customer id, in the column order given above.
    expected_rows = [
        [1, 0., 0., 4., 1, 1],
        [2, 0., 0., 4., 0, 1],
        [3, 1., 1., 4., 0, 1],
        [4, 0., 0., 2., 1, 1],
        [5, 0., 0., 2., 0, 1],
    ]
    expected_frame = pd.DataFrame(expected_rows, columns=column_names)
    expected_frame = expected_frame.set_index('id')

    assert_frame_equal(result, expected_frame, check_dtype=False)
示例#16
0
def test_calibration_and_holdout_data_with_monetary_value(large_transaction_level_data_with_monetary_value):
    """Passing monetary_value_col must yield the expected calibration and holdout monetary columns."""
    observation_end = '2015-02-07'
    cal_end = '2015-02-01'
    expected_cal_values = [0, 0, 3, 0, 4.5]
    expected_holdout_values = [2, 0, 0, 3, 0]

    result = utils.calibration_and_holdout_data(
        large_transaction_level_data_with_monetary_value,
        'id',
        'date',
        cal_end,
        observation_period_end=observation_end,
        monetary_value_col='monetary_value',
    )

    # Element-wise comparison against the expected values, in row order.
    cal_matches = result['monetary_value_cal'] == expected_cal_values
    assert cal_matches.all()
    holdout_matches = result['monetary_value_holdout'] == expected_holdout_values
    assert holdout_matches.all()
示例#17
0
def test_calibration_and_holdout_data_works_with_specific_frequency(large_transaction_level_data):
    """With freq='W' the result must equal the hard-coded expected table below."""
    observation_end = '2015-02-07'
    cal_end = '2015-02-01'

    result = utils.calibration_and_holdout_data(
        large_transaction_level_data,
        'id',
        'date',
        cal_end,
        observation_period_end=observation_end,
        freq='W',
    )

    column_names = [
        'id',
        'frequency_cal',
        'recency_cal',
        'T_cal',
        'frequency_holdout',
        'duration_holdout',
    ]
    # One row per customer id, in the column order given above.
    expected_rows = [
        [1, 0., 0., 4., 1, 1],
        [2, 0., 0., 4., 0, 1],
        [3, 1., 1., 4., 0, 1],
        [4, 0., 0., 2., 1, 1],
        [5, 0., 0., 2., 0, 1],
    ]
    expected_frame = pd.DataFrame(expected_rows, columns=column_names)
    expected_frame = expected_frame.set_index('id')

    assert_frame_equal(result, expected_frame, check_dtype=False)
示例#18
0
    def test_plot_calibration_purchases_vs_holdout_purchases(self):
        """Fit ``bgf`` on calibration data and display the calibration-vs-holdout plot.

        Makes no assertions; it ends with ``plt.show()``.
        """
        from matplotlib import pyplot as plt

        raw_transactions = load_transaction_data()
        cal_holdout = utils.calibration_and_holdout_data(
            raw_transactions, 'id', 'date', '2014-09-01', '2014-12-31')
        bgf.fit(
            cal_holdout['frequency_cal'],
            cal_holdout['recency_cal'],
            cal_holdout['T_cal'],
        )

        # Open a fresh figure so the plot does not land on existing axes.
        plt.figure()
        plotting.plot_calibration_purchases_vs_holdout_purchases(bgf, cal_holdout)
        plt.show()
示例#19
0
def test_calibration_and_holdout_data_is_okay_with_other_indexes(
        large_transaction_level_data):
    """A random integer index on the input frame must not affect the holdout frequencies."""
    row_count = large_transaction_level_data.shape[0]
    # Overwrite the fixture's index with random integers in [0, row_count).
    large_transaction_level_data.index = np.random.randint(
        0, row_count, size=row_count)

    observation_end = "2015-02-07"
    cal_end = "2015-02-01"
    result = utils.calibration_and_holdout_data(
        large_transaction_level_data,
        "id",
        "date",
        cal_end,
        observation_period_end=observation_end,
    )

    assert result.loc[1]["frequency_holdout"] == 1
    assert result.loc[2]["frequency_holdout"] == 0
示例#20
0
def test_calibration_and_holdout_data(large_transaction_level_data):
    """Check holdout frequencies for ids 1 and 2 and that id 6 is absent from the result."""
    observation_end = "2015-02-07"
    cal_end = "2015-02-01"

    result = utils.calibration_and_holdout_data(
        large_transaction_level_data,
        "id",
        "date",
        cal_end,
        observation_period_end=observation_end,
    )

    assert result.loc[1]["frequency_holdout"] == 1
    assert result.loc[2]["frequency_holdout"] == 0

    # Looking up id 6 must fail: it is expected to be missing from the index.
    with pytest.raises(KeyError):
        result.loc[6]
    def calibrate_bgf(self, calib_end_date, period_end_date, viz=False):
        '''
        Fit the BGF model on a calibration window and save a goodness-of-fit plot.

        Splits ``self.transaction_data`` into calibration and holdout periods,
        refits ``self.bgf`` on the calibration columns and writes the
        calibration-vs-holdout plot to
        ``calibration_purchases_vs_holdout_purchases.png`` in the current
        working directory. The plot is saved regardless of ``viz``.

        Parameters
        ----------
        calib_end_date : end of the calibration period, passed through as
            ``calibration_period_end``.
        period_end_date : end of the observation period, passed through as
            ``observation_period_end``.
        viz : when truthy, print the head of the calibration/holdout summary.

        Returns
        -------
        None
        '''
        summary_cal_holdout = calibration_and_holdout_data(
            self.transaction_data, 'CustomerNo', 'OrderDate',
            # Original author's note: use 75% of data for training. The real
            # share depends on the dates the caller passes.
            calibration_period_end=calib_end_date,
            observation_period_end=period_end_date)
        # Plain truthiness test instead of comparing against True (PEP 8).
        if viz:
            print(summary_cal_holdout.head())

        self.bgf.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
        plot_calibration_purchases_vs_holdout_purchases(self.bgf, summary_cal_holdout, colormap='coolwarm', alpha=0.75)
        plt.savefig('calibration_purchases_vs_holdout_purchases.png')
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close()
示例#22
0
def test_calibration_and_holdout_data_with_monetary_value(
        large_transaction_level_data_with_monetary_value):
    """Passing monetary_value_col must yield the expected calibration and holdout monetary columns."""
    observation_end = "2015-02-07"
    cal_end = "2015-02-01"
    expected_cal_values = [0, 0, 3, 0, 4.5]
    expected_holdout_values = [2, 0, 0, 3, 0]

    result = utils.calibration_and_holdout_data(
        large_transaction_level_data_with_monetary_value,
        "id",
        "date",
        cal_end,
        observation_period_end=observation_end,
        monetary_value_col="monetary_value",
    )

    # Element-wise comparison against the expected values, in row order.
    cal_matches = result["monetary_value_cal"] == expected_cal_values
    assert cal_matches.all()
    holdout_matches = result["monetary_value_holdout"] == expected_holdout_values
    assert holdout_matches.all()
    def transactions_rfm(self, config, transactions):
        """
        Build the calibration/holdout summary and attach mean spend per period.

        Reads ``calibration_period_end`` and ``observation_period_end`` from
        ``config['parameters']`` and stores them on ``self``. Converts
        ``transactions['trans_date']`` to datetime in place, so the caller's
        frame is modified. Mean ``tran_amount`` per ``customer_id`` is
        computed for the calibration window (dates <= calibration end) and
        the holdout window (calibration end < dates <= observation end),
        then left-merged onto the summary as ``monetary_cal`` and
        ``monetary_holdout``. Missing values are filled with 0.

        Also sets ``self.tr_subset``, ``self.tr_holdout`` and
        ``self.transactions_holdout``; the latter is returned.
        """
        period_params = config['parameters']
        self.calibration_period_end = period_params['calibration_period_end']
        self.observation_period_end = period_params['observation_period_end']

        transactions['trans_date'] = pd.to_datetime(transactions['trans_date'])

        self.transactions_holdout = calibration_and_holdout_data(
            transactions,
            'customer_id',
            'trans_date',
            calibration_period_end=self.calibration_period_end,
            observation_period_end=self.observation_period_end)

        # Mean transaction amount per customer inside the calibration window.
        in_calibration = (
            transactions['trans_date'] <= self.calibration_period_end)
        calibration_means = transactions.loc[in_calibration].groupby(
            'customer_id')['tran_amount'].mean()
        self.tr_subset = calibration_means.reset_index()

        # Mean transaction amount per customer inside the holdout window.
        after_calibration = (
            transactions['trans_date'] > self.calibration_period_end)
        within_observation = (
            transactions['trans_date'] <= self.observation_period_end)
        holdout_means = transactions.loc[
            after_calibration & within_observation].groupby(
                'customer_id')['tran_amount'].mean()
        self.tr_holdout = holdout_means.reset_index()

        self.tr_subset = self.tr_subset.rename(
            columns={'tran_amount': 'monetary_cal'})
        self.tr_holdout = self.tr_holdout.rename(
            columns={'tran_amount': 'monetary_holdout'})

        # Left merges keep every customer present in the summary.
        for period_means in (self.tr_subset, self.tr_holdout):
            self.transactions_holdout = pd.merge(self.transactions_holdout,
                                                 period_means,
                                                 on='customer_id',
                                                 how='left')

        self.transactions_holdout = self.transactions_holdout.fillna(0)
        return self.transactions_holdout
示例#24
0
def test_calibration_and_holdout_data_gives_correct_date_boundaries():
    """Purchases dated after observation_period_end must not count towards the holdout frequency.

    Uses label-based ``.loc`` lookups; the ``.ix`` indexer used previously
    was removed in pandas 1.0 and raises AttributeError there.
    """
    d = [
        [1, '2015-01-01'],
        [1, '2015-02-06'],  # excluded from both holdout and calibration
        [2, '2015-01-01'],
        [3, '2015-01-01'],
        [3, '2015-01-02'],
        [3, '2015-01-05'],
        [4, '2015-01-16'],
        [4, '2015-02-02'],
        [4, '2015-02-05'],  # excluded from both holdout and calibration
        [5, '2015-01-16'],
        [5, '2015-01-17'],
        [5, '2015-01-18'],
        [6, '2015-02-02'],
    ]
    transactions = pd.DataFrame(d, columns=['id', 'date'])
    actual = utils.calibration_and_holdout_data(transactions, 'id', 'date', calibration_period_end='2015-02-01', observation_period_end='2015-02-04')
    # The lookups are by customer id label, so .loc is the right indexer.
    assert actual['frequency_holdout'].loc[1] == 0
    assert actual['frequency_holdout'].loc[4] == 1
def reformat(data):
    """Build the full summary and the calibration/holdout summary from raw transactions.

    ``data['Timestamp']`` is parsed with format ``%Y/%m/%d %H:%M`` and
    replaced in place by its date part, so the caller's frame is modified.
    Both summaries use daily frequency and an observation period ending
    2017-12-06; the calibration period ends 2017-07-12.

    Returns a ``(summary, summary_cal_holdout)`` tuple, each with its index
    reset into columns.
    """
    parsed_timestamps = pd.to_datetime(data['Timestamp'],
                                       format='%Y/%m/%d %H:%M')
    data['Timestamp'] = parsed_timestamps.dt.date

    # Keyword arguments shared by both lifetimes helpers.
    shared_kwargs = {
        'transactions': data,
        'customer_id_col': 'CustomerID',
        'datetime_col': 'Timestamp',
        'monetary_value_col': 'PurchaseValue',
        'observation_period_end': '2017-12-06',
        'freq': 'D',
    }

    full_summary = summary_data_from_transaction_data(**shared_kwargs)
    summary = full_summary.reset_index()

    split_summary = calibration_and_holdout_data(
        calibration_period_end='2017-07-12', **shared_kwargs)
    summary_cal_holdout = split_summary.reset_index()

    return summary, summary_cal_holdout
示例#26
0
def _summary_calibration_and_holdout(transactions):
    """
    Create calibration/holdout summary data from transactions and save it as CSV.

    Parameters
    ----------
        transactions: transaction data with customer_unique_id and order_purchase_timestamp.

    Returns
    -------
        None. Writes datasets/summary_cal_holdout.csv under the current
        working directory. The file is written with index=False, so the
        frame's index is not saved.
    """
    # Calibration runs to 2017-12-31 and observation to 2018-08-31, daily frequency.
    split_options = {
        "customer_id_col": "customer_unique_id",
        "datetime_col": "order_purchase_timestamp",
        "calibration_period_end": "2017-12-31",
        "observation_period_end": "2018-08-31",
        "freq": "D",
        "freq_multiplier": 1,
    }
    summary_cal_holdout = calibration_and_holdout_data(
        transactions=transactions, **split_options)

    # Saving file.
    output_path = Path.cwd() / "datasets/summary_cal_holdout.csv"
    summary_cal_holdout.to_csv(output_path, index=False)
示例#27
0
    'predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
        t, data['frequency'], data['recency'], data['T'])
# Rows with the five largest predicted_purchases values (ascending sort, last five).
data.sort_values(by='predicted_purchases').tail(5)

# In[58]:

from lifetimes.plotting import plot_period_transactions
plot_period_transactions(bgf)

# In[45]:

from lifetimes.utils import calibration_and_holdout_data

# Split the transactions in `df`: calibration ends 2016-05-01 and the
# observation period ends 2016-06-27.
summary_cal_holdout = calibration_and_holdout_data(
    df,
    'driver_id',
    'timestamp',
    calibration_period_end='2016-05-01',
    observation_period_end='2016-06-27')
print(summary_cal_holdout.head())

# In[46]:

from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases

# NOTE: this refits the existing `bgf` object on the calibration columns only;
# any later use of `bgf` sees this fit.
bgf.fit(summary_cal_holdout['frequency_cal'],
        summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
plot_calibration_purchases_vs_holdout_purchases(bgf, summary_cal_holdout)

# In this plot, we separate the data into an in-sample (calibration) period and a validation (holdout) period.
# The calibration period runs from the beginning of the data to 2016-05-01; the validation period spans from
# 2016-05-01 to 2016-06-27. The plot groups all customers in the calibration period by their number of repeat purchases
示例#28
0
# plot = plot_frequency_recency_matrix(bgf)

# prob = plot_probability_alive_matrix(bgf)

# Predict purchases over the next t periods for every row of `conv`.
t = 1
conv[
    'predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
        t, conv['frequency'], conv['recency'], conv['T'])
# Rows with the five largest predicted_purchases values (result is not stored).
conv.sort_values(by='predicted_purchases').tail(5)

plot_period_transactions(bgf)

# Split the transactions in `df`: calibration ends 2010-01-01 and the
# observation period ends 2017-01-01.
summary_cal_holdout = calibration_and_holdout_data(
    df,
    'id',
    'date',
    calibration_period_end='2010-01-01',
    observation_period_end='2017-01-01')

print(summary_cal_holdout.head())

# NOTE: this refits the existing `bgf` object on the calibration columns only;
# any later use of `bgf` sees this fit.
bgf.fit(summary_cal_holdout['frequency_cal'],
        summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
plot_calibration_purchases_vs_holdout_purchases(bgf, summary_cal_holdout)

from lifetimes.plotting import plot_history_alive

# NOTE(review): `id` shadows the Python builtin of the same name; renaming it
# needs a check of every later use in this script.
id = 35
days_since_birth = 365
# All transactions belonging to the selected customer.
sp_trans = df.loc[df['id'] == id]
# sp_trans
示例#29
0
"""
BG/NBD is an attractive alternative to the Pareto/NBD, which costs less computation and yields similar results.
"""
# First fit: uses the full summary in `data`.
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(data['frequency'], data['recency'], data['T'])
print(bgf)
# For small sample sizes, the parameters can get implausibly large, so by adding an l2 penalty to the likelihood,
# we can control how large these parameters can be. This is implemented by setting a positive penalizer_coef in the
# initialization of the model. In typical applications, penalizers on the order of 0.001 to 0.1 are effective.

# Model fit
plot_period_transactions(bgf)  # Calibration

# Split the transactions in `df`: calibration ends 2011-06-08 and the
# observation period ends 2011-12-9.
summary_cal_holdout = calibration_and_holdout_data(
    df,
    customer_id,
    date_col,
    calibration_period_end='2011-06-08',
    observation_period_end='2011-12-9')
# Create the test data set
print(summary_cal_holdout.head())
# NOTE: this refits the same `bgf` object on the calibration columns only, so
# the matrix plots below use the calibration fit, not the first fit above.
bgf.fit(summary_cal_holdout['frequency_cal'],
        summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
plot_calibration_purchases_vs_holdout_purchases(bgf, summary_cal_holdout)
# Visualization

plot_frequency_recency_matrix(bgf)
plot_probability_alive_matrix(bgf)
plt.show()

### Gamma-Gamma model###
# Keep only customers with at least one repeat purchase (frequency > 0).
returning_customers_summary = data[data['frequency'] > 0]
示例#30
0
plot_probability_alive_matrix(bgf)

# Predict purchases over the next t periods for every row of `data`.
t = 1  # values from 0.1 to 1
data[
    'predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
        t, data['frequency'], data['recency'], data['T'])
# Rows with the ten largest predicted_purchases values (result is not stored).
data.sort_values(by='predicted_purchases').tail(10)

from lifetimes.plotting import plot_period_transactions
plot_period_transactions(bgf)

from lifetimes.utils import calibration_and_holdout_data

# Split the transactions in `df`: calibration ends 2009-12-15 and the
# observation period ends 2010-01-09.
calibration_data = calibration_and_holdout_data(
    df,
    'UserId',
    'OrderDate',
    calibration_period_end='2009-12-15',
    observation_period_end='2010-01-09')
calibration_data.head()

from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases

# NOTE: this refits the existing `bgf` object on the calibration columns only;
# any later use of `bgf` sees this fit.
bgf.fit(calibration_data['frequency_cal'], calibration_data['recency_cal'],
        calibration_data['T_cal'])
plot_calibration_purchases_vs_holdout_purchases(bgf, calibration_data)

# Keep only customers with at least one repeat purchase (frequency > 0).
with_frequency = data[data['frequency'] > 0]
with_frequency.head()

# Correlation between monetary value and frequency (result is not stored).
with_frequency[['monetary_value', 'frequency']].corr()
# heat map for FR and live customers
from lifetimes.plotting import plot_frequency_recency_matrix

plot_frequency_recency_matrix(bgf)

from lifetimes.plotting import plot_probability_alive_matrix

plot_probability_alive_matrix(bgf)

#validation of model

from lifetimes.utils import calibration_and_holdout_data

# Split the transactions in `tran_pa_all`: calibration ends 2017-09-01 and the
# observation period ends 2017-12-31.
summary_cal_holdout = calibration_and_holdout_data(
    tran_pa_all,
    'customer_id',
    'bill_date_new',
    calibration_period_end='2017-09-01',
    observation_period_end='2017-12-31')
print(summary_cal_holdout.head())

from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases

# NOTE: this refits the existing `bgf` object on the calibration columns only;
# any later use of `bgf` sees this fit.
bgf.fit(summary_cal_holdout['frequency_cal'],
        summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
plot_calibration_purchases_vs_holdout_purchases(bgf, summary_cal_holdout)

#t = 10 #predict purchases in 10 periods
#individual = summary.iloc[20]
# The below function may be renamed to `predict` in a future version of lifetimes
#bgf.conditional_expected_number_of_purchases_up_to_time(t, individual['frequency'], individual['recency'], individual['T'])
示例#32
0
# Predict purchases over the next t periods for every row of `summary`.
t = 12
summary[
    'predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
        t, summary['frequency'], summary['recency'], summary['T'])
# Rows with the ten largest predicted_purchases values (result is not stored).
summary.sort_values(by='predicted_purchases').tail(10)

from lifetimes.plotting import plot_period_transactions
# Default figure size for the plots that follow.
plt.rcParams['figure.figsize'] = [12, 3]

plot_period_transactions(bgf)

from lifetimes.utils import calibration_and_holdout_data

# Split the transactions in `df`: calibration ends 2019-03-31 and the
# observation period ends 2019-09-12.
summary_cal_holdout = calibration_and_holdout_data(
    df,
    'customerId',
    'date',
    calibration_period_end='2019-03-31',
    observation_period_end='2019-09-12')

# Earliest and latest transaction dates in `df` (results are not stored).
df.date.min()

df.date.max()

summary_cal_holdout.head()

from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases

# NOTE: this refits the existing `bgf` object on the calibration columns only;
# any later use of `bgf` sees this fit.
bgf.fit(summary_cal_holdout['frequency_cal'],
        summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
plot_calibration_purchases_vs_holdout_purchases(bgf,
                                                summary_cal_holdout,
示例#33
0
def get_clv(oracle_conn_id, src_client_id, storage_bucket, ds, **context):
    """Fit BG/NBD and Gamma-Gamma models for one client and upload the results.

    Steps:
      1. Run the SQL in ``context['templates_dict']['query']`` against the
         Oracle connection and lower-case the column names.
      2. Build calibration/holdout data (calibration ends 2018-05-24, daily
         frequency) and fit a BetaGeoFitter on the calibration columns.
      3. Save four diagnostic charts as SVG files in the working directory.
      4. Build the full summary, fit a GammaGammaFitter on returning
         customers and compute customer lifetime value over 30 months.
      5. Write the summary plus CLV to CSV and upload the CSV and all four
         charts to ``storage_bucket`` under
         ``<src_client_id>/<ds_nodash>/<file name>``.

    Parameters
    ----------
    oracle_conn_id : connection id handed to ``OracleHook``.
    src_client_id : client identifier; used in file names and object paths.
    storage_bucket : destination bucket name.
    ds : not used in the body (presumably supplied by the scheduler --
        confirm before removing).
    **context : must provide ``templates_dict['query']`` and ``ds_nodash``.

    Returns
    -------
    None
    """
    import matplotlib.pyplot
    # Turn interactive mode off so plotting does not try to display figures.
    matplotlib.pyplot.ioff()
    ##
    from lifetimes.utils import calibration_and_holdout_data
    from lifetimes.plotting import plot_frequency_recency_matrix
    from lifetimes.plotting import plot_probability_alive_matrix
    from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
    from lifetimes.plotting import plot_period_transactions
    from lifetimes.plotting import plot_history_alive
    from lifetimes.plotting import plot_cumulative_transactions
    from lifetimes.utils import expected_cumulative_transactions
    from lifetimes.utils import summary_data_from_transaction_data
    from lifetimes import BetaGeoFitter
    from lifetimes import GammaGammaFitter
    import datetime
    import pandas as pd
    import datalab.storage as gcs
    conn = OracleHook(oracle_conn_id=oracle_conn_id).get_conn()
    print(src_client_id, context)
    query = context['templates_dict']['query']
    data = pd.read_sql(query, con=conn)
    data.columns = data.columns.str.lower()
    print(data.head())

    # Calculate RFM values#
    calibration_end_date = datetime.datetime(2018, 5, 24)
    training_rfm = calibration_and_holdout_data(
        transactions=data,
        customer_id_col='src_user_id',
        datetime_col='pickup_date',
        calibration_period_end=calibration_end_date,
        freq='D',
        monetary_value_col='price_total')
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(training_rfm['frequency_cal'], training_rfm['recency_cal'],
            training_rfm['T_cal'])
    print(bgf)

    # Matrix charts
    plot_period_transactions_chart = context.get("ds_nodash") + str(
        src_client_id) + '_plot_period_transactions_chart.svg'
    plot_frequency_recency_chart = context.get("ds_nodash") + str(
        src_client_id) + '_plot_frequency_recency_matrix.svg'
    plot_probability_chart = context.get("ds_nodash") + str(
        src_client_id) + '_plot_probability_alive_matrix.svg'
    plot_calibration_vs_holdout_chart = context.get("ds_nodash") + str(
        src_client_id) + '_plot_calibration_vs_holdout_purchases.svg'

    ax0 = plot_period_transactions(bgf, max_frequency=30)
    ax0.figure.savefig(plot_period_transactions_chart, format='svg')
    ax1 = plot_frequency_recency_matrix(bgf)
    ax1.figure.savefig(plot_frequency_recency_chart, format='svg')
    ax2 = plot_probability_alive_matrix(bgf)
    ax2.figure.savefig(plot_probability_chart, format='svg')
    ax3 = plot_calibration_purchases_vs_holdout_purchases(bgf,
                                                          training_rfm,
                                                          n=50)
    ax3.figure.savefig(plot_calibration_vs_holdout_chart, format='svg')
    full_rfm = summary_data_from_transaction_data(
        data,
        customer_id_col='src_user_id',
        datetime_col='pickup_date',
        monetary_value_col='price_total',
        datetime_format=None,
        observation_period_end=None,
        freq='D')
    # The Gamma-Gamma model is fitted on returning customers only.
    returning_full_rfm = full_rfm[full_rfm['frequency'] > 0]
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(returning_full_rfm['frequency'],
            returning_full_rfm['monetary_value'])

    customer_lifetime = 30  # expected number of months lifetime of a customer
    clv = ggf.customer_lifetime_value(
        bgf,  #the model to use to predict the number of future transactions
        full_rfm['frequency'],
        full_rfm['recency'],
        full_rfm['T'],
        full_rfm['monetary_value'],
        time=customer_lifetime,  # months
        discount_rate=0.01  # monthly discount rate ~ 12.7% annually
    ).sort_values(ascending=False)
    full_rfm_with_value = full_rfm.join(clv)

    full_rfm_file = context.get("ds_nodash") + "-src_client_id-" + str(
        src_client_id) + '-icabbi-test.csv'
    full_rfm_with_value.to_csv(full_rfm_file)

    # Upload the CSV and each chart from its OWN local file. Previously every
    # chart upload passed filename=full_rfm_file, so the CSV's bytes were
    # stored under each chart's object name.
    object_prefix = str(src_client_id) + "/" + context.get("ds_nodash") + "/"
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id='google_conn_default')
    for local_file in (
            full_rfm_file,
            plot_period_transactions_chart,
            plot_frequency_recency_chart,
            plot_probability_chart,
            plot_calibration_vs_holdout_chart,
    ):
        gcs_hook.upload(bucket=storage_bucket,
                        object=object_prefix + local_file,
                        filename=local_file)