Exemplo n.º 1
0
def preprocessing(months, user_ids=None, featurizer=None, features=None, prefilter=True, dfs=None):
    '''
    Takes a list of user_ids, creates the relevant window from their first deposit date
    and featurizes within it.

    Args:
        months: Number of months ahead the frame looks from the first deposit date
        user_ids: List of integer user_ids to use in this sample; when omitted or
            empty, every value in demo_df's index is used
        featurizer: Optional featurizer object
        features: Optional list of features to use, if none it'll use every feature in the featurizer
        prefilter: Whether to apply prefilters such as activity threshold and rg-frame filtering
        dfs: Optional (demo_df, rg_df, gam_df) sequence holding the information
            associated with the users; when omitted the three frames are loaded
            through get_demo_df / get_rg_df / get_gam_df
    Returns:
        X: The featurized rows, as produced by featurize
        y: Labels associated with each row of X
        user_ids: The user_ids associated with each row of X (post-prefilter
            when prefilter is enabled)
    Raises:
        ValueError: If neither featurizer nor features is supplied
    '''
    if not featurizer and not features:
        # Carry the explanation on the exception itself so callers that catch
        # or log it see why the call was rejected.
        raise ValueError("Need at least one way to get featurizing context!")
    if not dfs:
        demo_df, rg_df, gam_df = get_demo_df(), get_rg_df(), get_gam_df()
    else:
        demo_df, rg_df, gam_df = dfs
    # Use len() rather than truthiness: array-like ids (ndarray, pandas Index)
    # raise on `not user_ids`, while lists behave exactly as before.
    if user_ids is None or len(user_ids) == 0:
        user_ids = list(demo_df.index)

    # The prefilters take the window length in days; a month is approximated as 30 days.
    days = months * 30
    if prefilter:
        print("Applying prefilters")
        user_ids = prefilters(user_ids, days, demo_df, rg_df)
    print(f"Constructing model with {months} months of information")
    print(f"Features being used: {features}")
    X, y = featurize(user_ids, gam_df, demo_df, featurizer=featurizer, features=features, month_window=months)
    return X, y, user_ids
Exemplo n.º 2
0
        rgs: The label associated with each row
    '''
    print("Starting frame making")
    if not featurizer:
        featurizer = make_default_featurizer()
    frames = [
        make_frame(user_id, gam_df, demo_df, month_window)
        for user_id in user_ids
    ]
    rgs = [demo_df.loc[user_id, 'rg'] == 1 for user_id in user_ids]
    return featurizer.vectorize(frames, features), rgs


def make_frame(user_id, gam_df, demo_df, month_window):
    '''
    Builds the frame for a single user.

    Selects the rows of gam_df whose 'user_id' equals user_id, looks up the
    user's 'first_deposit_date' in demo_df, and hands both to sparse_to_ts
    with a window of 30 * month_window.
    '''
    rows_for_user = gam_df[gam_df['user_id'] == user_id]
    start_date = demo_df.loc[user_id, 'first_deposit_date']
    return sparse_to_ts(
        rows_for_user,
        date_start=start_date,
        window=30 * month_window,
    )


if __name__ == '__main__':
    # Load the demographic, gambling and rg frames through the project getters,
    # then print how many entries the demographic frame's index holds.
    demo_df, gam_df, rg_df = get_demo_df(), get_gam_df(), get_rg_df()
    user_ids = [*demo_df.index]
    print(len(user_ids))
Exemplo n.º 3
0
        return vect

    def add_feature(self, prod_function, feat_name=None, args=None):
        '''
        Registers a feature-producing function in self.features.

        Args:
            prod_function: Callable invoked as prod_function(x, **args)
            feat_name: Optional key to register under; falls back to
                prod_function.__name__ when not given
            args: Optional dict of extra keyword arguments forwarded on every call
        '''
        if not feat_name:
            feat_name = prod_function.__name__
        # Snapshot the keyword arguments: this avoids a shared mutable default
        # and keeps the registered feature stable if the caller later mutates
        # the dict it passed in.
        bound_kwargs = dict(args) if args else {}
        self.features[feat_name] = lambda x: prod_function(x, **bound_kwargs)

    def delete_feature(self, feat_name):
        '''Drops the entry stored under feat_name from self.features; a missing name raises the mapping's lookup error (KeyError for a dict).'''
        self.features.pop(feat_name)

    def get_feature_names(self):
        '''Returns a new list with the keys of self.features, in the mapping's iteration order.'''
        registered_names = list(self.features)
        return registered_names


if __name__ == "__main__":
    # Manual check: load the project frames, register a few hold features on a
    # Featurizer, and build the frame for one hard-coded user.
    demo_df = pipeline.get_demo_df()
    gam_df = pipeline.get_gam_df()
    rg_info = pipeline.get_rg_df()
    user_id = 3327778

    featurizer = Featurizer()
    featurizer.add_feature(total_hold)
    featurizer.add_feature(max_hold)
    featurizer.add_feature(weekly_hold)
    featurizer.add_feature(weekly_rolling_hold)

    mask = (gam_df['user_id'] == user_id)
    # Index the same frame the mask was built from; the previous name
    # `daily_gam_df` is not defined in this script and raised NameError.
    user_daily = gam_df[mask]
    first_deposit = demo_df.loc[user_id, 'first_deposit_date']
    user_frame = sparse_to_ts(user_daily, date_start=first_deposit, window=180)
    # Only a subset of the registered features is selected here
    # (weekly_rolling_hold is registered above but not listed).
    features_to_use = ["total_hold", "max_hold", "weekly_hold"]
Exemplo n.º 4
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
from plot_helper import *
from sklearn.metrics import roc_curve
import pipeline
from pipeline_constants import *

# Matplotlib configuration applied once, when this module is imported.
rcParams.update({'figure.autolayout': True})
plt.style.use('ggplot')

# Module-level frames loaded at import time through the pipeline getters.
# background_plot reads demo_df from here instead of taking it as a parameter.
demo_df = pipeline.get_demo_df()  # Global vars weee
gam_df = pipeline.get_gam_df()
rg_info = pipeline.get_rg_df()


def background_plot(ax, user_id, gam_df, window=30 * 6):
    '''
    Plots the introductory "Wow people lose a lot on this" graph.

    Args:
        ax: Object whose title, axis labels and tick sizes are set here
            (presumably a matplotlib Axes -- confirm against callers)
        user_id: Value matched against gam_df['user_id'] and used as the
            row label when looking up demo_df
        gam_df: Frame with a 'user_id' column; the matching rows are passed
            to pipeline.sparse_to_ts
        window: Forwarded to pipeline.sparse_to_ts as `window` (default 30 * 6)

    Note: demo_df is read from module scope, not from the arguments.
    '''
    mask = (gam_df['user_id'] == user_id)
    user_daily = gam_df[mask]
    first_deposit = demo_df.loc[user_id, 'first_deposit_date']
    user_frame = pipeline.sparse_to_ts(user_daily,
                                       date_start=first_deposit,
                                       window=window)
    # Running total of the 'hold' column, stored as a new column.
    user_frame['cumul_hold'] = user_frame['hold'].cumsum()
    # NOTE(review): nothing visible here draws cumul_hold onto ax -- confirm
    # whether the plotting call follows these formatting lines.
    ax.set_title(f'User #{user_id}', fontsize=28)
    ax.set_xlabel("Date", fontsize=28)
    ax.set_ylabel("Loss (Euros)", fontsize=28)
    ax.tick_params(axis="y", labelsize=20)
    ax.tick_params(axis="x", labelsize=20)
from pipeline import get_demo_df, get_gam_df, get_rg_df
from processing.features import SUMMARY_NAMES, DAILY_NAMES, WEEKLY_NAMES
from processing.preprocessing import preprocessing
from model import predict

if __name__ == '__main__':
    # Holdout evaluation script: loads the holdout CSVs, featurizes them with
    # preprocessing, and passes the result to predict.

    # Imported here because no import of `sleep` is visible among this
    # script's imports; presumably time.sleep is what was intended.
    from time import sleep

    print("Running on holdout!")
    # Pause before continuing (presumably a window to abort an accidental
    # holdout run -- confirm).
    sleep(10)
    HOLD_DEMO_PATH = 'data/holdout/demographic.csv'
    HOLD_RG_PATH = 'data/holdout/rg_information.csv'
    HOLD_GAM_PATH = 'data/holdout/gambling.csv'
    hold_demo = get_demo_df(HOLD_DEMO_PATH)
    hold_rg = get_rg_df(HOLD_RG_PATH)
    hold_gam = get_gam_df(HOLD_GAM_PATH)
    # Order matters: preprocessing unpacks dfs as (demo_df, rg_df, gam_df).
    dfs = [hold_demo, hold_rg, hold_gam]
    # TODO: `model`, `features` and `months` are not assigned anywhere visible
    # in this script; they must be defined before the two calls below can run.
    X, y, user_ids = preprocessing(months=months, features=features, dfs=dfs)
    # Fixed: the comma between store_name and store was missing (SyntaxError).
    predict(model, X, y, user_ids, store_name="holdout", store=True)