def decision_tree(load_model=False):
    start = time.time()
    if not load_model:
        printYellow("*  Decision tree model training started...")

    # Create a training set of 100,000 samples
    X_dict_train, y_train = process_data(100000)

    # Transform training dictionary into one-hot encoded vectors
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)

    # Create the test set and transform it into one-hot encoded vectors
    X_dict_test, y_test = process_data(100000, 100000)
    X_test = dict_one_hot_encoder.transform(X_dict_test)
    
    # Load model from disk instead of retraining
    if load_model:
        printGreen('✔  Loading model from previous training...')
        with open('../models/decision_tree_model.sav', 'rb') as d_tree_file:
            decision_tree_final = pickle.load(d_tree_file)

        # Evaluate model on test set
        prob = decision_tree_final.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, prob)
        printGreen('✔  ROC AUC score on test set: {0:.3f}'.format(score))
        return 0

    # Train decision tree classifier
    params = {'max_depth': [3, 10, None]}
    decision_tree_model = DecisionTreeClassifier(criterion='gini',
                                                 min_samples_split=30)
    grid_search = GridSearchCV(decision_tree_model, params, n_jobs=-1, cv=3, scoring='roc_auc')
    # print("Training started..")
    grid_search.fit(X_train, y_train)
    printGreen('✔  Decision tree model training complete..."\t\t{0:.1f}s'.format(time.time() - start))

    # Use the model with the best parameters as the final model
    decision_tree_final = grid_search.best_estimator_

    # Evaluate the final model on the test set
    prob = decision_tree_final.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, prob)
    printGreen('✔  ROC AUC score on test set: {0:.3f}'.format(score))

    # Save model to disk
    with open('../models/decision_tree_model.sav', 'wb') as decision_tree_model_file:
        pickle.dump(decision_tree_final, decision_tree_model_file)
    printGreen('✔  Decision tree model saved...')

    return 0
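
# A minimal sketch (toy feature dicts, not the CTR data) of the DictVectorizer
# one-hot encoding used throughout this file: categorical string values become
# indicator columns, while numeric values pass through unchanged.
def _demo_dict_vectorizer():
    from sklearn.feature_extraction import DictVectorizer

    toy = [{'site': 'a', 'hour': 9}, {'site': 'b', 'hour': 17}]
    enc = DictVectorizer(sparse=False)
    print(enc.fit_transform(toy))  # [[ 9.  1.  0.] [17.  0.  1.]]
    print(enc.feature_names_)      # ['hour', 'site=a', 'site=b']
    return enc
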
def logistic_regression_ol(load_model=False):
    start = time.time()
    if not load_model:
        printYellow("*  Logistic regression (using online learning) model training started...")

    # Build Classifier
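    # NOTE (version caveat): loss='log' and n_iter match older scikit-learn
    # releases; newer versions renamed them to loss='log_loss' and max_iter.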
    sgd_log_reg_model = SGDClassifier(loss='log', penalty=None, fit_intercept=True, n_iter=1, learning_rate='constant', eta0=0.01)
    
    # Training sets
    X_dict_train, y_train = process_data(100000)
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)
    
    X_train_100k = X_train
    y_train_100k = np.array(y_train)

    # Test sets
    X_dict_test, y_test_next10k = process_data(10000, 100000)
    X_test_next10k = dict_one_hot_encoder.transform(X_dict_test)

    
    if load_model:
        printGreen('✔  Loading model from previous training...')
        with open('../models/logistic_regression_model_ol.sav', 'rb') as l_reg_file:
            sgd_log_reg_model = pickle.load(l_reg_file)
        # Use the same held-out slice evaluated after training (offset 20 * 200000)
        X_dict_test, y_test_next = process_data(10000, 20 * 200000)
        X_test_next = dict_one_hot_encoder.transform(X_dict_test)
        predict = sgd_log_reg_model.predict_proba(X_test_next)[:, 1]
        score = roc_auc_score(y_test_next, predict)
        printGreen("✔  ROC AUC score on test set: {0:.3f}".format(score))
        return 0

    # Train incrementally on 20 chunks of 100,000 samples (2 million total)
    for i in range(20):
        X_dict_train, y_train_every = process_data(100000, i * 100000)
        X_train_every = dict_one_hot_encoder.transform(X_dict_train)
        sgd_log_reg_model.partial_fit(X_train_every, y_train_every, classes=[0, 1])
    
    printGreen('✔  Logistic regression (using online learning) model training complete...\t\t{0:.1f}s'.format(time.time() - start))
    
    # Get a held-out test set beyond the training samples
    X_dict_test, y_test_next = process_data(10000, 20 * 200000)
    X_test_next = dict_one_hot_encoder.transform(X_dict_test)
    
    # Evaluate
    predict = sgd_log_reg_model.predict_proba(X_test_next)[:, 1]
    score = roc_auc_score(y_test_next, predict)
    printGreen("✔  ROC AUC score on test set: {0:.3f}".format(score))

    # Save model to disk
    with open('../models/logistic_regression_model_ol.sav', 'wb') as l_reg_file:
        pickle.dump(sgd_log_reg_model, l_reg_file)
    printGreen('✔  Logistic regression (using online learning) model saved...')
    return 0
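
# A minimal sketch (hypothetical synthetic data) of the online-learning pattern
# above: stream the data in chunks and call partial_fit on each one, passing
# the full label set so the classifier knows all classes up front.
def _demo_partial_fit():
    import numpy as np
    from sklearn.linear_model import SGDClassifier

    rng = np.random.RandomState(0)
    model = SGDClassifier(loss='log_loss')  # use loss='log' on older scikit-learn
    for _ in range(10):                     # 10 chunks of 1,000 samples each
        X_chunk = rng.randn(1000, 5)
        y_chunk = (X_chunk[:, 0] > 0).astype(int)
        model.partial_fit(X_chunk, y_chunk, classes=[0, 1])
    return model
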
def random_forest(load_model=False):
    start = time.time()
    if not load_model:
        printYellow("*  Random forest model training started...")

    # Create a training set of 100,000 samples
    X_dict_train, y_train = process_data(100000)

    # Transform training dictionary into one-hot encoded vectors
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)

    # Create the test set and transform it into one-hot encoded vectors
    X_dict_test, y_test = process_data(100000, 100000)
    X_test = dict_one_hot_encoder.transform(X_dict_test)

    # Load model from disk instead of retraining
    if load_model:
        printGreen('✔  Loading model from previous training...')
        with open('../models/random_forest_model.sav', 'rb') as r_forest_file:
            random_forest_final = pickle.load(r_forest_file)
        probs = random_forest_final.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, probs)
        printGreen('✔  ROC AUC score on test set: {0:.3f}'.format(score))
        return 0
    
    # Train random forest classifier
    params = {'max_depth': [3, 10, None]}
    random_forest_model = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30,
                                                 n_jobs=-1)
    grid_search = GridSearchCV(random_forest_model, params, n_jobs=-1, cv=3, scoring='roc_auc')
    grid_search.fit(X_train, y_train)
    printGreen('✔  Random forest model training complete...\t\t{0:.1f}s'.format(time.time() - start))

    # Use the model with the best parameters as the final model
    random_forest_final = grid_search.best_estimator_

    # Evaluate model
    probs = random_forest_final.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, probs)
    printGreen('✔  ROC AUC score on test set: {0:.3f}'.format(score))

    # Save model to disk
    with open('../models/random_forest_model.sav', 'wb') as random_forest_file:
        pickle.dump(random_forest_final, random_forest_file)
    printGreen('✔  Random forest model saved...')
    return 0
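
# A minimal sketch (hypothetical toy data, not the CTR dataset) of the
# GridSearchCV pattern used above: cross-validate every candidate in the
# parameter grid, then keep the refitted best estimator as the final model.
def _demo_grid_search():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_samples=500, random_state=0)
    grid = GridSearchCV(RandomForestClassifier(n_estimators=10),
                        {'max_depth': [3, 10, None]},
                        cv=3, scoring='roc_auc')
    grid.fit(X, y)
    # best_estimator_ has already been refit on all of X, y with the best params
    print(grid.best_params_, grid.best_score_)
    return grid.best_estimator_
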
def logistic_regression(sample_size=100000, load_model=False):
    start = time.time()
    if not load_model:
        printYellow("*  Logistic regression model training started...")

    # Create Training Set
    n = sample_size
    X_dict_train, y_train = process_data(n)
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)

    # Create Test Set
    X_dict_test, y_test = process_data(n, n)
    X_test = dict_one_hot_encoder.transform(X_dict_test)

    X_train_n = X_train
    y_train_n = np.array(y_train)

    # Load model from disk instead of retraining
    if load_model:
        printGreen('✔  Loading model from previous training...')
        with open('../models/logistic_regression_model.sav', 'rb') as l_reg_file:
            sgd_log_reg_model = pickle.load(l_reg_file)
        predictions = sgd_log_reg_model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, predictions)
        printGreen("✔  ROC AUC score on test set: {0:.3f}".format(score))
        return 0

    # Create SGD Logistic Regression Classifier
    sgd_log_reg_model = SGDClassifier(loss='log', penalty=None, fit_intercept=True,
                                      n_iter=5, learning_rate='constant', eta0=0.01)

    # Train Classifier
    sgd_log_reg_model.fit(X_train_n, y_train_n)
    printGreen('✔  Logistic regression model training complete...\t\t{0:.1f}s'.format(time.time() - start))

    # Run model on test set
    predictions = sgd_log_reg_model.predict_proba(X_test)[:, 1]

    # Evaluate model
    score = roc_auc_score(y_test, predictions)
    printGreen("✔  ROC AUC score on test set: {0:.3f}".format(score))

    # Save model to disk
    with open('../models/logistic_regression_model.sav', 'wb') as l_reg_file:
        pickle.dump(sgd_log_reg_model, l_reg_file)
    printGreen('✔  Logistic regression model saved...')
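
# A minimal sketch of the pickle save/load round trip used by every model in
# this file (the path here is hypothetical).
def _demo_pickle_roundtrip(model, path='/tmp/model.sav'):
    import pickle

    with open(path, 'wb') as f_out:   # serialize the fitted model
        pickle.dump(model, f_out)
    with open(path, 'rb') as f_in:    # restore it later for inference
        return pickle.load(f_in)
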
Example #5
def split_data(df, status):
    start = time.time()

    # Train on everything before 2011-10-09, validate on the following month,
    # and test on everything from 2011-11-09 onward. The strict upper bound on
    # the validation slice keeps 2011-11-09 rows out of both sets.
    df_train = df[df.invoicedate < '2011-10-09']
    df_train = df_train.reset_index(drop=True)
    df_val = df[(df.invoicedate >= '2011-10-09') &
                (df.invoicedate < '2011-11-09')]
    df_val = df_val.reset_index(drop=True)
    df_test = df[df.invoicedate >= '2011-11-09']
    df_test = df_test.reset_index(drop=True)

    if status:
        printGreen('✔ Split Data\t\t{0:.1f}s'.format(time.time() - start))
    
    # Pack data: [train, test, val]
    return [df_train, df_test, df_val]
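
# A minimal sketch (hypothetical toy frame) of the date-based split above.
# Note the return order: [train, test, val].
def _demo_split_data():
    import pandas as pd

    df = pd.DataFrame({
        'invoicedate': pd.to_datetime(['2011-09-01', '2011-10-15', '2011-12-01']),
        'stockcode': [0, 1, 2],
    })
    df_train, df_test, df_val = split_data(df, status=False)
    print(len(df_train), len(df_val), len(df_test))  # 1 1 1
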
def main():
    # Initial Message
    printGreen("Click-through rate models training started...\n")

    # Logistic Regression
    printGreen('Logistic Regression')
    logistic_regression(load_model=True)
    print('\n')

    # OL Logistic Regression
    printGreen('Logistic Regression using Online Learning')
    logistic_regression_ol(load_model=True)
    print('\n')

    printGreen("✔  Done")
def logistic_regression_ol(load_model=True):
    start = time.time()

    if not load_model:
        printYellow(
            "*  Logistic regression (using online learning) model training started..."
        )

    # Build Classifier
    log_reg_model = LogisticRegression()

    # Training sets
    X_dict_train, y_train = process_data()
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)

    X_train_100k = X_train
    y_train_100k = np.array(y_train)

    # Test sets
    X_dict_test, y_test_next10k = process_test_data()
    X_test_next10k = dict_one_hot_encoder.transform(X_dict_test)

    if load_model:
        printGreen('✔  Loading model from previous training...')
        with open('./models/logistic_regression_model_ol.sav', 'rb') as l_reg_file:
            log_reg_model = pickle.load(l_reg_file)
        predictions = log_reg_model.predict_proba(X_test_next10k)[:, 1]
        score = roc_auc_score(y_test_next10k, predictions)
        printGreen("✔  ROC AUC score on test set: {0:.3f}".format(score))

        # Sweep base bids and record the clicks each bid yields on the test set
        with open('logistic_regression_ol_base_bid.csv', 'w') as f:
            f.write('base_bid,clicks\n')
            for base_bid in range(50, 90, 1):
                score = bidding(predictions, base_bid)
                f.write(str(base_bid) + ',' + str(score) + '\n')
        printGreen('✔  clicks on test set: ' + str(score))

        return 0
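    # NOTE: the duplicate logistic_regression definition below is disabled in
    # the source by wrapping it in a string literal, so it never executes.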
    '''
def logistic_regression(sample_size=100000, load_model=True):
    start = time.time()

    if not load_model:
        printYellow("*  Logistic regression model training started...")

    # Create Training Set
    n = sample_size
    X_dict_train, y_train = process_data()
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)

    # Create Test Set
    X_dict_test, y_test = process_test_data()
    X_test = dict_one_hot_encoder.transform(X_dict_test)

    X_train_n = X_train
    y_train_n = np.array(y_train)

    # Load model from disk instead of retraining
    if load_model:
        printGreen('✔  Loading model from previous training...')
        with open('./models/logistic_regression_model.sav', 'rb') as l_reg_file:
            log_reg_model = pickle.load(l_reg_file)
        predictions = log_reg_model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, predictions)
        printGreen("✔  ROC AUC score on test set: {0:.3f}".format(score))

        # Sweep base bids and record the clicks each bid yields on the test set
        with open('logistic_regression_base_bid.csv', 'w') as f:
            f.write('base_bid,clicks\n')
            for base_bid in range(50, 90, 1):
                score = bidding(predictions, base_bid)
                f.write(str(base_bid) + ',' + str(score) + '\n')
        printGreen('✔  clicks on test set: ' + str(score))

        return 0
    '''
def main():
    # Initial Message
    printGreen("Click-through rate models training started...\n")

    # Decision Tree
    printGreen('Decision Tree')
    decision_tree(load_model=True)
    print('\n')

    # Random Forest
    printGreen('Random Forest')
    random_forest(load_model=False)
    print('\n')

    # Logistic Regression
    printGreen('SGD Based Logistic Regression')
    logistic_regression(load_model=True)
    print('\n')

    # OL Logistic Regression
    printGreen('Logistic Regression using Online Learning')
    logistic_regression_ol(load_model=True)
    print('\n')

    printGreen("✔  Done")
Example #10
def recommender(customer_id, status):
    # Start time
    start = time.time()
    if status:
        printGreen('✔ RetailBox started..\t\t{0:.1f}s'.format(time.time() -
                                                              start))
    start = time.time()

    # Validate User Input
    validate_customer_id(customer_id)

    # Load dataframe and unpack the item table, purchase matrix, etc.
    data = preprocess_data_rec_engine(status=True)
    item_table, purchase_sparse_matrix, customers, products, quantity = data

    if status:
        printGreen('✔ Processed Data..\t\t{0:.1f}s'.format(time.time() -
                                                           start))
    start = time.time()

    # Split data into training and test sets
    training_test_split_data = split_data_mask(purchase_sparse_matrix,
                                               pct_test=0.2)
    product_training_set, product_test_set, product_user_altered = training_test_split_data

    if status:
        printGreen(
            '✔ Split Data into Training and Test Sets..\t\t{0:.1f}s'.format(
                time.time() - start))
    start = time.time()

    # Train Recommendation Engine on given algorithm
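    # alpha scales raw purchase counts into confidence weights for implicit ALS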
    alpha = 15
    recommender_vecs = implicit.alternating_least_squares(
        (product_training_set * alpha).astype('double'),
        factors=20,
        regularization=0.1,
        iterations=50)

    user_vecs, item_vecs = recommender_vecs

    customers_arr = np.array(customers)
    products_arr = np.array(products)

    if status:
        printGreen('✔ Recommender System Training Done..\t\t{0:.1f}s'.format(
            time.time() - start))
    start = time.time()

    # Lookup customer id
    cid = lookup_customer_id(customer_id)

    # Generate Recommendations for Customer
    rec_output = rec_items(cid, product_training_set, user_vecs, item_vecs,
                           customers_arr, products_arr, item_table)

    # Display customer info
    df = pd.read_pickle('../data/final/df_final.pkl')
    with open('../data/final/df_customer_table.pkl', 'rb') as table_pickle_file:
        customer_table = pickle.load(table_pickle_file)
    search_customer(customer_id, df, customer_table)

    # Display Item Recommendations
    recommended_items_list = list_rec(rec_output)
    display_recommender_items(recommended_items_list)
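
# A minimal sketch (hypothetical toy matrix) of the ALS training step above.
# Older implicit releases expose implicit.alternating_least_squares(...) and
# return (user_vecs, item_vecs); newer ones use the model class below. Whether
# fit() expects an item-user or user-item matrix also depends on the version.
def _demo_implicit_als():
    import numpy as np
    import scipy.sparse as sparse
    from implicit.als import AlternatingLeastSquares

    # 3 users x 4 items of implicit purchase counts
    purchases = sparse.csr_matrix(np.array([[1, 0, 2, 0],
                                            [0, 3, 0, 1],
                                            [1, 1, 0, 0]], dtype='double'))
    alpha = 15  # confidence scaling for implicit feedback
    model = AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)
    model.fit(purchases * alpha)
    return model.user_factors, model.item_factors
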
Example #11
def process_data(status):
    start = time.time()
    
    # Get file path for source dataset
    file_path = '../data/processed/df_retail.bin'
    df = None

    # If the processed pickle doesn't exist yet, build it from the raw Excel file
    if not os.path.exists(file_path):
        df = pd.read_excel('../data/raw/Online Retail.xlsx')
        # Save df into a pickle file for faster subsequent loads
        with open(file_path, 'wb') as f_out:
            pickle.dump(df, f_out)

    with open(file_path, 'rb') as f_in:
        df = pickle.load(f_in)

    # Display import status
    if status:
        printGreen('✔ Imported Data\t\t{0:.1f}s'.format(time.time() - start))
    
    # Clean data: lowercase column names, drop "return" transactions
    # (invoice numbers starting with 'C'), and mark unknown customers as -1
    start = time.time()

    df.columns = df.columns.str.lower()
    df = df[~df.invoiceno.astype('str').str.startswith('C')].reset_index(drop=True)
    df.customerid = df.customerid.fillna(-1).astype('int32')

    # Encode item IDs with integers
    stockcode_values = df.stockcode.astype('str')
    stockcodes = sorted(set(stockcode_values))
    stockcodes = {c: i for (i, c) in enumerate(stockcodes)}
    df.stockcode = stockcode_values.map(stockcodes).astype('int32')

    # Display process status
    if status:
        printGreen('✔ Processed Data\t\t{0:.1f}s'.format(time.time() - start))

    # Store customer IDs in a lookup table, assigning each known customer a
    # sequential index (unknown customers were filled with -1 above)
    start = time.time()
    customer_id = {}
    i = 0
    for cid in df['customerid']:
        if cid != -1 and cid not in customer_id:
            customer_id[cid] = i
            i += 1
    
    # Save customer ID table
    with open('../data/final/df_customer_table_long.pkl', 'wb') as customer_id_storage:
        pickle.dump(customer_id, customer_id_storage)
    
    # Display process status
    if status:
        printGreen('✔ Stored Customer Data in Table\t\t{0:.1f}s'.format(time.time() - start))

    # Save final DF for quick access
    df.to_pickle('../data/final/df_final.pkl')

    return df
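
# A minimal sketch (toy series) of the integer-encoding step above: build a
# stable code for every distinct stockcode, then map the column through it.
def _demo_encode_stockcodes():
    import pandas as pd

    codes = pd.Series(['85123A', '71053', '85123A', '84406B'])
    mapping = {c: i for i, c in enumerate(sorted(set(codes)))}
    print(codes.map(mapping).astype('int32').tolist())  # [2, 0, 2, 1]
    return mapping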