def decision_tree(load_model=False):
    start = time.time()
    if load_model == False:
        printYellow("* Decision tree model training started...")

    # Create training set of 100,000 samples
    n_max = 100000
    X_dict_train, y_train = process_data(n_max)

    # Transform training dictionary into one-hot encoded vectors
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)

    # Create test set and turn it into one-hot encoded vectors
    X_dict_test, y_test = process_data(100000, 100000)
    X_test = dict_one_hot_encoder.transform(X_dict_test)

    # Load model instead of training again
    if load_model == True:
        printGreen('✔ Loading model from previous training...')
        d_tree_file = open('../models/decision_tree_model.sav', 'rb')
        decision_tree_final = pickle.load(d_tree_file)

        # Evaluate model on test set
        prob = decision_tree_final.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, prob)
        printGreen('✔ ROC AUC score on test set: {0:.3f}'.format(score))
        d_tree_file.close()
        return 0

    # Train decision tree classifier with a grid search over max_depth
    params = {'max_depth': [3, 10, None]}
    decision_tree_model = DecisionTreeClassifier(criterion='gini',
                                                 min_samples_split=30)
    grid_search = GridSearchCV(decision_tree_model, params, n_jobs=-1,
                               cv=3, scoring='roc_auc')
    grid_search.fit(X_train, y_train)
    printGreen('✔ Decision tree model training complete...\t\t{0:.1f}s'.format(time.time() - start))

    # Use model with best parameters as final model
    decision_tree_final = grid_search.best_estimator_

    # Evaluate final model on test set
    prob = decision_tree_final.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, prob)
    printGreen('✔ ROC AUC score on test set: {0:.3f}'.format(score))

    # Save model
    decision_tree_model_file = open('../models/decision_tree_model.sav', 'wb')
    pickle.dump(decision_tree_final, decision_tree_model_file)
    decision_tree_model_file.close()
    printGreen('✔ Decision tree model saved...')
    return 0
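# --- Hedged sketch (not part of the original file) ---
# decision_tree() above and the other trainers in this file call
# process_data(n, offset), which is defined elsewhere in the repo. A minimal
# sketch of what it might look like, assuming an Avazu/Criteo-style CSV with a
# 'click' label column; the path, column names, and fields are assumptions,
# not the repo's actual implementation.
import pandas as pd

def process_data_sketch(n_samples, offset=0, path='../data/train.csv'):
    # Read one chunk of rows, skipping the first `offset` data rows
    df = pd.read_csv(path, skiprows=range(1, offset + 1), nrows=n_samples)
    y = df['click'].values                                    # binary labels
    X_dict = df.drop(columns=['click']).to_dict('records')   # one dict per row
    return X_dict, y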
def logistic_regression_ol(load_model=False):
    start = time.time()
    if load_model == False:
        printYellow("* Logistic regression (using online learning) model training started...")

    # Build classifier
    sgd_log_reg_model = SGDClassifier(loss='log', penalty=None, fit_intercept=True,
                                      n_iter=1, learning_rate='constant', eta0=0.01)

    # Training sets
    X_dict_train, y_train = process_data(100000)
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)
    X_train_100k = X_train
    y_train_100k = np.array(y_train)

    # Test sets
    X_dict_test, y_test_next10k = process_data(10000, 100000)
    X_test_next10k = dict_one_hot_encoder.transform(X_dict_test)

    # Load model instead of training again
    if load_model == True:
        printGreen('✔ Loading model from previous training...')
        l_reg_file = open('../models/logistic_regression_model_ol.sav', 'rb')
        sgd_log_reg_model = pickle.load(l_reg_file)

        X_dict_test, y_test_next = process_data(10000, (20 + 1) * 200000)
        X_test_next = dict_one_hot_encoder.transform(X_dict_test)
        predict = sgd_log_reg_model.predict_proba(X_test_next)[:, 1]
        score = roc_auc_score(y_test_next, predict)
        printGreen("✔ ROC AUC score on test set: {0:.3f}".format(score))
        l_reg_file.close()
        return 0

    # Train with partial_fit over 20 chunks of 100,000 samples each
    for i in range(20):
        X_dict_train, y_train_every = process_data(100000, i * 100000)
        X_train_every = dict_one_hot_encoder.transform(X_dict_train)
        sgd_log_reg_model.partial_fit(X_train_every, y_train_every, classes=[0, 1])
    printGreen('✔ Logistic regression (using online learning) model training complete...\t\t{0:.1f}s'.format(time.time() - start))

    # Get test set
    X_dict_test, y_test_next = process_data(10000, (i + 1) * 200000)
    X_test_next = dict_one_hot_encoder.transform(X_dict_test)

    # Evaluate
    predict = sgd_log_reg_model.predict_proba(X_test_next)[:, 1]
    score = roc_auc_score(y_test_next, predict)
    printGreen("✔ ROC AUC score on test set: {0:.3f}".format(score))

    # Save model
    l_reg_file = open('../models/logistic_regression_model_ol.sav', 'wb')
    pickle.dump(sgd_log_reg_model, l_reg_file)
    l_reg_file.close()
    printGreen('✔ Logistic regression (using online learning) model saved...')
    return 0
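# --- Hedged sketch (not part of the original file) ---
# Why partial_fit above enables online learning: SGDClassifier consumes the
# data one chunk at a time instead of holding everything in memory. A
# self-contained toy demo on synthetic data (note: newer scikit-learn versions
# spell the options loss='log_loss' and max_iter instead of n_iter):
import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
clf = SGDClassifier(loss='log_loss', learning_rate='constant', eta0=0.01)
for chunk in range(5):
    X_chunk = rng.rand(1000, 10)
    y_chunk = (X_chunk[:, 0] > 0.5).astype(int)
    # classes= must list every label on the first call
    clf.partial_fit(X_chunk, y_chunk, classes=[0, 1])
print(clf.predict_proba(rng.rand(3, 10))[:, 1])  # model updated chunk by chunk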
def random_forest(load_model=False):
    start = time.time()
    if load_model == False:
        printYellow("* Random forest model training started...")

    # Create training set of 100,000 samples
    n_max = 100000
    X_dict_train, y_train = process_data(n_max)

    # Transform training dictionary into one-hot encoded vectors
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)

    # Create test set and turn it into one-hot encoded vectors
    X_dict_test, y_test = process_data(100000, 100000)
    X_test = dict_one_hot_encoder.transform(X_dict_test)

    # Load model instead of training again
    if load_model == True:
        printGreen('✔ Loading model from previous training...')
        r_forest_file = open('../models/random_forest_model.sav', 'rb')
        random_forest_final = pickle.load(r_forest_file)
        probs = random_forest_final.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, probs)
        printGreen('✔ ROC AUC score on test set: {0:.3f}'.format(score))
        r_forest_file.close()
        return 0

    # Train random forest classifier with a grid search over max_depth
    params = {'max_depth': [3, 10, None]}
    random_forest_model = RandomForestClassifier(n_estimators=100, criterion='gini',
                                                 min_samples_split=30, n_jobs=-1)
    grid_search = GridSearchCV(random_forest_model, params, n_jobs=-1,
                               cv=3, scoring='roc_auc')
    grid_search.fit(X_train, y_train)
    printGreen('✔ Random forest model training complete...\t\t{0:.1f}s'.format(time.time() - start))

    # Use best parameters for the final model
    random_forest_final = grid_search.best_estimator_

    # Evaluate model on test set
    probs = random_forest_final.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, probs)
    printGreen('✔ ROC AUC score on test set: {0:.3f}'.format(score))

    # Save model
    random_forest_file = open('../models/random_forest_model.sav', 'wb')
    pickle.dump(random_forest_final, random_forest_file)
    random_forest_file.close()
    printGreen('✔ Random forest model saved...')
    return 0
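# --- Hedged sketch (not part of the original file) ---
# After either grid search above finishes, the chosen depth and the per-fold
# scores can be inspected before trusting the single ROC AUC number; this
# assumes `grid_search` is the fitted GridSearchCV object from random_forest():
print(grid_search.best_params_)                 # e.g. {'max_depth': 10}
print(grid_search.best_score_)                  # mean cross-validated ROC AUC
for mean, params in zip(grid_search.cv_results_['mean_test_score'],
                        grid_search.cv_results_['params']):
    print('{0:.3f}  {1}'.format(mean, params))  # full sweep summary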
def logistic_regression(sample_size=100000, load_model=False):
    start = time.time()
    if load_model == False:
        printYellow("* Logistic regression model training started...")

    # Create training set
    n = sample_size
    X_dict_train, y_train = process_data(n)
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)

    # Create test set
    X_dict_test, y_test = process_data(n, n)
    X_test = dict_one_hot_encoder.transform(X_dict_test)
    X_train_n = X_train
    y_train_n = np.array(y_train)

    # Load model instead of training again
    if load_model == True:
        printGreen('✔ Loading model from previous training...')
        l_reg_file = open('../models/logistic_regression_model.sav', 'rb')
        sgd_log_reg_model = pickle.load(l_reg_file)
        predictions = sgd_log_reg_model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, predictions)
        printGreen("✔ ROC AUC score on test set: {0:.3f}".format(score))
        l_reg_file.close()
        return 0

    # Create SGD-based logistic regression classifier
    sgd_log_reg_model = SGDClassifier(loss='log', penalty=None, fit_intercept=True,
                                      n_iter=5, learning_rate='constant', eta0=0.01)

    # Train classifier
    sgd_log_reg_model.fit(X_train_n, y_train_n)
    printGreen('✔ Logistic regression model training complete...\t\t{0:.1f}s'.format(time.time() - start))

    # Run model on test set
    predictions = sgd_log_reg_model.predict_proba(X_test)[:, 1]

    # Evaluate model
    score = roc_auc_score(y_test, predictions)
    printGreen("✔ ROC AUC score on test set: {0:.3f}".format(score))

    # Save model
    l_reg_file = open('../models/logistic_regression_model.sav', 'wb')
    pickle.dump(sgd_log_reg_model, l_reg_file)
    l_reg_file.close()
    printGreen('✔ Logistic regression model saved...')
    return 0
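# --- Hedged sketch (not part of the original file) ---
# All four trainers above rely on DictVectorizer to one-hot encode categorical
# features. A tiny standalone example of the transform they share:
from sklearn.feature_extraction import DictVectorizer

rows = [{'site': 'a.com', 'device': 'mobile'},
        {'site': 'b.com', 'device': 'desktop'}]
enc = DictVectorizer(sparse=False)
X = enc.fit_transform(rows)   # one column per (feature, value) pair
print(enc.feature_names_)     # ['device=desktop', 'device=mobile', 'site=a.com', 'site=b.com']
print(X)                      # [[0. 1. 1. 0.], [1. 0. 0. 1.]]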
def split_data(df, status):
    start = time.time()

    # Training set: everything before 2011-10-09
    df_train = df[df.invoicedate < '2011-10-09']
    df_train = df_train.reset_index(drop=True)

    # Validation set: 2011-10-09 through 2011-11-09
    df_val = df[(df.invoicedate >= '2011-10-09') & (df.invoicedate <= '2011-11-09')]
    df_val = df_val.reset_index(drop=True)

    # Test set: everything from 2011-11-09 onward
    df_test = df[df.invoicedate >= '2011-11-09']
    df_test = df_test.reset_index(drop=True)

    if status:
        printGreen('✔ Split Data\t\t{0:.1f}s'.format(time.time() - start))

    # Pack data
    return [df_train, df_test, df_val]
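# --- Hedged sketch (not part of the original file) ---
# A quick usage example for split_data(), assuming df has an invoicedate
# column of datetimes (pandas compares datetime columns against date strings,
# which is what the filters above rely on). Note the return order is
# [train, test, val]:
import pandas as pd

df_demo = pd.DataFrame({'invoicedate': pd.to_datetime(['2011-09-01', '2011-10-20', '2011-12-01']),
                        'stockcode': [0, 1, 2]})
df_train, df_test, df_val = split_data(df_demo, status=False)
print(len(df_train), len(df_val), len(df_test))  # 1 1 1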
def main():
    # Initial Message
    printGreen("Click-through rate models training started...\n")

    # Logistic Regression
    printGreen('Logistic Regression')
    logistic_regression(load_model=True)
    print('\n')

    # OL Logistic Regression
    printGreen('Logistic Regression using Online Learning')
    logistic_regression_ol(load_model=True)
    print('\n')

    printGreen("✔ Done")
def logistic_regression_ol(load_model=True):
    start = time.time()
    if load_model == False:
        printYellow("* Logistic regression (using online learning) model training started...")

    # Build classifier
    log_reg_model = LogisticRegression()

    # Training sets
    X_dict_train, y_train = process_data()
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)
    X_train_100k = X_train
    y_train_100k = np.array(y_train)

    # Test sets
    X_dict_test, y_test_next10k = process_test_data()
    X_test_next10k = dict_one_hot_encoder.transform(X_dict_test)

    # Load model instead of training again
    if load_model == True:
        printGreen('✔ Loading model from previous training...')
        l_reg_file = open('./models/logistic_regression_model_ol.sav', 'rb')
        log_reg_model = pickle.load(l_reg_file)
        predictions = log_reg_model.predict_proba(X_test_next10k)[:, 1]
        score = roc_auc_score(y_test_next10k, predictions)
        printGreen("✔ ROC AUC score on test set: {0:.3f}".format(score))

        # Sweep base bids and record clicks won on the test set
        f = open('logistic_regression_ol_base_bid.csv', 'w')
        f.write('base_bid,clicks\n')
        for base_bid in range(50, 90, 1):
            score = bidding(predictions, base_bid)
            f.write(str(base_bid) + ',' + str(score) + '\n')
        f.close()
        printGreen('✔ clicks on test set: ' + str(score))
        return 0
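# --- Hedged sketch (not part of the original file) ---
# bidding(predictions, base_bid) is defined elsewhere in this repo. One
# plausible implementation, following the common linear bidding strategy
# bid = base_bid * pCTR / avgCTR and counting clicks won within a budget.
# The extra arguments, constants, and price units here are assumptions for
# illustration only; the repo version presumably reads the test-set prices
# and clicks itself, given its two-argument signature:
def bidding_sketch(predictions, base_bid, pay_prices, clicks,
                   avg_ctr=0.0007, budget=6250.0):
    spent, won_clicks = 0.0, 0
    for p_ctr, price, click in zip(predictions, pay_prices, clicks):
        bid = base_bid * p_ctr / avg_ctr   # bid linearly in predicted CTR
        if bid >= price:                   # win the auction
            spent += price / 1000.0        # assume prices per 1000 impressions
            won_clicks += click
            if spent > budget:             # stop once the budget is exhausted
                break
    return won_clicks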
def logistic_regression(sample_size=100000, load_model=True):
    start = time.time()
    if load_model == False:
        printYellow("* Logistic regression model training started...")

    # Create training set
    n = sample_size
    X_dict_train, y_train = process_data()
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    X_train = dict_one_hot_encoder.fit_transform(X_dict_train)

    # Create test set
    X_dict_test, y_test = process_test_data()
    X_test = dict_one_hot_encoder.transform(X_dict_test)
    X_train_n = X_train
    y_train_n = np.array(y_train)

    # Load model instead of training again
    if load_model == True:
        printGreen('✔ Loading model from previous training...')
        l_reg_file = open('./models/logistic_regression_model.sav', 'rb')
        log_reg_model = pickle.load(l_reg_file)
        predictions = log_reg_model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, predictions)
        printGreen("✔ ROC AUC score on test set: {0:.3f}".format(score))

        # Sweep base bids and record clicks won on the test set
        f = open('logistic_regression_base_bid.csv', 'w')
        f.write('base_bid,clicks\n')
        for base_bid in range(50, 90, 1):
            score = bidding(predictions, base_bid)
            f.write(str(base_bid) + ',' + str(score) + '\n')
        f.close()
        printGreen('✔ clicks on test set: ' + str(score))
        return 0
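# --- Hedged sketch (not part of the original file) ---
# Both sweeps above write a base_bid,clicks CSV. Picking the best base bid
# back out of either file is a few lines with pandas:
import pandas as pd

results = pd.read_csv('logistic_regression_base_bid.csv')
best = results.loc[results['clicks'].idxmax()]
print('best base_bid: {0}, clicks: {1}'.format(best['base_bid'], best['clicks']))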
def main():
    # Initial Message
    printGreen("Click-through rate models training started...\n")

    # Decision Tree
    printGreen('Decision Tree')
    decision_tree(load_model=True)
    print('\n')

    # Random Forest
    printGreen('Random Forest')
    random_forest(load_model=False)
    print('\n')

    # Logistic Regression
    printGreen('SGD Based Logistic Regression')
    logistic_regression(load_model=True)
    print('\n')

    # OL Logistic Regression
    printGreen('Logistic Regression using Online Learning')
    logistic_regression_ol(load_model=True)
    print('\n')

    printGreen("✔ Done")
def recommender(customer_id, status):
    # Start time
    start = time.time()
    if status:
        printGreen('✔ RetailBox started..\t\t{0:.1f}s'.format(time.time() - start))
        start = time.time()

    # Validate user input
    validate_customer_id(customer_id)

    # Load dataframe and create item table, purchase matrix, etc.
    data = preprocess_data_rec_engine(status=True)
    item_table = data[0]
    purchase_sparse_matrix = data[1]
    customers = data[2]
    products = data[3]
    quantity = data[4]
    if status:
        printGreen('✔ Processed Data..\t\t{0:.1f}s'.format(time.time() - start))
        start = time.time()

    # Split data (training/test split)
    training_test_split_data = split_data_mask(purchase_sparse_matrix, pct_test=0.2)
    product_training_set = training_test_split_data[0]
    product_test_set = training_test_split_data[1]
    product_user_altered = training_test_split_data[2]
    if status:
        printGreen('✔ Split Data into Training and Test Sets..\t\t{0:.1f}s'.format(time.time() - start))
        start = time.time()

    # Train recommendation engine with implicit-feedback ALS
    alpha = 15
    recommender_vecs = implicit.alternating_least_squares(
        (product_training_set * alpha).astype('double'),
        factors=20,
        regularization=0.1,
        iterations=50)
    user_vecs = recommender_vecs[0]
    item_vecs = recommender_vecs[1]
    customers_arr = np.array(customers)
    products_arr = np.array(products)
    if status:
        printGreen('✔ Recommender System Training Done..\t\t{0:.1f}s'.format(time.time() - start))
        start = time.time()

    # Look up customer id
    cid = lookup_customer_id(customer_id)

    # Generate recommendations for customer
    rec_output = rec_items(cid, product_training_set, user_vecs, item_vecs,
                           customers_arr, products_arr, item_table)

    # Display customer
    df = pd.read_pickle('../data/final/df_final.pkl')
    table_pickle_file = open('../data/final/df_customer_table.pkl', 'rb')
    customer_table = pickle.load(table_pickle_file)
    table_pickle_file.close()
    search_customer(customer_id, df, customer_table)

    # Display item recommendations
    recommended_items_list = list_rec(rec_output)
    display_recommender_items(recommended_items_list)
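# --- Hedged sketch (not part of the original file) ---
# split_data_mask() above is defined elsewhere in this repo. The standard
# approach for implicit feedback, sketched here under that assumption: copy
# the purchase matrix, hide a random pct_test of the non-zero entries in the
# training copy, and remember which users were altered so evaluation only
# looks at them. Names mirror the call site; the body is an assumption.
import random
import numpy as np

def split_data_mask_sketch(ratings, pct_test=0.2, seed=0):
    test_set = ratings.copy()
    test_set[test_set != 0] = 1               # binarize the held-out truth
    training_set = ratings.copy().tolil()     # LIL format allows cheap edits
    nonzero = list(zip(*ratings.nonzero()))   # all (user, item) interactions
    random.seed(seed)
    n_masked = int(np.ceil(pct_test * len(nonzero)))
    samples = random.sample(nonzero, n_masked)
    user_idx = [u for (u, i) in samples]
    item_idx = [i for (u, i) in samples]
    training_set[user_idx, item_idx] = 0      # hide the sampled interactions
    return [training_set.tocsr(), test_set, list(set(user_idx))]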
def process_data(status):
    start = time.time()

    # Get file path for source dataset
    file_path = '../data/processed/df_retail.bin'
    df = None

    # Check if processed data file exists; if not, process the raw dataset
    if os.path.exists(file_path) == False:
        df = pd.read_excel('../data/raw/Online Retail.xlsx')
        # Save df into pickle file
        with open(file_path, 'wb') as f_out:
            pickle.dump(df, f_out)
    with open(file_path, 'rb') as f_in:
        df = pickle.load(f_in)

    # Display import status
    if status:
        printGreen('✔ Imported Data\t\t{0:.1f}s'.format(time.time() - start))

    # Clean data: lowercase columns, drop "return" transactions,
    # flag unknown-customer transactions with -1
    start = time.time()
    df.columns = df.columns.str.lower()
    df = df[~df.invoiceno.astype('str').str.startswith('C')].reset_index(drop=True)
    df.customerid = df.customerid.fillna(-1).astype('int32')

    # Encode item IDs with integers
    stockcode_values = df.stockcode.astype('str')
    stockcodes = sorted(set(stockcode_values))
    stockcodes = {c: i for (i, c) in enumerate(stockcodes)}
    df.stockcode = stockcode_values.map(stockcodes).astype('int32')

    # Display process status
    if status:
        printGreen('✔ Processed Data\t\t{0:.1f}s'.format(time.time() - start))

    # Store customer IDs in a table (raw customerid -> dense index);
    # customer_id is a module-level dict defined elsewhere in this file
    start = time.time()
    i = 0
    counter = 0
    while counter < len(df):
        cid = df['customerid'][counter]
        if cid not in customer_id and cid != -1:
            customer_id[cid] = i
            i += 1
        counter += 1

    # Save customer ID table
    customer_id_storage = open('../data/final/df_customer_table_long.pkl', 'wb')
    pickle.dump(customer_id, customer_id_storage)
    customer_id_storage.close()

    # Display process status
    if status:
        printGreen('✔ Stored Customer Data in Table\t\t{0:.1f}s'.format(time.time() - start))

    # Save final df for quick access
    df.to_pickle('../data/final/df_final.pkl')
    return df
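# --- Hedged sketch (not part of the original file) ---
# A plausible sketch of the lookup_customer_id() helper that recommender()
# calls above: the table built in process_data() maps a raw customerid to the
# dense index the factorization uses. The body and pickle path are assumptions
# based on how customer_id is populated above.
import pickle

def lookup_customer_id_sketch(raw_id):
    with open('../data/final/df_customer_table_long.pkl', 'rb') as f:
        table = pickle.load(f)
    return table[int(raw_id)]   # dense row index for this customer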