import os
import pickle

import joblib
import pandas as pd
import scipy.stats as st  # used by the commented-out search below
from scipy.stats import randint as sp_randint
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

# Four alternative versions of train_run() follow (XGBoost, random forest
# with a randomized hyperparameter search, gradient boosting, and a random
# forest that also consumes historical user logs). Each later definition
# shadows the earlier ones, so only one variant is active per run.
# data_cleaner_train and the filename constants (pca_file_name,
# model_filename, scaler_filename, to_keep_file_name, is_int_fname) are
# defined elsewhere in the repo.


def train_run():
    # Variant 1: XGBoost. Note: verbose_eval belongs to xgboost's fit/train
    # call, not the sklearn-wrapper constructor, so it is dropped here.
    l_reg = XGBClassifier(seed=27, silent=False, subsample=0.8,
                          reg_lambda=5.0, n_estimators=800,
                          min_child_weight=1, max_depth=15,
                          learning_rate=0.03, gamma=0.5,
                          colsample_bytree=0.9, colsample_bylevel=0.7)
    file_path = os.path.join(os.getcwd(), 'input_data/train_amex/train.csv')
    data = pd.read_csv(file_path)
    data, y_hat, scaler, pca = data_cleaner_train(data)
    data.loc[:, 'yhat'] = y_hat

    # Rebalance classes: three rounds pairing all positives with an equally
    # sized sample of negatives. (DataFrame.append was removed in pandas 2.0,
    # so pd.concat is used throughout.)
    d1 = data.loc[data.yhat == 1, :]
    d2 = data.loc[data.yhat == 0, :]
    data_op = pd.DataFrame()
    for i in range(3):
        print(i)
        d1_sample = d1.sample(frac=1)
        d2_sample = d2.sample(n=len(d1_sample), replace=False)
        data_tmp = pd.concat([d1_sample, d2_sample])
        data_op = pd.concat([data_op, data_tmp], ignore_index=True)
    data_op = data_op.sample(frac=1)
    yhat_op = data_op.loc[:, 'yhat']
    data_op.drop('yhat', axis=1, inplace=True)
    data.drop('yhat', axis=1, inplace=True)

    # # specify parameters and distributions to sample from
    # one_to_left = st.beta(10, 1)
    # from_zero_positive = st.expon(0, 50)
    #
    # param_dist = {
    #     'silent': [False],
    #     'max_depth': [15, 10, 15],
    #     'learning_rate': [0.03, 0.05, 0.1],
    #     'subsample': [0.7, 0.8, 0.9, 1.0],
    #     'colsample_bytree': [0.7, 0.8, 0.9],
    #     'colsample_bylevel': [0.7, 0.8, 0.9],
    #     'min_child_weight': [0.5, 1.0, 3.0, 7.0, 10.0],
    #     'gamma': [0.25, 0.5, 1.0],
    #     'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0],
    #     'n_estimators': [200, 500, 800]}
    #
    # # run randomized search
    # n_iter_search = 10
    # random_search = RandomizedSearchCV(l_reg, param_distributions=param_dist,
    #                                    n_iter=n_iter_search, cv=3,
    #                                    scoring='roc_auc', n_jobs=3)
    #
    # model = random_search.fit(data_op.values, yhat_op.values)
    # report(random_search.cv_results_)

    model = l_reg.fit(data_op.values, yhat_op.values)
    # Evaluate on the full (unbalanced) training set. predict returns hard
    # 0/1 labels, so this is a label-based AUC; predict_proba would give the
    # usual score-based AUC.
    output = model.predict(data.values)
    roc = roc_auc_score(y_hat.values, output)
    print('#############################################################')
    print(roc)
    joblib.dump(pca, pca_file_name)
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
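# data_cleaner_train is defined elsewhere in the repo; the sketch below only
# illustrates the interface this variant assumes (cleaned features, target,
# fitted scaler, fitted PCA). The target column name 'is_click' and the
# StandardScaler/PCA steps are hypothetical stand-ins, not the repo's actual
# cleaning logic.
def data_cleaner_train_sketch(data):
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    y_hat = data.pop('is_click')  # hypothetical target column name
    data = data.select_dtypes('number').fillna(0)
    scaler = StandardScaler().fit(data)
    pca = PCA(n_components=0.95, svd_solver='full').fit(scaler.transform(data))
    cleaned = pd.DataFrame(pca.transform(scaler.transform(data)))
    return cleaned, y_hat, scaler, pca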
def train_run():
    # Variant 2: random forest tuned with a randomized hyperparameter search.
    l_reg = RandomForestClassifier(random_state=0, n_jobs=3, verbose=3)
    file_path = os.path.join(os.getcwd(), 'input_data/train_amex/train.csv')
    data = pd.read_csv(file_path)
    data, y_hat, scaler = data_cleaner_train(data)
    data.loc[:, 'yhat'] = y_hat

    # Rebalance classes: five rounds pairing all positives with an equally
    # sized sample of negatives.
    d1 = data.loc[data.yhat == 1, :]
    d2 = data.loc[data.yhat == 0, :]
    data_op = pd.DataFrame()
    for i in range(5):
        print(i)
        d1_sample = d1.sample(frac=1, replace=False)
        d2_sample = d2.sample(n=len(d1_sample), replace=False)
        data_tmp = pd.concat([d1_sample, d2_sample])
        data_op = pd.concat([data_op, data_tmp], ignore_index=True)
    data_op = data_op.sample(frac=1)
    yhat_op = data_op.loc[:, 'yhat']
    # predictors = list(data_op.drop('yhat', axis=1).columns)
    # modelfit(l_reg, data_op, predictors, performCV=True,
    #          printFeatureImportance=True, cv_folds=5)
    data_op.drop('yhat', axis=1, inplace=True)
    data.drop('yhat', axis=1, inplace=True)

    # specify parameters and distributions to sample from
    param_dist = {
        "max_depth": [3, 10, 20, 30, 40, 50],
        "max_features": sp_randint(5, 50),
        "min_samples_split": sp_randint(2, 20),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
    }

    # run randomized search
    n_iter_search = 10
    random_search = RandomizedSearchCV(l_reg, param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=5,
                                       scoring='roc_auc', n_jobs=1)

    model = random_search.fit(data_op.values, yhat_op.values)
    report(random_search.cv_results_)
    # model = l_reg.fit(data_op.values, yhat_op.values)

    output = model.predict(data.values)
    roc = roc_auc_score(y_hat.values, output)
    print(roc)
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
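# report() is called above but not defined in this section; the sketch below
# matches the helper from the scikit-learn randomized-search example. The
# repo's actual version may differ.
import numpy as np


def report(results, n_top=3):
    # Print the top-n_top hyperparameter candidates from cv_results_.
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")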
def train_run():
    # Variant 3: gradient boosting.
    l_reg = GradientBoostingClassifier(max_features=None, subsample=0.80,
                                       random_state=0, verbose=True,
                                       learning_rate=0.05, n_estimators=200,
                                       min_samples_split=500,
                                       min_samples_leaf=500, max_depth=15)
    file_path = os.path.join(os.getcwd(), 'input_data/train_amex/train.csv')
    data = pd.read_csv(file_path)
    data, y_hat, scaler, pca = data_cleaner_train(data)
    data.loc[:, 'yhat'] = y_hat

    # Rebalance classes: a single round pairing all positives with an equally
    # sized sample of negatives.
    d1 = data.loc[data.yhat == 1, :]
    d2 = data.loc[data.yhat == 0, :]
    data_op = pd.DataFrame()
    for i in range(1):
        print(i)
        d1_sample = d1.sample(frac=1)
        d2_sample = d2.sample(n=len(d1_sample), replace=False)
        data_tmp = pd.concat([d1_sample, d2_sample])
        data_op = pd.concat([data_op, data_tmp], ignore_index=True)
    data_op = data_op.sample(frac=1)
    yhat_op = data_op.loc[:, 'yhat']
    data_op.drop('yhat', axis=1, inplace=True)
    data.drop('yhat', axis=1, inplace=True)

    # # specify parameters and distributions to sample from
    # param_dist = {'max_depth': range(10, 30, 5),
    #               'min_samples_split': range(100, 1000, 200),
    #               'n_estimators': range(100, 1000, 300),
    #               'min_samples_leaf': range(100, 1000, 200)}
    #
    # # run randomized search
    # n_iter_search = 10
    # random_search = RandomizedSearchCV(l_reg, param_distributions=param_dist,
    #                                    n_iter=n_iter_search, cv=3,
    #                                    scoring='roc_auc', n_jobs=3)
    #
    # model = random_search.fit(data_op.values, yhat_op.values)
    # report(random_search.cv_results_)

    model = l_reg.fit(data_op.values, yhat_op.values)
    output = model.predict(data.values)
    roc = roc_auc_score(y_hat.values, output)
    print('#############################################################')
    print(roc)
    joblib.dump(pca, pca_file_name)
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
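# All four variants compute roc_auc_score on the hard 0/1 labels returned by
# predict; ROC AUC is normally computed on scores, so hard labels understate
# it. A small helper sketching the probability-based alternative (assumes the
# fitted estimator exposes predict_proba, which all four classifiers here do):
def proba_auc(model, X, y_true):
    # Score with the positive-class probability rather than the 0/1 label.
    proba = model.predict_proba(X)[:, 1]
    return roc_auc_score(y_true, proba)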
def train_run():
    # Variant 4: random forest that also consumes historical user logs.
    l_reg = RandomForestClassifier(n_estimators=500, max_depth=30,
                                   random_state=0, n_jobs=7, verbose=2)
    file_path = os.path.join(os.getcwd(), 'input_data/train_amex/train.csv')
    userlogs_path = os.path.join(
        os.getcwd(), 'input_data/train_amex/historical_user_logs.csv')
    data = pd.read_csv(file_path)
    userlogs = pd.read_csv(userlogs_path)
    data, y_hat, scaler, to_keep_prod_2, is_interesting = data_cleaner_train(
        data, userlogs)
    with open(to_keep_file_name, 'wb') as f:
        pickle.dump(to_keep_prod_2, f)
    is_interesting.to_pickle(is_int_fname)
    data.loc[:, 'yhat'] = y_hat

    # Resample with replacement: 50 rounds of 10k positives paired with 10k
    # negatives.
    d1 = data.loc[data.yhat == 1, :]
    d2 = data.loc[data.yhat == 0, :]
    data_op = pd.DataFrame()
    for i in range(50):
        print(i)
        d1_sample = d1.sample(n=10000, replace=True)
        d2_sample = d2.sample(n=len(d1_sample), replace=True)
        data_tmp = pd.concat([d1_sample, d2_sample])
        data_op = pd.concat([data_op, data_tmp], ignore_index=True)
    yhat_op = data_op.loc[:, 'yhat']
    data_op.drop('yhat', axis=1, inplace=True)
    data.drop('yhat', axis=1, inplace=True)
    # Note: the original shuffled `data` here without reordering `y_hat`,
    # which breaks row alignment in the evaluation below; the shuffle is
    # dropped since row order does not affect per-row predictions.

    model = l_reg.fit(data_op.values, yhat_op.values)
    output = model.predict(data.values)
    # output[output > 0.5] = 1
    # output[output < 0.5] = 0
    roc = roc_auc_score(y_hat.values, output)
    print(roc)
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
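# For completeness, a sketch of reloading the dumped artifacts at inference
# time. predict_run_sketch and data_cleaner_test are hypothetical names; the
# filename constants are assumed to match the training dumps above.
def predict_run_sketch(test_csv):
    model = joblib.load(model_filename)
    scaler = joblib.load(scaler_filename)
    with open(to_keep_file_name, 'rb') as f:
        to_keep_prod_2 = pickle.load(f)
    is_interesting = pd.read_pickle(is_int_fname)
    data = pd.read_csv(test_csv)
    # data_cleaner_test: hypothetical test-time counterpart of
    # data_cleaner_train, reusing the fitted scaler and saved metadata.
    data = data_cleaner_test(data, scaler, to_keep_prod_2, is_interesting)
    return model.predict(data.values)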