def rf_model(read_csv=True):
    """Tune a RandomForestClassifier with randomized hyperparameter search.

    Loads the train/test frames (including session features), fits a
    RandomizedSearchCV over a small random-forest parameter space scored
    with the competition NDCG scorer, and reports the search results.

    Parameters
    ----------
    read_csv : bool
        Passed through as ``cache`` to ``get_train_test_data`` — when True,
        cached CSV data is used instead of being rebuilt.

    Returns
    -------
    None
    """
    print('rf_model')
    # test_df is part of the loader's return contract; it is not used here.
    train_df, test_df = get_train_test_data(cache=read_csv,
                                            include_sessions=True)
    cols = [i for i in train_df.columns if i not in EXCLUDE_COLS]
    X = train_df[cols]
    y = train_df['country_destination']
    clf = RandomForestClassifier(n_estimators=20)

    # Randomized-search space: sp_randint draws integers uniformly
    # from [1, 11) for each sampled candidate.
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  "min_samples_split": sp_randint(1, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False]}
    n_iter_search = 20
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       scoring=ndcg_scorer)
    start = time()
    random_search.fit(X, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
    return
def xgb_model(read_csv=True): print 'xgb_model randomcv yr > 2013' train_df, test_df = get_train_test_data(cache=read_csv, include_sessions=False) train_df = train_df[train_df['tfa_year'] > 2013] cols = [i for i in train_df.columns if i not in EXCLUDE_COLS] X = train_df[cols] y = train_df['country_destination'] #start classifier bst = xgb.XGBClassifier(nthread=4) # bst = xgb.XGBClassifier(max_depth=2, nthread=4, # n_estimators=50,subsample=0.4,learning_rate=0.0.05) train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=10000) # bst.fit(test_X, test_y) param_dist = {"max_depth": [2,4,6], "learning_rate": [0.05, 0.1, 0.15, 0.2], "n_estimators": [30, 50, 70], # "min_samples_leaf": sp_randint(1, 11), # "min_samples_split": sp_randint(1,11), 'subsample': [0.4, 0.5, 0.6] # 'max_features': [20, 50, 100] } n_iter_search = 20 random_search = RandomizedSearchCV(bst, param_distributions=param_dist, n_iter=n_iter_search, scoring=ndcg_scorer) start = time() random_search.fit(test_X, test_y) print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." 
% ((time() - start), n_iter_search)) report(random_search.grid_scores_) # bagging # clfbag = BaggingClassifier(bst, n_estimators=5, max_samples=5000) # clfbag.fit(train_X, train_y) # y_pred = clfbag.predict_proba(test_X) # print 'predicted prob' # score = ndcg_score(test_y, y_pred) # apply learned model # bst.fit(X.values, y) # py.test.set_trace() test_data = test_df[[i for i in test_df.columns if i not in EXCLUDE_COLS]] y_pred = random_search.predict_proba(test_data) py.test.set_trace() # kaggle_test = pd.read_csv('test.csv') sub = create_kaggle_submission(y_pred, test_df['id'], 0.841) py.test.set_trace() # print 'created kaggle sub' # kf = KFold(len(X), n_folds=10, random_state=42) # score = cross_val_score(bst, X, y, cv=kf, scoring=ndcg_scorer) # param_dist = {"max_depth": [3, None], # "max_features": sp_randint(1, 11), # "min_samples_split": sp_randint(1, 11), # "min_samples_leaf": sp_randint(1, 11), # "bootstrap": [True, False], # "criterion": ["gini", "entropy"]} # n_iter_search = 20 # random_search = RandomizedSearchCV(clf, param_distributions=param_dist, # n_iter=n_iter_search, scoring=ndcg_scorer) # start = time() # random_search.fit(X, y) # print("RandomizedSearchCV took %.2f seconds for %d candidates" # " parameter settings." % ((time() - start), n_iter_search)) # report(random_search.grid_scores_) # py.test.set_trace() # sub = create_kaggle_submission(y_pred, test_df['id'], np.mean(score)) # py.test.set_trace() #end classifier """ trying cross valid