def get_feature_importance_df(importance_type='gain'):
    """Train an XGBoost model on a CV split and return feature importances.

    Parameters
    ----------
    importance_type : str
        Metric passed through to ``Booster.get_score`` (e.g. 'gain',
        'weight', 'cover').

    Returns
    -------
    pandas.DataFrame
        Two columns, 'column_name' and 'importance', sorted ascending by
        importance with a fresh 0..n-1 index.
    """
    from xgboost_baseline import XGBoostModel

    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)
    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)
    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)
    # feature importance: build the frame directly from the score dict
    # instead of two parallel append loops; sorted() is stable, so ties
    # keep dict insertion order exactly like sort_values did.
    scores = xgbm.base_model.get_score(importance_type=importance_type)
    importance_df = pd.DataFrame(
        sorted(scores.items(), key=lambda kv: kv[1]),
        columns=['column_name', 'importance'])
    return importance_df
def run_grid():
    """Grid-search MLPRegressor hyper-parameters with 10-fold CV.

    Prints the best score and parameter set found; returns nothing.
    """
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True, standard_scaler_flag=True)
    X = drop_columns(X)
    feature_cnt = X.columns.shape[0]
    # NOTE: converted Python 2 `print` statements to print() calls for
    # consistency with the rest of the file; %-formatting keeps the exact
    # same output under both Python 2 and 3.
    print('Grid Search.')
    parameters = {
        # hidden layers: width (#features + 1), depth 1..6
        'hidden_layer_sizes': [(feature_cnt + 1, ) * n
                               for n in [1, 2, 3, 4, 5, 6]],
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'max_iter': [200, 400, 600],
        'early_stopping': [False, True]
    }
    grid = GridSearchCV(MLPRegressor(), parameters, cv=10, n_jobs=4,
                        scoring='neg_mean_squared_error')
    grid.fit(X, y)
    print('best_score_ %s' % (grid.best_score_,))
    print('best_params_ %s' % (grid.best_params_,))
def run():
    """Fit a linear regression on transformed features, report holdout MAE,
    refit on the full (clipped-target) data and write test predictions."""
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)
    # train model.
    lrm = LinearRegressionModel()
    excluded = set(
        'fips,hashottuborspa,poolcnt,pooltypeid10,assessmentyear'.split(','))
    tarlist = [col for col in X.columns if col not in excluded]
    X_trans, propdic = getTransData(X, y, tarlist)
    # holdout evaluation on the CV split.
    x_train, y_train, x_holdout, y_holdout = cu.get_cv(X_trans, y)
    lrm.train(x_train, y_train, None, None)
    holdout_pred = lrm.predict(x_holdout)
    print(abs(holdout_pred - y_holdout).mean())
    # clamp the target to [-0.1, 0.1] and refit on all rows.
    y_trans = [max(min(0.1, v), -0.1) for v in y]
    lrm.train(X_trans, y_trans, None, None)
    # read test data.
    T = cu.get_test_data(encode_non_object=True)
    T_trans = getTransTest(T, propdic)
    # predict result.
    print('Predicting.')
    y_pred = lrm.predict(T_trans[X_trans.columns].values)
    # write result.
    cu.write_result(y_pred)
    print(max(list(lrm.base_model.coef_)))
    print(min(y_pred))
def run():
    """Stack three base regressors with a linear-regression meta-model and
    write the ensembled test predictions."""
    # train / test data.
    X, y = cu.get_train_data(encode_non_object=True)
    T = cu.get_test_data(encode_non_object=True)
    # base learners feeding the stacker.
    learners = [LinearRegressionModel(), XGBoostModel(), LightGBMModel()]
    stacked = Ensemble(
        n_folds=10,
        stacker=LinearRegressionModel(),
        base_models=learners,
    )
    # ensemble result.
    print('Ensembling result.')
    y_pred = stacked.fit_predict(X, y, T[X.columns])
    # write result.
    cu.write_result(y_pred)
def run_feature_outlier():
    """Compare outlier-replacement strategies for 'taxamount'/'yearbuilt'.

    For each candidate replacement series, retrains an XGBoost model on a CV
    split and prints one CSV-ish line per (feature, strategy, best_score).
    """
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)
    # transform feature 'yearbuilt' into an age relative to 2016.
    X['yearbuilt'] = 2016 - X['yearbuilt']
    result = []
    for feature in ['taxamount', 'yearbuilt']:
        for name, newSeries in generate_feature_replace_outlier(
                X[feature]).items():
            # converted from a Python 2 print statement for consistency
            # with the print() calls used elsewhere in this file.
            print('Try to deal with feature[%s] outlier by [%s].'
                  % (feature, name))
            # get CV from train data (copy so strategies don't interfere).
            newX = X.copy()
            newX[feature] = newSeries
            X_train, y_train, X_holdout, y_holdout = cu.get_cv(newX, y)
            # train model.
            xgbm = XGBoostModel()
            xgbm.train(X_train, y_train, X_holdout, y_holdout)
            result.append([feature, name, xgbm.base_model.best_score])
    print('\n'.join(','.join(str(o) for o in one) for one in result))
def run():
    """Train XGBoost on a z-scored target, predict the test set, and write
    predictions mapped back to the original target scale."""
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)
    # standardize the target in place; keep the stats to invert later.
    mean_, std_ = y.mean(), y.std()
    y -= mean_
    y /= std_
    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)
    # train model.
    booster = XGBoostModel()
    booster.train(X_train, y_train, X_holdout, y_holdout)
    # read test data.
    T = cu.get_test_data(encode_non_object=False)
    # predict result.
    print('Predicting.')
    y_pred = booster.predict(T[X_train.columns])
    # undo the standardization on the predictions.
    y_pred *= std_
    y_pred += mean_
    # write result.
    cu.write_result(y_pred)
def run():
    """Ensemble eight base regressors under a linear-regression stacker and
    write the test predictions."""
    # train / test data.
    X, y = cu.get_train_data(encode_non_object=True)
    T = cu.get_test_data(encode_non_object=True)
    # instantiate every base learner in one pass.
    learner_classes = [
        XGBoostModel,
        LightGBMModel,
        LinearRegressionModel,
        RidgeModel,
        LassoModel,
        ElasticNetModel,
        LassoLarsModel,
        BayesianRidgeModel,
    ]
    stacked = Ensemble(stacker=LinearRegressionModel(),
                       base_models=[cls() for cls in learner_classes])
    # ensemble result.
    print('Ensembling result.')
    y_pred = stacked.fit_predict(X, y, T[X.columns])
    # write result.
    cu.write_result(y_pred)
def run_laglng_cluster():
    """Grid over DBSCAN (m_distance, min_samples) for lat/lng clustering.

    For each parameter pair, clusters coordinates, swaps in the cluster
    features, retrains XGBoost on a CV split, and prints one line per
    (m_distance, min_samples, best_score).
    """
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)
    m_distances = [1500, 500, 50]
    min_sampleses = [1, 10, 50]
    result = []
    for m_distance in m_distances:
        for min_samples in min_sampleses:
            # converted from a Python 2 print statement for consistency
            # with the print() calls used elsewhere in this file.
            print('Run DBSCAN m_distance = %d, min_samples = %d.'
                  % (m_distance, min_samples))
            newX = preprocess_raw_latlng(X)
            coordinates = get_coordinates(newX)
            dbscan = cluster_latlng(coordinates, m_distance=m_distance,
                                    min_samples=min_samples)
            centroid_dict = get_centroid_dict(dbscan, coordinates)
            newX = replace_predict_cluster_df(dbscan, centroid_dict, newX)
            # get CV from train data.
            X_train, y_train, X_holdout, y_holdout = cu.get_cv(newX, y)
            # train model.
            xgbm = XGBoostModel()
            xgbm.train(X_train, y_train, X_holdout, y_holdout)
            result.append(
                [m_distance, min_samples, xgbm.base_model.best_score])
    print('\n'.join(','.join(str(o) for o in one) for one in result))
def run():
    """Mean-encode the region id columns, train XGBoost on a CV split, and
    write test-set predictions."""
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)
    # replace high-cardinality region ids with target-mean encodings.
    print('Use MeanEncoder.')
    region_columns = ['regionidcity', 'regionidneighborhood', 'regionidzip']
    mean_encoder = MeanEncoder(categorical_features=region_columns,
                               target_type='regression')
    X = mean_encoder.fit_transform(X, pd.Series(y))
    # drop the raw categorical columns once the encoded ones exist.
    X = X.drop(mean_encoder.categorical_features, axis=1)
    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)
    # train model.
    booster = XGBoostModel()
    booster.train(X_train, y_train, X_holdout, y_holdout)
    # read test data and apply the fitted encoder.
    T = cu.get_test_data(encode_non_object=False)
    T = mean_encoder.transform(T)
    # predict result.
    print('Predicting.')
    y_pred = booster.predict(T[X_train.columns])
    # write result.
    cu.write_result(y_pred)
def gen_zero_variance_features():
    """Return the names of training columns whose variance is zero.

    Missing values are median-filled first so NaNs don't mask constancy.
    """
    X, _ = cu.get_train_data(encode_non_object=False)
    X.fillna(X.median(), inplace=True)  # IMPORTANT
    from sklearn.feature_selection import VarianceThreshold
    selector = VarianceThreshold()
    selector.fit(X)
    # pair each column name with its fitted variance and keep the constants.
    return [name
            for name, variance in zip(X.columns, selector.variances_)
            if variance == 0]
def run():
    """Fit a Bayesian ridge regression on all training data and write
    test-set predictions."""
    # train data.
    X, y = cu.get_train_data(encode_non_object=True)
    model = BayesianRidgeModel()
    model.train(X, y)
    # test data.
    T = cu.get_test_data(encode_non_object=True)
    # predict and persist.
    print('Predicting.')
    cu.write_result(model.predict(T[X.columns]))
def run():
    """Fit an MLP regressor on scaled features and write test predictions."""
    # train data, standard-scaled, with configured columns dropped.
    X, y = cu.get_train_data(encode_non_object=True,
                             standard_scaler_flag=True)
    X = drop_columns(X)
    model = MLPRegressorModel()
    model.train(X, y)
    # test data, scaled the same way; selecting X.columns applies the
    # same column drop to the test frame.
    T = cu.get_test_data(encode_non_object=True, standard_scaler_flag=True)
    # predict and persist.
    print('Predicting.')
    cu.write_result(model.predict(T[X.columns]))
def run():
    """Train XGBoost on dt-transformed features and write test predictions.

    Defines (but never calls) an inner ``gridSearch`` helper that sweeps
    per-feature bin counts and appends CV scores to ../../data/param.data.
    """
    def gridSearch():
        # Sweep bin counts a, b, c in {5, 10, ..., 50} for the features in
        # `tarlist` (closure over the outer scope).
        # NOTE(review): this helper is defined but never invoked below, and
        # it calls dt.getTransData(X, tarlist, bindic) — a different arity
        # than the outer dt.getTransData(X, y, tarlist) call; presumably an
        # older API — confirm before using.
        st,nt,step=5,51,5
        for a in range(st,nt,step):
            for b in range(st,nt,step):
                rlist = []
                for c in range(st,nt,step):
                    bindic = dict(zip(tarlist, [a, b, c]))
                    X_trans = dt.getTransData(X, tarlist, bindic)
                    # get CV from train data.
                    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X_trans, y)
                    # train model.
                    xgbm = XGBoostModel()
                    xgbm.train(X_train, y_train, X_holdout, y_holdout)
                    rlist.append([a, b, c, xgbm.base_model.best_score])
                # append this (a, b) batch of scores to the param log.
                with open('../../data/param.data','a') as outfile:
                    for vs in rlist:
                        outfile.write('\t'.join([str(v) for v in vs]) + '\n')
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)
    tarlist = X.columns
    X_trans, propdic = dt.getTransData(X, y, tarlist)
    # force every transformed column to float for the booster.
    for c in tarlist:
        X_trans[c] = X_trans[c].astype(float)
    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X_trans, y)
    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)
    # read test data and apply the same transform via `propdic`.
    T = cu.get_test_data(encode_non_object=True)
    T_trans = dt.getTransTest(T, propdic)
    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T_trans[X_train.columns])
    # write result.
    cu.write_result(y_pred)
def run():
    """Train LightGBM on a CV split and write test-set predictions."""
    # train data and CV split.
    X, y = cu.get_train_data(encode_non_object=False)
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)
    # fit the model against the holdout.
    model = LightGBMModel()
    model.train(X_train, y_train, X_holdout, y_holdout)
    # test data.
    T = cu.get_test_data(encode_non_object=False)
    # predict and persist.
    print('Predicting.')
    cu.write_result(model.predict(T[X_train.columns]))
def run_fe():
    """Drop category/bool columns, z-score the rest, and fit a linear model.

    Feature-engineering experiment: trains on all rows with no holdout.
    """
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)
    # feature utils
    from feature_utils import get_category_features, get_bool_features
    category_bool_columns = []
    category_bool_columns.extend(get_category_features())
    category_bool_columns.extend(get_bool_features())
    # converted Python 2 print statements to print() for consistency with
    # the rest of the file.
    print('Drop category & bool columns: %s' % ','.join(category_bool_columns))
    X = X.drop(category_bool_columns, axis=1)
    # hand-rolled z-scoring (equivalent to sklearn's StandardScaler with
    # the sample std). The original per-column membership check against
    # category_bool_columns was dead code: those columns were already
    # dropped above, so it could never trigger.
    print('Standard Scaler.')
    for col in X.columns:
        col_mean, col_std = X[col].mean(), X[col].std()
        X[col] = (X[col] - col_mean) / col_std
    # train model.
    lrm = LinearRegressionModel()
    lrm.train(X, y, None, None)
def run():
    """Replace 'yearbuilt'/'taxamount' outliers using train-derived bounds,
    train XGBoost on a CV split, and write test predictions."""
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)
    print('Transform, replace feature outliers.')
    # convert build year to an age relative to 2016.
    X['yearbuilt'] = 2016 - X['yearbuilt']
    # bounds/statistics computed on TRAIN only, reused for the test set.
    yb_low, yb_high = get_series_percentile(X['yearbuilt'])
    yb_median = X['yearbuilt'].median()
    tax_q1, tax_q3 = get_series_q1q3(X['taxamount'])
    X['yearbuilt'] = replace_with_value(X['yearbuilt'], yb_low, yb_high,
                                        yb_median)
    X['taxamount'] = replace_with_iqr_boundary(X['taxamount'], tax_q1,
                                               tax_q3)
    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)
    # train model.
    booster = XGBoostModel()
    booster.train(X_train, y_train, X_holdout, y_holdout)
    # read test data and mirror the train-time transforms.
    T = cu.get_test_data(encode_non_object=False)
    T['yearbuilt'] = 2016 - T['yearbuilt']
    T['yearbuilt'] = replace_with_value(T['yearbuilt'], yb_low, yb_high,
                                        yb_median)
    T['taxamount'] = replace_with_iqr_boundary(T['taxamount'], tax_q1,
                                               tax_q3)
    # predict result.
    print('Predicting.')
    y_pred = booster.predict(T[X_train.columns])
    # write result.
    cu.write_result(y_pred)
} watchlist = [(d_train, 'train'), (d_valid, 'valid')] model = xgb.train(params, d_train, 500, watchlist, early_stopping_rounds=100, verbose_eval=1000) best = model.best_score return best # read train data. X, y = cu.get_train_data(encode_non_object=True) tarlist = X.columns X_trans, propdic = dt.getTransData(X, y, tarlist) for c in tarlist: X_trans[c] = X_trans[c].astype(float) # get CV from train data. rlist = [] tarcols = [ 'calculatedfinishedsquarefeet', ] fold = 10 for i in range(10): best = runTrain(X, y, fold, i, tarcols) rlist.append([i, best])
X.fillna(X.median(), inplace=True) # IMPORTANT from sklearn.feature_selection import VarianceThreshold selector = VarianceThreshold() selector.fit(X) zero_variance_columns = [ col for i, col in enumerate(X.columns) if selector.variances_[i] == 0 ] return zero_variance_columns if __name__ == '__main__': X, y = cu.get_train_data(encode_non_object=False) X = fillna_zero(X) print X.shape # feature importance print 'Generate feature importance.' print get_feature_importance_df() # missing rate print 'Missing rate.' missing_df = get_feature_missing_df(X) print missing_df print 'Missing rate >= 0.90' print get_features_by_missing_rate(missing_df, 0.90)