def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)

    # train model.
    lrm = LinearRegressionModel()
    # transform all columns except a few categorical/constant ones.
    tarlist = [
        c for c in X.columns
        if c not in 'fips,hashottuborspa,poolcnt,pooltypeid10,assessmentyear'.split(',')
    ]
    X_trans, propdic = getTransData(X, y, tarlist)

    # score the model on a holdout split.
    x_train, y_train, x_holdout, y_holdout = cu.get_cv(X_trans, y)
    lrm.train(x_train, y_train, None, None)
    y_pred = lrm.predict(x_holdout)
    score = abs(y_pred - y_holdout).mean()
    print(score)

    # retrain on all data with the target clipped to [-0.1, 0.1].
    y_trans = [max(min(0.1, v), -0.1) for v in y]
    lrm.train(X_trans, y_trans, None, None)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)
    T_trans = getTransTest(T, propdic)

    # predict result.
    print('Predicting.')
    y_pred = lrm.predict(T_trans[X_trans.columns].values)

    # write result.
    cu.write_result(y_pred)
    print(max(list(lrm.base_model.coef_)))
    print(min(y_pred))
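# The model wrappers used throughout this repo (LinearRegressionModel, RidgeModel, ...)
# are not shown in this section. A minimal sketch of the interface they appear to expose
# (train with optional eval data, predict, and a `base_model` attribute) might look like
# the following; this is an illustrative assumption, not the repo's actual implementation.
from sklearn.linear_model import LinearRegression


class LinearRegressionModelSketch:
    """Thin wrapper exposing train/predict around sklearn's LinearRegression."""

    def __init__(self):
        self.base_model = LinearRegression()

    def train(self, x_train, y_train, x_eval=None, y_eval=None):
        # linear regression has no early stopping, so any eval data is ignored.
        self.base_model.fit(x_train, y_train)

    def predict(self, x):
        return self.base_model.predict(x)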
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)

    # create base models.
    base_models = [
        LinearRegressionModel(),
        XGBoostModel(),
        LightGBMModel(),
    ]

    # setup ensemble parameters.
    ensemble = Ensemble(
        n_folds=10,
        stacker=LinearRegressionModel(),
        base_models=base_models,
    )

    # ensemble result.
    print('Ensembling result.')
    y_pred = ensemble.fit_predict(X, y, T[X.columns])

    # write result.
    cu.write_result(y_pred)
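# The Ensemble class itself is not shown here. Below is a minimal sketch of the standard
# out-of-fold stacking scheme that its constructor arguments (n_folds, stacker,
# base_models) and the fit_predict call suggest; the real implementation may differ,
# and the n_folds default is a placeholder.
import numpy as np
from sklearn.model_selection import KFold


class EnsembleSketch:
    def __init__(self, n_folds=5, stacker=None, base_models=None):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X, y, T = np.asarray(X), np.asarray(y), np.asarray(T)
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=0)
        # out-of-fold predictions become the stacker's training features.
        s_train = np.zeros((X.shape[0], len(self.base_models)))
        s_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            fold_preds = np.zeros((T.shape[0], self.n_folds))
            for j, (train_idx, oof_idx) in enumerate(kf.split(X)):
                model.train(X[train_idx], y[train_idx], X[oof_idx], y[oof_idx])
                s_train[oof_idx, i] = model.predict(X[oof_idx])
                fold_preds[:, j] = model.predict(T)
            # average each base model's test predictions across folds.
            s_test[:, i] = fold_preds.mean(axis=1)
        self.stacker.train(s_train, y, None, None)
        return self.stacker.predict(s_test)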
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)

    # create base models.
    base_models = [
        XGBoostModel(),
        LightGBMModel(),
        LinearRegressionModel(),
        RidgeModel(),
        LassoModel(),
        ElasticNetModel(),
        LassoLarsModel(),
        BayesianRidgeModel(),
    ]

    # setup ensemble parameters.
    ensemble = Ensemble(stacker=LinearRegressionModel(), base_models=base_models)

    # ensemble result.
    print('Ensembling result.')
    y_pred = ensemble.fit_predict(X, y, T[X.columns])

    # write result.
    cu.write_result(y_pred)
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # standardize the target; the inverse transform is applied after prediction.
    y_mean, y_std = y.mean(), y.std()
    y -= y_mean
    y /= y_std

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)

    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T[X_train.columns])
    # undo the target standardization.
    y_pred *= y_std
    y_pred += y_mean

    # write result.
    cu.write_result(y_pred)
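# cu.get_cv is used throughout but not defined in this section. Judging by how its
# return values are used (a train portion plus a holdout portion for scoring and early
# stopping), it is assumed to be a simple split along these lines; the actual split
# ratio and shuffling in the repo's common utils may differ.
from sklearn.model_selection import train_test_split


def get_cv_sketch(X, y, holdout_frac=0.2, seed=0):
    """Split features/target into a train part and a holdout part."""
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=holdout_frac, random_state=seed
    )
    return X_train, y_train, X_holdout, y_holdout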
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # MeanEncoder.
    print('Use MeanEncoder.')
    mean_encoder = MeanEncoder(
        categorical_features=['regionidcity', 'regionidneighborhood', 'regionidzip'],
        target_type='regression',
    )
    X = mean_encoder.fit_transform(X, pd.Series(y))
    X = X.drop(mean_encoder.categorical_features, axis=1)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)
    T = mean_encoder.transform(T)

    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T[X_train.columns])

    # write result.
    cu.write_result(y_pred)
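# MeanEncoder is not defined in this section. A minimal sketch of plain target (mean)
# encoding is shown below to illustrate the idea; the actual MeanEncoder used here very
# likely adds smoothing and out-of-fold fitting to reduce target leakage.
class MeanEncoderSketch:
    def __init__(self, categorical_features):
        self.categorical_features = categorical_features
        self.maps = {}
        self.global_mean = None

    def fit_transform(self, X, y):
        # y is expected to be a pandas Series aligned with X.
        self.global_mean = y.mean()
        X = X.copy()
        for c in self.categorical_features:
            # map each category to the mean target observed for it.
            self.maps[c] = y.groupby(X[c]).mean()
            X[c + '_mean'] = X[c].map(self.maps[c]).fillna(self.global_mean)
        return X

    def transform(self, T):
        T = T.copy()
        for c in self.categorical_features:
            # unseen categories fall back to the global target mean.
            T[c + '_mean'] = T[c].map(self.maps[c]).fillna(self.global_mean)
        return T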
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)

    # train model.
    lrm = BayesianRidgeModel()
    lrm.train(X, y)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)

    # predict result.
    print('Predicting.')
    y_pred = lrm.predict(T[X.columns])

    # write result.
    cu.write_result(y_pred)
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True, standard_scaler_flag=True)
    X = drop_columns(X)

    # train model.
    lrm = MLPRegressorModel()
    lrm.train(X, y)

    # read test data.
    T = cu.get_test_data(encode_non_object=True, standard_scaler_flag=True)

    # predict result.
    print('Predicting.')
    y_pred = lrm.predict(T[X.columns])

    # write result.
    cu.write_result(y_pred)
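# standard_scaler_flag is assumed to standardize the numeric features inside
# cu.get_train_data / cu.get_test_data, which neural-network models such as an MLP
# regressor usually need. A sketch of that step, under that assumption; the flag's
# actual behavior in the repo may differ.
from sklearn.preprocessing import StandardScaler


def standardize_features_sketch(X_train_df, X_test_df):
    """Fit a scaler on the train features and apply it to both frames."""
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_df)
    X_test_scaled = scaler.transform(X_test_df)
    return X_train_scaled, X_test_scaled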
def run():
    def gridSearch():
        # sweep candidate bin counts and append each (a, b, c, best_score) row to param.data.
        st, nt, step = 5, 51, 5
        for a in range(st, nt, step):
            for b in range(st, nt, step):
                rlist = []
                for c in range(st, nt, step):
                    bindic = dict(zip(tarlist, [a, b, c]))
                    X_trans = dt.getTransData(X, tarlist, bindic)
                    # get CV from train data.
                    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X_trans, y)
                    # train model.
                    xgbm = XGBoostModel()
                    xgbm.train(X_train, y_train, X_holdout, y_holdout)
                    rlist.append([a, b, c, xgbm.base_model.best_score])
                with open('../../data/param.data', 'a') as outfile:
                    for vs in rlist:
                        outfile.write('\t'.join([str(v) for v in vs]) + '\n')

    # note: gridSearch is defined above but not called in this run.

    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)
    tarlist = X.columns
    X_trans, propdic = dt.getTransData(X, y, tarlist)
    for c in tarlist:
        X_trans[c] = X_trans[c].astype(float)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X_trans, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)
    T_trans = dt.getTransTest(T, propdic)

    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T_trans[X_train.columns])

    # write result.
    cu.write_result(y_pred)
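# Helper sketch for consuming the grid-search log written above: param.data holds
# tab-separated rows of (a, b, c, best_score). Assuming the score is an error metric
# where lower is better (e.g. the XGBoost eval MAE), the best bin combination can be
# read back like this; the file path mirrors the one used above.
def best_bin_params_sketch(path='../../data/param.data'):
    best = None
    with open(path) as infile:
        for line in infile:
            a, b, c, score = line.strip().split('\t')
            row = (float(score), int(a), int(b), int(c))
            if best is None or row[0] < best[0]:
                best = row
    return best  # (score, a, b, c)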
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    lgbmm = LightGBMModel()
    lgbmm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)

    # predict result.
    print('Predicting.')
    y_pred = lgbmm.predict(T[X_train.columns])

    # write result.
    cu.write_result(y_pred)
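# The LightGBMModel wrapper is not shown in this section. A plausible sketch matching
# its usage above (holdout data passed for early stopping) is given below, using the
# callback-based early stopping of recent LightGBM releases; the objective, number of
# estimators, and stopping rounds are placeholders, not the repo's actual settings.
import lightgbm as lgb


class LightGBMModelSketch:
    def __init__(self):
        self.base_model = lgb.LGBMRegressor(objective='regression_l1', n_estimators=1000)

    def train(self, x_train, y_train, x_eval=None, y_eval=None):
        if x_eval is not None:
            # stop adding trees once the holdout score stops improving.
            self.base_model.fit(
                x_train, y_train,
                eval_set=[(x_eval, y_eval)],
                callbacks=[lgb.early_stopping(stopping_rounds=100)],
            )
        else:
            self.base_model.fit(x_train, y_train)

    def predict(self, x):
        return self.base_model.predict(x)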
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    print('Transform, replace feature outliers.')
    # convert build year to building age, then cap outliers.
    X['yearbuilt'] = 2016 - X['yearbuilt']
    yearbuilt_llimit, yearbuilt_ulimit = get_series_percentile(X['yearbuilt'])
    yearbuilt_median = X['yearbuilt'].median()
    taxamount_q1, taxamount_q3 = get_series_q1q3(X['taxamount'])
    X['yearbuilt'] = replace_with_value(
        X['yearbuilt'], yearbuilt_llimit, yearbuilt_ulimit, yearbuilt_median
    )
    X['taxamount'] = replace_with_iqr_boundary(X['taxamount'], taxamount_q1, taxamount_q3)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data, applying the same outlier treatment with the train statistics.
    T = cu.get_test_data(encode_non_object=False)
    T['yearbuilt'] = 2016 - T['yearbuilt']
    T['yearbuilt'] = replace_with_value(
        T['yearbuilt'], yearbuilt_llimit, yearbuilt_ulimit, yearbuilt_median
    )
    T['taxamount'] = replace_with_iqr_boundary(T['taxamount'], taxamount_q1, taxamount_q3)

    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T[X_train.columns])

    # write result.
    cu.write_result(y_pred)
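# The outlier helpers above are not defined in this section. Minimal sketches consistent
# with how they are called (percentile limits with a median replacement, and IQR-based
# capping) are given below; the exact percentiles and the 1.5 multiplier are assumptions.
import numpy as np


def get_series_percentile_sketch(s, lower=1, upper=99):
    """Return assumed lower/upper percentile limits of a series."""
    return np.nanpercentile(s, lower), np.nanpercentile(s, upper)


def get_series_q1q3_sketch(s):
    """Return the first and third quartiles of a series."""
    return np.nanpercentile(s, 25), np.nanpercentile(s, 75)


def replace_with_value_sketch(s, llimit, ulimit, value):
    """Replace values outside [llimit, ulimit] with a fixed value (e.g. the median)."""
    return s.where((s >= llimit) & (s <= ulimit), value)


def replace_with_iqr_boundary_sketch(s, q1, q3, k=1.5):
    """Clip values to the Tukey fences q1 - k*IQR and q3 + k*IQR."""
    iqr = q3 - q1
    return s.clip(lower=q1 - k * iqr, upper=q3 + k * iqr)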
# 0.052303568738
# for a mean absolute error regression problem.
model.compile(optimizer='rmsprop', loss='mae')
# clip the target to [-0.1, 0.1] before fitting.
y_train_trim = [max(min(0.1, v), -0.1) for v in y_train]
y_train_trim = np.array(y_train_trim)
model.fit(x_train.values, y_train_trim, epochs=5, batch_size=32)
y_pred = model.predict(x_holdout.values)
y_pred = y_pred[:, 0]
score = abs(y_pred - y_holdout).mean()
print(score)

# read test data.
T = cu.get_test_data(encode_non_object=True)
T_trans = getTransTest(T, propdic)

# predict result.
print('Predicting.')
# flatten the Keras output to a 1-D vector, as with the holdout predictions above.
y_pred = model.predict(T_trans[X_trans.columns].values)[:, 0]

# write result.
cu.write_result(y_pred)

# leftover debugging snippet for inspecting a slice of the transformed test data:
#tmpdf = getTransTest(T.iloc[-100:, :], propdic)
#print(tmpdf.shape)
#for c in tmpdf.columns:
#    print(c)
#    print(tmpdf[c].value_counts())
#tmpy = model.predict(tmpdf[X_trans.columns].values)
#print(tmpy)
#print(tmpdf[X_trans.columns].values)
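# The Keras `model` compiled above is not defined in this fragment. A minimal Sequential
# MLP that would fit the surrounding calls (compile with an 'mae' loss, fit on
# x_train.values, single regression output) is sketched here using the tf.keras API;
# the layer sizes are placeholders, and the original may have used standalone Keras.
from tensorflow import keras
from tensorflow.keras import layers


def build_model_sketch(n_features):
    model = keras.Sequential([
        keras.Input(shape=(n_features,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),  # single continuous output for the log-error target.
    ])
    return model


# example usage: model = build_model_sketch(x_train.shape[1])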