Пример #1
0
def run():
    """Fit a linear regression on transformed features and write test predictions.

    Steps, in order:
      1. Load the training data and transform every column except a fixed
         exclusion list through ``getTransData``.
      2. Train on the CV training split and print the mean absolute error
         measured on the holdout split.
      3. Retrain on all transformed rows with the target clipped to
         [-0.1, 0.1], predict on the transformed test set and write the result.
      4. Print the maximum of ``lrm.base_model.coef_`` and the smallest
         prediction.
    """
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)
    # train model.
    lrm = LinearRegressionModel()

    # Columns that are not transformed; split once instead of once per column.
    excluded = set(
        'fips,hashottuborspa,poolcnt,pooltypeid10,assessmentyear'.split(','))
    tarlist = [c for c in X.columns if c not in excluded]

    X_trans, propdic = getTransData(X, y, tarlist)
    x_train, y_train, x_holdout, y_holdout = cu.get_cv(X_trans, y)
    lrm.train(x_train, y_train, None, None)
    y_pred = lrm.predict(x_holdout)

    # Holdout mean absolute error.
    score = abs(y_pred - y_holdout).mean()
    print(score)

    # Clip the target to [-0.1, 0.1] before the final fit on all rows.
    y_trans = [max(min(0.1, v), -0.1) for v in y]
    lrm.train(X_trans, y_trans, None, None)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)
    T_trans = getTransTest(T, propdic)
    # predict result.
    print('Predicting.')
    # Same column order as the training frame, passed as a plain array.
    y_pred = lrm.predict(T_trans[X_trans.columns].values)

    # write result.
    cu.write_result(y_pred)
    print(max(lrm.base_model.coef_))
    print(min(y_pred))
Пример #2
0
def run():
    """Combine three base models through ``Ensemble`` and write the predictions.

    Loads the train and test data, builds an ``Ensemble`` with ``n_folds=10``
    and a ``LinearRegressionModel`` stacker over a linear regression, an
    XGBoost and a LightGBM model, then writes the output of ``fit_predict``
    through ``cu.write_result``.
    """
    # Load the training data first, then the test data.
    X, y = cu.get_train_data(encode_non_object=True)
    T = cu.get_test_data(encode_non_object=True)

    # Base models handed to the ensemble.
    level_one = [LinearRegressionModel(), XGBoostModel(), LightGBMModel()]

    # Ensemble configuration.
    stacking = Ensemble(n_folds=10,
                        stacker=LinearRegressionModel(),
                        base_models=level_one)

    print('Ensembling result.')
    # The test frame is restricted to the training columns.
    predictions = stacking.fit_predict(X, y, T[X.columns])

    cu.write_result(predictions)
def run():
    """Combine eight base models through ``Ensemble`` and write the predictions.

    Loads the train and test data, builds an ``Ensemble`` with a
    ``LinearRegressionModel`` stacker over the eight base models listed
    below, then writes the output of ``fit_predict`` through
    ``cu.write_result``.
    """
    # Load the training data first, then the test data.
    X, y = cu.get_train_data(encode_non_object=True)
    T = cu.get_test_data(encode_non_object=True)

    # Base models handed to the ensemble, instantiated in this order.
    level_one = [
        XGBoostModel(),
        LightGBMModel(),
        LinearRegressionModel(),
        RidgeModel(),
        LassoModel(),
        ElasticNetModel(),
        LassoLarsModel(),
        BayesianRidgeModel(),
    ]

    # Ensemble configuration; only the stacker and base models are passed.
    stacking = Ensemble(stacker=LinearRegressionModel(), base_models=level_one)

    print('Ensembling result.')
    # The test frame is restricted to the training columns.
    predictions = stacking.fit_predict(X, y, T[X.columns])

    cu.write_result(predictions)
def run():
    """Train XGBoost on a standardized target and write rescaled predictions.

    The target is shifted by its mean and divided by its standard deviation
    before training; test predictions are mapped back with the same two
    statistics before being written.
    """
    features, target = cu.get_train_data(encode_non_object=False)

    # Standardize the target in place and keep the statistics to undo it later.
    target_mean, target_std = target.mean(), target.std()
    target -= target_mean
    target /= target_std

    # Split into training and holdout parts.
    train_x, train_y, holdout_x, holdout_y = cu.get_cv(features, target)

    booster = XGBoostModel()
    booster.train(train_x, train_y, holdout_x, holdout_y)

    test_frame = cu.get_test_data(encode_non_object=False)

    print('Predicting.')
    # Predict on the training columns only, then undo the standardization.
    predictions = booster.predict(test_frame[train_x.columns])
    predictions *= target_std
    predictions += target_mean

    cu.write_result(predictions)
def run():
    """Mean-encode three region columns, train XGBoost and write predictions.

    The encoder is fitted on the training frame and target, the encoder's
    ``categorical_features`` columns are then dropped from the training frame,
    and the same fitted encoder transforms the test frame before prediction.
    """
    features, target = cu.get_train_data(encode_non_object=False)

    print('Use MeanEncoder.')
    region_columns = ['regionidcity', 'regionidneighborhood', 'regionidzip']
    encoder = MeanEncoder(categorical_features=region_columns,
                          target_type='regression')

    # Fit on the training data, then drop the columns the encoder reports.
    features = encoder.fit_transform(features, pd.Series(target))
    features = features.drop(encoder.categorical_features, axis=1)

    # Split into training and holdout parts.
    train_x, train_y, holdout_x, holdout_y = cu.get_cv(features, target)

    booster = XGBoostModel()
    booster.train(train_x, train_y, holdout_x, holdout_y)

    # Apply the already-fitted encoder to the test frame.
    test_frame = cu.get_test_data(encode_non_object=False)
    test_frame = encoder.transform(test_frame)

    print('Predicting.')
    # Selecting the training columns leaves out anything not used in training.
    predictions = booster.predict(test_frame[train_x.columns])

    cu.write_result(predictions)
def run():
    """Train a ``BayesianRidgeModel`` on the full training set and write predictions."""
    features, target = cu.get_train_data(encode_non_object=True)

    # Fit on all training rows; no holdout split is used here.
    regressor = BayesianRidgeModel()
    regressor.train(features, target)

    test_frame = cu.get_test_data(encode_non_object=True)

    print('Predicting.')
    # Predict on the same columns the model was trained on.
    predictions = regressor.predict(test_frame[features.columns])

    cu.write_result(predictions)
Пример #7
0
def run():
    """Train an ``MLPRegressorModel`` on the training data and write predictions.

    Train and test data are both requested with ``standard_scaler_flag=True``;
    the training frame is additionally passed through ``drop_columns``.
    """
    features, target = cu.get_train_data(encode_non_object=True,
                                         standard_scaler_flag=True)
    features = drop_columns(features)

    # Fit on all remaining training columns; no holdout split is used here.
    regressor = MLPRegressorModel()
    regressor.train(features, target)

    test_frame = cu.get_test_data(encode_non_object=True,
                                  standard_scaler_flag=True)

    print('Predicting.')
    # Selecting the training columns applies the same column set to the test frame.
    predictions = regressor.predict(test_frame[features.columns])

    cu.write_result(predictions)
Пример #8
0
def run():
    """Train XGBoost on ``dt.getTransData`` output and write test predictions.

    Every training column is transformed, cast to float, split into training
    and holdout parts and used to train the model. The test frame goes through
    ``dt.getTransTest`` with the properties returned by the training
    transformation.
    """
    def gridSearch():
        # NOTE(review): this helper is defined but never called in this
        # function, and it calls dt.getTransData with (X, tarlist, bindic)
        # and a single return value, unlike the (X, y, tarlist) call below
        # that unpacks two values -- confirm the intended signature before use.
        start, stop, stride = 5, 51, 5
        candidates = range(start, stop, stride)
        for a in candidates:
            for b in candidates:
                rows = []
                for c in candidates:
                    bindic = dict(zip(tarlist, (a, b, c)))
                    grid_frame = dt.getTransData(X, tarlist, bindic)
                    # Split into training and holdout parts.
                    g_train_x, g_train_y, g_hold_x, g_hold_y = cu.get_cv(
                        grid_frame, y)

                    # Train and record the best score for this combination.
                    grid_model = XGBoostModel()
                    grid_model.train(g_train_x, g_train_y, g_hold_x, g_hold_y)
                    rows.append([a, b, c, grid_model.base_model.best_score])

                # Append one tab-separated line per combination of this (a, b).
                with open('../../data/param.data', 'a') as outfile:
                    outfile.writelines(
                        '\t'.join(str(v) for v in row) + '\n' for row in rows)

    # Load the training data and transform every column.
    X, y = cu.get_train_data(encode_non_object=True)
    tarlist = X.columns
    transformed, propdic = dt.getTransData(X, y, tarlist)

    # Cast each transformed column to float.
    for column in tarlist:
        transformed[column] = transformed[column].astype(float)

    # Split into training and holdout parts.
    train_x, train_y, holdout_x, holdout_y = cu.get_cv(transformed, y)

    booster = XGBoostModel()
    booster.train(train_x, train_y, holdout_x, holdout_y)

    # Transform the test frame with the properties from the training step.
    test_frame = cu.get_test_data(encode_non_object=True)
    test_transformed = dt.getTransTest(test_frame, propdic)

    print('Predicting.')
    predictions = booster.predict(test_transformed[train_x.columns])

    cu.write_result(predictions)
def run():
    """Train a ``LightGBMModel`` with a holdout split and write test predictions."""
    features, target = cu.get_train_data(encode_non_object=False)

    # Split into training and holdout parts; both are passed to train().
    train_x, train_y, holdout_x, holdout_y = cu.get_cv(features, target)

    booster = LightGBMModel()
    booster.train(train_x, train_y, holdout_x, holdout_y)

    test_frame = cu.get_test_data(encode_non_object=False)

    print('Predicting.')
    # Predict on the same columns the model was trained on.
    predictions = booster.predict(test_frame[train_x.columns])

    cu.write_result(predictions)
def run():
    """Replace outliers in two features, train XGBoost and write predictions.

    ``yearbuilt`` is rewritten as ``2016 - yearbuilt`` and passed through
    ``replace_with_value``; ``taxamount`` is passed through
    ``replace_with_iqr_boundary``. The limits, median and quartiles are
    computed on the training frame only and reused unchanged on the test frame.
    """
    features, target = cu.get_train_data(encode_non_object=False)

    print('Transform, replace feature outliers.')
    features['yearbuilt'] = 2016 - features['yearbuilt']

    # Statistics taken from the training data; reused for the test data below.
    year_low, year_high = get_series_percentile(features['yearbuilt'])
    year_median = features['yearbuilt'].median()
    tax_q1, tax_q3 = get_series_q1q3(features['taxamount'])

    features['yearbuilt'] = replace_with_value(
        features['yearbuilt'], year_low, year_high, year_median)
    features['taxamount'] = replace_with_iqr_boundary(
        features['taxamount'], tax_q1, tax_q3)

    # Split into training and holdout parts.
    train_x, train_y, holdout_x, holdout_y = cu.get_cv(features, target)

    booster = XGBoostModel()
    booster.train(train_x, train_y, holdout_x, holdout_y)

    # Apply the same transformation to the test frame with the training statistics.
    test_frame = cu.get_test_data(encode_non_object=False)
    test_frame['yearbuilt'] = 2016 - test_frame['yearbuilt']
    test_frame['yearbuilt'] = replace_with_value(
        test_frame['yearbuilt'], year_low, year_high, year_median)
    test_frame['taxamount'] = replace_with_iqr_boundary(
        test_frame['taxamount'], tax_q1, tax_q3)

    print('Predicting.')
    predictions = booster.predict(test_frame[train_x.columns])

    cu.write_result(predictions)
Пример #11
0
# 0.052303568738
# NOTE(review): the number above is presumably a holdout score from an earlier
# run -- confirm.
# Compile the model with the 'rmsprop' optimizer and mean absolute error loss.
model.compile(optimizer='rmsprop', loss='mae')

# Clip the training target to [-0.1, 0.1] before fitting.
y_train_trim = np.array([max(min(0.1, v), -0.1) for v in y_train])
model.fit(x_train.values, y_train_trim, epochs=5, batch_size=32)

# Holdout predictions; only the first output column is kept.
y_pred = model.predict(x_holdout.values)[:, 0]

# Mean absolute error on the holdout split.
score = abs(y_pred - y_holdout).mean()
print(score)

# Read the test data and transform it with the training properties.
T = cu.get_test_data(encode_non_object=True)
T_trans = getTransTest(T, propdic)

print('Predicting.')
# NOTE(review): unlike the holdout predictions above, this result is not
# reduced with [:, 0] before being written -- confirm cu.write_result accepts
# that shape.
y_pred = model.predict(T_trans[X_trans.columns].values)

# Write the result.
cu.write_result(y_pred)
#tmpdf = getTransTest(T.iloc[-100:,:], propdic)
#print(tmpdf.shape)
#for c in tmpdf.columns:
#    print(c)
#    print(tmpdf[c].value_counts())
#tmpy = model.predict(tmpdf[X_trans.columns].values)
#print(tmpy)
#print(tmpdf[X_trans.columns].values)