Example #1
File: features.py Project: chechir/million
def bad_model(df, train_ixs, feats):
    train = df.iloc[train_ixs, :]
    model = LinearRegression(fit_intercept=False,
                             normalize=True,
                             copy_X=True,
                             n_jobs=-1)
    model.fit(train[feats], train['logerror'])
    print(nptools.get_mae_loss(train['logerror'].values,
                               model.predict(train[feats])))
    return model.predict(df[feats])
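nptools.get_mae_loss (elsewhere tools.get_mae_loss) is a project helper used throughout these examples; from its use it evidently computes mean absolute error, which is symmetric in its two arguments. A minimal sketch of the presumed behaviour:

import numpy as np

def get_mae_loss(targets, preds):
    # Presumed reimplementation of the project's MAE helper:
    # mean absolute error between targets and predictions.
    return np.mean(np.abs(np.asarray(targets) - np.asarray(preds)))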
Example #2
def optim_func(weights, preds, targets):
    final_prediction = ktools.ensemble_preds(preds, weights)
    score = 1000000 * tools.get_mae_loss(final_prediction, targets)
    return score
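ktools.ensemble_preds is not shown in these snippets; given that optim_func pairs a weight vector with a list of model predictions, it most plausibly computes a weighted average. The 1000000 factor above only rescales the objective and does not move the optimum. A sketch under that assumption (whether the original normalises by the weight sum is a guess):

import numpy as np

def ensemble_preds(preds, weights):
    # Hypothetical stand-in: weighted average of the per-model predictions.
    # preds is a list of equal-length 1-D arrays; weights aligns with it.
    stacked = np.vstack(preds)                 # shape: (n_models, n_samples)
    weights = np.asarray(weights, dtype=float)
    return weights @ stacked / weights.sum()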
Example #3
        cache_dir + 'ps2_test_2ndx{}_f{}.pkl'.format(n_models, n_folds))

    print('XGBoost... ')
    params = model_params.get_lvl2()
    dtrain = xgb.DMatrix(new_train.values, train_targets)
    dtest = xgb.DMatrix(new_test.values)
    preds_train_xgb = np.zeros(len(new_train))
    preds_test_xgb = np.zeros(len(new_test))
    n_bags = 5
    for i in range(n_bags):
        model = xgb.train(params, dtrain, num_boost_round=300, verbose_eval=2)
        preds_train_xgb += model.predict(dtrain)
        preds_test_xgb += model.predict(dtest)
    preds_train_xgb /= n_bags
    preds_test_xgb /= n_bags
    score = tools.get_mae_loss(train_targets, preds_train_xgb)
    print('train score:{}'.format(score))

    #  ############Keras
    print('nnet... ')
    x_train = new_train.values
    x_test = new_test.values
    model = model_params.get_lvl2nn(x_train.shape[1])
    batch_size = 256
    epochs = 10
    history = model.fit(x_train,
                        train_targets,
                        nb_epoch=epochs,  # Keras 1 argument name; Keras 2 uses epochs=
                        batch_size=batch_size)
    model.history = history
    preds_train_nn = model.predict(x_train).squeeze()
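model_params.get_lvl2nn is project code not shown here; for a second-level stacker it presumably builds a small regression network over the level-1 predictions. A hypothetical sketch in Keras 2 syntax (layer sizes, dropout rate, and optimizer are assumptions):

from keras.models import Sequential
from keras.layers import Dense, Dropout

def get_lvl2nn(input_dim):
    # Hypothetical stand-in: small MLP regressing logerror
    # from the stacked level-1 predictions.
    model = Sequential([
        Dense(64, activation='relu', input_dim=input_dim),
        Dropout(0.2),
        Dense(1),
    ])
    model.compile(optimizer='adam', loss='mae')
    return model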
Example #4
    df = data.select_features(df)
    df = df.drop(['assessmentyear'], axis=1)

    print(df.columns)
    if cv_flag:
        df_full_train, targets, df_test = data.split_data(df, logerror)
        df_train, df_val, train_targets, val_targets = data.split_cv(df_full_train, targets, cv_split_ratio)

        cv_preds = np.repeat(0., len(df_val))
        for i in range(n_bags):
            x_train, x_val = tools.normalise_data(df_train.values, df_val.values)
            model = model_params.get_keras(x_train.shape[1])
            history = model.fit(x_train, train_targets,
                                nb_epoch=epochs, batch_size=batch_size,
                                validation_data=(x_val, val_targets), verbose=2)
            model.history = history
            cv_preds += model.predict(x_val).squeeze()
        cv_preds /= float(n_bags)

        mae = tools.get_mae_loss(val_targets, cv_preds)
        mse = mean_squared_error(val_targets, cv_preds)
        msg = 'mae: {}, mse: {}, keras! train_data ratio: {}, bags:{}, epochs:{}'.format(mae, mse, cv_split_ratio, n_bags, epochs)
        print(msg)
        logger.debug(msg)

    else:
        print('hola')
        ###training full:
        #data.generate_simple_kaggle_file(final_preds, 'bagged_{}'.format(n_bags))
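tools.normalise_data is another unshown helper; from its call signature it appears to scale both sets using statistics computed on the training rows only. A minimal sketch, assuming standard scaling:

import numpy as np

def normalise_data(x_train, x_val):
    # Hypothetical reimplementation: standardise with train-set statistics
    # so no information leaks from the validation rows.
    mean = x_train.mean(axis=0)
    std = x_train.std(axis=0) + 1e-8  # guard against zero-variance columns
    return (x_train - mean) / std, (x_val - mean) / std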

Example #5
            model = xgb.train(params,
                              dtrain,
                              num_boost_round=num_boost_rounds,
                              evals=watchlist,
                              early_stopping_rounds=50)
            cv_preds = model.predict(dtest) + cv_preds

            #prepare for the next iteration
            df_bag, bag_targets = delete_some_outliers(df_train, train_targets)
            dtrain = xgb.DMatrix(df_bag.values, bag_targets)
            print(i, df_bag.shape)
            #params['seed'] = i
            num_boost_rounds = 155
        cv_preds = cv_preds / n_bags

        mae = tools.get_mae_loss(test_targets, cv_preds)
        mse = mean_squared_error(test_targets, cv_preds)
        msg = 'mae: {}, mse: {}, train_data ratio: {}, bags:{}, r:{}'.format(
            mae, mse, cv_split_ratio, n_bags, num_boost_rounds)
        print(msg)
        logger.debug(msg)

    else:
        ###training full:
        df_train, targets, df_test = data.split_data(df, logerror)

        dtest = xgb.DMatrix(df_test.values)
        dtrain = xgb.DMatrix(df_train.values, targets)
        params = model_params.get_xtune11k()

        sub_preds = np.repeat(0., len(df_test))
        num_boost_rounds = 110
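delete_some_outliers is not shown; given the Zillow logerror target, a plausible reading is that it drops a random subset of extreme-target rows so each bag trains on slightly different data. A hypothetical sketch (the threshold and keep fraction are invented):

import numpy as np

def delete_some_outliers(df, targets, threshold=0.4, keep_frac=0.5):
    # Hypothetical stand-in: randomly drop about half of the rows whose
    # absolute target exceeds the threshold; keep all other rows.
    targets = np.asarray(targets)
    is_outlier = np.abs(targets) > threshold
    drop = is_outlier & (np.random.rand(len(targets)) > keep_frac)
    return df.iloc[~drop], targets[~drop]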
Example #6
    df_train, targets, df_test = data.split_data(df, logerror)

    new_train = tools.read_pickle(
        cache_dir + 'ps_train_2ndx{}_f{}.pkl'.format(n_models, n_folds))
    new_test = tools.read_pickle(
        cache_dir + 'ps_test_2ndx{}_f{}.pkl'.format(n_models, n_folds))

    new_train0 = tools.read_pickle(
        cache_dir + 'ps_train_2ndx{}_f{}.pkl'.format(5, n_folds))
    new_test0 = tools.read_pickle(cache_dir +
                                  'ps_test_2ndx{}_f{}.pkl'.format(5, n_folds))
    new_train['cat_weird'] = new_train['cat_preds'] + new_train['ker_preds']
    new_test['cat_weird'] = new_test['cat_preds'] + new_test['ker_preds']

    print('score cat', tools.get_mae_loss(targets, new_train['cat_preds']))
    print('score xgb', tools.get_mae_loss(targets, new_train['xgb_preds']))
    print('score lgb', tools.get_mae_loss(targets, new_train['lgb_preds']))
    print('score keras', tools.get_mae_loss(targets, new_train['ker_preds']))
    print('score cat2', tools.get_mae_loss(targets, new_train['cat2_preds']))
    print('score cat3', tools.get_mae_loss(targets, new_train['cat3_preds']))
    print('score cat4', tools.get_mae_loss(targets, new_train['cat4_preds']))
    print('score cat weird', tools.get_mae_loss(targets, new_train['cat_weird']))

    train0 = np.zeros(len(new_train))
    test0 = np.zeros(len(new_test))

    #weirdness didn't work too well with 0.4, try 0.55
    #weights = [0.34, 0.02, 0.06, 0.08, 0.02, 0.08, 0.42, .02]  (best cv legal)
    #weights = [.55, .02, .07, 0.09, 0.02] (weird submission)
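The commented-out weight vectors suggest the final step blends the prediction columns in the order scored above. A minimal sketch of that blending step, assuming the weights align with the column order:

import numpy as np

def blend_columns(frame, cols, weights):
    # Weighted sum of the level-1 prediction columns.
    preds = np.zeros(len(frame))
    for col, w in zip(cols, weights):
        preds += w * frame[col].values
    return preds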
Example #7
    if evaluate_cv:
        df, targets, train_ixs, test_ixs = data.get_cv_ixs(df, targets)
    else:
        train_ixs, test_ixs = data.get_lb_ixs(targets)

    df = features.add_features(df, train_ixs)
    df = data.select_features(df)
    print(df.columns)

    df_train, train_targets = df.iloc[train_ixs], targets[train_ixs]
    if evaluate_cv:
        df_test, test_targets = df.iloc[test_ixs], targets[test_ixs]
        eval_set = [(df_test.values, test_targets)]
    else:
        df_test = df.iloc[test_ixs]
        eval_set = [(df_train.values, train_targets)]

    params = model_params.get_ltune7k(num_rounds=nrounds)
    model = LGBMRegressor(**params)
    model.fit(df_train.values,
              train_targets,
              eval_set=eval_set,
              early_stopping_rounds=80)
    predictions = model.predict(df_test)
    if evaluate_cv:
        print(tools.get_mae_loss(test_targets, predictions))
    else:
        data.generate_simple_kaggle_file(predictions, 'sub_singlelgb_quasies')
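data.get_cv_ixs and data.get_lb_ixs are project helpers; from their use, the former appears to return a random train/validation split of row positions while the latter separates training rows from leaderboard rows. A minimal sketch of the CV variant (the split ratio and seeding are assumptions):

import numpy as np

def get_cv_ixs(df, targets, train_frac=0.8, seed=0):
    # Hypothetical stand-in: shuffle row positions and split once.
    rng = np.random.RandomState(seed)
    ixs = rng.permutation(len(df))
    n_train = int(train_frac * len(df))
    return df, targets, ixs[:n_train], ixs[n_train:]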
Example #8
    df_test = df.iloc[test_ixs]

    new_train = ss.io.read_pickle(
        cache_dir + 'ps2_train_2ndx{}_f{}.pkl'.format(n_models, n_folds))
    new_test = ss.io.read_pickle(
        cache_dir + 'ps2_test_2ndx{}_f{}.pkl'.format(n_models, n_folds))
    cols = new_train.columns

    #    new_train['cat_weird'] = new_train['cat_preds'] + new_train['ker_preds']
    #    new_test['cat_weird'] = new_test['cat_preds'] + new_test['ker_preds']
    #    new_train['zero'] = np.repeat(0, len(new_train))
    #    new_test['zero'] = np.repeat(0, len(new_test))
    #    cols = ['cat_weird', 'zero'] + list(cols)

    for col in cols:
        print('score {}'.format(col),
              tools.get_mae_loss(train_targets, new_train[col].values))

    new_train = new_train[cols]
    new_test = new_test[cols]

    init_weights = np.repeat(0.1, n_models)
    #    init_weights = [0.1, 0.1, 0.1, 0.1, 0.06, 0.08, 0.02, 0.08, 0.42]  # (best cv legal)

    all_train_preds = convert_preds_to_list(new_train)
    optim = optimise_weights(all_train_preds,
                             train_targets,
                             init_weights,
                             minimise=True)
    print "-", optim.fun
    optimised_weights = optim.x
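optimise_weights is not shown, but together with optim_func from Example #2 and the .fun and .x attributes read off the result, it almost certainly wraps scipy.optimize.minimize. A minimal sketch under that assumption (the solver choice is a guess):

from scipy.optimize import minimize

def optimise_weights(all_preds, targets, init_weights, minimise=True):
    # Hypothetical wrapper: search for blending weights that minimise
    # the scaled MAE computed by optim_func (Example #2).
    assert minimise, 'sketch only covers minimisation'
    return minimize(optim_func, init_weights,
                    args=(all_preds, targets), method='Nelder-Mead')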
Example #9
    params = model_params.get_ctune163b()
    print(df.columns)
    if cv_flag:
        df_full_train, targets, df_test = data.split_data(df, logerror)
        df_train, df_test, train_targets, test_targets = data.split_cv(
            df_full_train, targets, cv_split_ratio)

        cv_preds = np.repeat(0., len(df_test))
        for i in range(n_bags):
            model = CatBoostRegressor(**params)
            eval_set = (df_test.values, test_targets)  # CatBoost expects an (X, y) tuple
            model.fit(df_train.values, train_targets, eval_set=eval_set)

            # per-bag scores; recomputed on the bag average after the loop
            predictions = model.predict(df_test)
            mae = tools.get_mae_loss(test_targets, predictions)
            mse = mean_squared_error(test_targets, predictions)

            cv_preds = model.predict(df_test.values) + cv_preds

            #prepare for the next iteration
            #df_bag, bag_targets = delete_some_outliers(df_train, train_targets)
            #print(i, df_bag.shape)
            #params['seed'] = i
        cv_preds = cv_preds / n_bags

        mae = tools.get_mae_loss(test_targets, cv_preds)
        mse = mean_squared_error(test_targets, cv_preds)
        msg = 'mae: {}, mse: {}, train_data ratio: {}, bags:{}, r:{}'.format(
            mae, mse, cv_split_ratio, n_bags, params['iterations'])
        print(msg)
        logger.debug(msg)
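As written, the loop appears to fit n_bags essentially identical CatBoost models: the outlier-deletion and reseeding lines are commented out, so averaging the bags changes little. A sketch of a bagging loop that reseeds each model (random_seed is the actual CatBoost parameter name; the rest mirrors the snippet):

import numpy as np
from catboost import CatBoostRegressor

def bag_catboost(params, x_train, y_train, x_test, y_test, n_bags=5):
    # Vary the seed per bag so the averaged models actually differ.
    preds = np.zeros(len(x_test))
    for i in range(n_bags):
        params['random_seed'] = i
        model = CatBoostRegressor(**params)
        model.fit(x_train, y_train, eval_set=(x_test, y_test))
        preds += model.predict(x_test)
    return preds / n_bags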
Example #10
    exc = [
        train.columns[c]
        for c in range(len(train.columns)) if train.dtypes[c] == 'O'
    ] + ['logerror', 'parcelid']
    col = [c for c in train.columns if c not in exc]

    train = reg_features(train[col])
    test['transactiondate'] = '2016-01-01'  # should use the most common training date
    test = reg_features(test[col])

    reg = ElasticNetCV(normalize=True, l1_ratio=0.8, max_iter=5000)
    reg.fit(train, y)
    print('fit...')
    print(tools.get_mae_loss(targets, reg.predict(train)))

    ########################
    ########################
    ##  Combine and Save  ##
    ########################
    ########################

    ##### COMBINE PREDICTIONS

    print("\nCombining XGBoost, LightGBM, and baseline predicitons ...")
    lgb_weight = (1 - XGB_WEIGHT - BASELINE_WEIGHT) / float((1 - OLS_WEIGHT))
    xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
    baseline_weight0 = BASELINE_WEIGHT / (1 - OLS_WEIGHT)
    pred0 = xgb_weight0 * xgb_pred + baseline_weight0 * BASELINE_PRED + lgb_weight * p_test
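The division by (1 - OLS_WEIGHT) pre-compensates for a final combination step not shown in this fragment, which in the public Zillow kernels this code resembles is pred = OLS_WEIGHT * ols_pred + (1 - OLS_WEIGHT) * pred0; multiplying pred0 by (1 - OLS_WEIGHT) restores the nominal per-model weights. A quick check of that algebra (the weight values are assumptions):

XGB_WEIGHT, BASELINE_WEIGHT, OLS_WEIGHT = 0.63, 0.0056, 0.055  # assumed values
lgb_weight = (1 - XGB_WEIGHT - BASELINE_WEIGHT) / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 = BASELINE_WEIGHT / (1 - OLS_WEIGHT)
# multiplying each by (1 - OLS_WEIGHT) recovers the nominal weights:
print((1 - OLS_WEIGHT) * xgb_weight0)       # == XGB_WEIGHT
print((1 - OLS_WEIGHT) * baseline_weight0)  # == BASELINE_WEIGHT
print((1 - OLS_WEIGHT) * lgb_weight)        # == 1 - XGB_WEIGHT - BASELINE_WEIGHT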
Example #11
        assert len(x_train_small) == len(x_train)

        # keras
        keras_ix = 2
        batch_size, epochs = 256, 15
        model = model_params.get_keras(x_train.shape[1])
        history = model.fit(x_train,
                            y_train,
                            nb_epoch=epochs,
                            batch_size=batch_size,
                            validation_data=(x_val, y_val),
                            verbose=2)
        model.history = history
        preds_train[val_ix, keras_ix] = model.predict(x_val).squeeze()
        preds_test[:, keras_ix] += model.predict(df_test.values).squeeze()
        score = tools.get_mae_loss(y_val, preds_train[val_ix, keras_ix])
        print('train rows:{}, val rows:{}, fold:{}, score:{}'.format(
            len(x_train), len(x_val), i, score))

        # svr !
        svr_ix = 0
        model = SVR(cache_size=600, C=0.1)
        print(x_train_small.shape)
        model.fit(x_train_small[::10, :], y_train[::10])  # train SVR on every 10th row to keep it tractable
        preds_train[val_ix, svr_ix] = model.predict(x_val_small)
        preds_test[:, svr_ix] += model.predict(df_test_small.values)
        score = tools.get_mae_loss(y_val, preds_train[val_ix, svr_ix])
        print('train rows:{}, val rows:{}, fold:{}, score:{}'.format(
            len(x_train), len(x_val), i, score))

        # Catboost
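The indexing pattern preds_train[val_ix, model_ix] implies preds_train is an out-of-fold matrix with one column per level-1 model (svr_ix=0, keras_ix=2), while preds_test accumulates each fold's test predictions for later averaging. A minimal sketch of that layout (array sizes are placeholders):

import numpy as np

n_train, n_test, n_models, n_folds = 1000, 500, 3, 5  # placeholder sizes
preds_train = np.zeros((n_train, n_models))  # out-of-fold level-1 predictions
preds_test = np.zeros((n_test, n_models))    # per-fold test preds, summed
# each fold writes preds_train[val_ix, model_ix] once and adds its test
# predictions to preds_test; after all folds the columns are averaged:
preds_test /= n_folds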