Пример #1
0
    df_train, df_test = data.load_data(cache=True)
    df = data.create_fulldf(df_train, df_test)

    df = df.fillna(NULL_VALUE)
    df = data.clean_data(df)
    df = data.encode_labels(df)
    #df = features.add_features(df)

    logerror = df['logerror'].values
    targets = logerror
    df = data.select_features(df)
    df = df.drop(['assessmentyear'], axis=1)

    print df.columns
    if cv_flag:
        df_full_train, targets, df_test = data.split_data(df, logerror)
        df_train, df_val, train_targets, val_targets = data.split_cv(df_full_train, targets, cv_split_ratio)

        cv_preds = np.repeat(0., len(df_val))
        for i in range(n_bags):
            x_train, x_val = tools.normalise_data(df_train.values, df_val.values)
            model = model_params.get_keras(x_train.shape[1])
            history = model.fit(
                    x_train, train_targets,
                    nb_epoch=epochs, batch_size=batch_size,
                            validation_data=(x_val, val_targets), verbose=2)
            model.history = history
            cv_preds += model.predict(x_val).squeeze()
        cv_preds /= float(n_bags)

        mae = tools.get_mae_loss(val_targets, cv_preds)
Пример #2
0
    return predictions

def predict_multiple_months(df_test, predict_func):
    """Build a submission frame with one prediction column per period.

    Reads ``../input/sample_submission.csv`` and, for every column other
    than ``ParcelId``, stamps a copy of ``df_test`` with values sliced from
    that column's name, runs ``predict_func`` on the stamped frame and
    stores the result under the same column name.

    Args:
        df_test: Test feature frame. It is copied once up front, so the
            caller's frame is not modified.
        predict_func: Callable invoked as ``predict_func(model, values)``
            where ``values`` is ``df.values`` of the stamped copy. Its
            result must support ``.sum()`` and be assignable as a column.

    Returns:
        The sample-submission frame with every non-``ParcelId`` column
        overwritten by the corresponding predictions.

    Note:
        ``model`` is not a parameter; it is resolved as a module-level name
        at call time and therefore must be assigned before this is called.
    """
    sub = pd.read_csv('../input/sample_submission.csv')
    df = df_test.copy()
    for c in sub.columns[sub.columns != 'ParcelId']:
        # The slices are substrings of the column label, so both columns
        # hold strings (presumably 'MM' and 'YYYY' for 'YYYYMM' labels --
        # TODO confirm against the submission file).
        df['transaction_month'] = np.repeat(c[4:6], len(df))
        df['transaction_year'] = np.repeat(c[0:4], len(df))
        predictions = predict_func(model, df.values)
        # Single parenthesised argument: prints the same text under
        # Python 2 and is valid syntax under Python 3.
        print('predicting for ' + c + ' ' + str(predictions.sum()))
        sub[c] = predictions
    return sub

if __name__ == '__main__':
    # Script entry point: load the data, build the feature frame, train the
    # XGBoost model and write a multi-period Kaggle submission file.
    #
    # A bare `1/0` used to sit right after load_data(); it raised
    # ZeroDivisionError unconditionally, so none of the steps below ever
    # ran. It has been removed so the pipeline executes end to end.
    df_train, df_test = data.load_data()
    df = data.create_fulldf(df_train, df_test)
    df = df.fillna(NULL_VALUE)
    df = data.clean_data(df)
    df = data.encode_labels(df)
    df = features.add_features(df)
    # Left disabled, as in the original script.
    #df = data.add_month_and_year(df)

    # Capture the target column before select_features() reshapes the frame.
    targets = df['logerror'].values
    df = data.select_features(df)
    df_train, targets, df_test = data.split_data(df, targets)

    # `model` must stay a module-level name: predict_multiple_months() reads
    # it as a global instead of receiving it as an argument.
    model = train_xgb_cv(df_train, targets)

    sub = predict_multiple_months(df_test, predict_xgb)
    data.generate_kaggle_file(sub, 'sub/xgb_try_exper_quasi.csv')