Exemplo n.º 1
0
def train(output, models=None):
    """Train candidate regressors on secondary predictions and deploy the best.

    Loads the primary-model predictions for *output*, preprocesses and
    splits them, builds and evaluates the requested regressors, refits
    the winner on the full data set, and finally saves and uploads it
    under the name ``output + '_transfer'``.

    Parameters
    ----------
    output : str
        Target/task name; forwarded to the preprocessing and training
        helpers and used to name the persisted model.
    models : list of str, optional
        Model identifiers to build. Defaults to
        ``['linear', 'tree', 'forest', 'svr', 'cat']``.
    """
    # Use a None sentinel instead of a mutable default argument, which
    # would be shared across all calls of this function.
    if models is None:
        models = ['linear', 'tree', 'forest', 'svr', 'cat']

    data = get_predictions(output)
    print('Primary predictions loaded.')

    [X, y, X_train, y_train, X_test, y_test, X_scaled, y_scaled,
     X_train_scaled, y_train_scaled, X_test_scaled, y_scaler] \
        = pre.split_pipeline(data, output)
    print('Data preprocessed.')

    # NOTE(review): X_train/y_train are passed again in the slots where
    # calculate() passes the categorical split (X_train_cat/y_train_cat) —
    # presumably intentional for this transfer setup; confirm with tra.build.
    regressors = tra.build(X_train, y_train, X_train_scaled, y_train_scaled,
                           X_train, y_train, models)
    best_regressor = tra.evaluate(regressors, X_train, y_train, X_train_scaled,
                                  y_train_scaled, X_test, y_test,
                                  X_test_scaled, y_scaler, X_train, y_train,
                                  X_test, y_test)
    print('Regressors evaluated. Best regressor is:\n' + str(best_regressor))

    # SVR models are trained on scaled features/targets, so the winner is
    # refit on the scaled full set; every other model uses the raw data.
    if 'SVR' in str(best_regressor):
        best_regressor.fit(X_scaled, y_scaled)
    else:
        best_regressor.fit(X, y)
    print('Regressor fit.')

    tra.print_results(best_regressor, X, X_scaled, y, y_scaler, X)

    tra.save(best_regressor, X, output + '_transfer')
    print('Regressor saved.')

    tra.upload(output + '_transfer')
    print('Regressor uploaded.')
Exemplo n.º 2
0
def calculate(output, model):
    """Determine drop-column feature importance for the specified model.

    Fits a benchmark model on all features, then refits once per feature
    with that column removed; a feature's importance is the benchmark
    score minus the drop-column score (mean relative accuracy). A
    'random' noise column is injected as a baseline — features scoring
    at or below it carry no real signal. Results are printed, sorted by
    importance, descending.

    Parameters
    ----------
    output : str
        Target column name; forwarded to the trim/preprocessing pipelines.
    model : str
        Identifier of the single model type to evaluate.
    """
    data = pd.read_csv('campaigns.csv')
    print('Data loaded.')

    data = tra.trim(data, output)
    print('Data trimmed.')

    # Add a reproducible random column as an importance baseline.
    np.random.seed(seed=0)
    data['random'] = np.random.random(size=len(data))

    data, data_cat = pre.data_pipeline(data, output)
    [_, _, X_train, y_train, _, _, _, _,
     X_train_scaled, y_train_scaled, _, y_scaler] \
        = pre.split_pipeline(data, output, encoded=True)
    [_, _, X_train_cat, y_train_cat, _, _] = \
        pre.split_pipeline(data_cat, output, encoded=False)
    print('Data preprocessed.')

    regressor = tra.build(X_train, y_train, X_train_scaled, y_train_scaled,
                          X_train_cat, y_train_cat, [model])[0]

    # SVR variants train/predict in scaled space; hoist the check out of
    # the per-column loop.
    is_svr = 'SVR' in str(regressor)

    model_clone = clone(regressor)

    # Set random_state for comparability
    model_clone.random_state = 0

    # Train and score the benchmark model. Scores are always computed in
    # the original (unscaled) target units so both branches are comparable.
    if is_svr:
        model_clone.fit(X_train_scaled, y_train_scaled)
        benchmark_score = hel.mean_relative_accuracy(
            y_scaler.inverse_transform(model_clone.predict(X_train_scaled)),
            y_train)
    else:
        model_clone.fit(X_train, y_train)
        benchmark_score = \
            hel.mean_relative_accuracy(model_clone.predict(X_train), y_train)

    # Calculate and store each feature's deviation from the benchmark.
    importances = []
    for column in X_train.columns:
        model_clone = clone(regressor)
        model_clone.random_state = 0
        if is_svr:
            model_clone.fit(X_train_scaled.drop(column, axis=1),
                            y_train_scaled)
            # BUGFIX: inverse-transform the predictions and score against
            # the unscaled target. The original scored this branch in
            # scaled units while the benchmark was scored in original
            # units, so the subtraction below mixed scales for SVR.
            drop_col_score = hel.mean_relative_accuracy(
                y_scaler.inverse_transform(
                    model_clone.predict(X_train_scaled.drop(column, axis=1))),
                y_train)
        else:
            model_clone.fit(X_train.drop(column, axis=1), y_train)
            drop_col_score = hel.mean_relative_accuracy(
                model_clone.predict(X_train.drop(column, axis=1)), y_train)
        importances.append(benchmark_score - drop_col_score)

    importances_df = \
        pd.DataFrame({'column': X_train.columns, 'value': importances}) \
          .sort_values('value', ascending=False).reset_index(drop=True)

    print('Importances:')
    # itertuples avoids repeated positional .iloc lookups per row.
    for row in importances_df.itertuples(index=False):
        print(str(row.column) + ': ' + str(row.value))