Example #1
0
def get_predictions(output):
    """Predict the given output metric for every campaign row.

    Produces two predictions per row: one from a model trained directly
    on the metric ('direct'), and one derived from a cost-per-unit model
    ('cpx', computed as cost / predicted cost-per-unit).

    Parameters
    ----------
    output : str
        Name of the target metric column (plural, e.g. 'clicks'); the
        cost-per-X model filename is derived by stripping the trailing
        character ('clicks' -> 'cost_per_click').

    Returns
    -------
    pandas.DataFrame
        Columns [output, 'direct', 'cpx']: the actual value plus both
        integer predictions, indexed like the preprocessed data.
    """
    # Load campaign data
    data = pd.read_csv('campaigns.csv')

    # Trim data to only include desired output metric
    data = tra.trim(data, output)

    # Preprocess data without train/test or y/X splitting
    data, _ = pre.data_pipeline(data, output)

    # Load primary models (f-strings instead of error-prone concatenation)
    direct_model = joblib.load(f'./models/{output}_model.pkl')
    direct_columns = joblib.load(f'./models/{output}_columns.pkl')
    cpx_model = joblib.load(f'./models/cost_per_{output[:-1]}_model.pkl')
    cpx_columns = joblib.load(f'./models/cost_per_{output[:-1]}_columns.pkl')

    # Calculate primary predictions. Collect rows in a dict and build the
    # DataFrame once at the end: growing an empty frame via .loc inside
    # the loop reallocates on every assignment (quadratic).
    rows = {}
    for index, row in data.iterrows():
        # Build the single-row feature frame once, reindex per model
        features = pd.DataFrame([dict(row)])
        direct_row = features.reindex(columns=direct_columns, fill_value=0)
        cpx_row = features.reindex(columns=cpx_columns, fill_value=0)
        direct_prediction = int(direct_model.predict(direct_row)[0])
        # Invert the cost-per-unit estimate into a predicted unit count
        cpx_prediction = int(row['cost'] / cpx_model.predict(cpx_row)[0])
        rows[index] = [row[output], direct_prediction, cpx_prediction]

    predictions = pd.DataFrame.from_dict(
        rows, orient='index', columns=[output, 'direct', 'cpx'])
    return predictions
Example #2
0
def train(output,
          update=False,
          upload=False,
          models=None):
    """Complete training pipeline.

    Loads campaign data, preprocesses and splits it, builds the candidate
    regressors, evaluates them, refits the best one on the full data set,
    then saves (and optionally uploads) it.

    Parameters
    ----------
    output : str
        Target metric column to train for.
    update : bool
        If True, refresh campaigns.csv from Postgres first.
    upload : bool
        If True, upload the saved regressor to S3 afterwards.
    models : list[str] | None
        Model families to build; defaults to ['linear', 'forest', 'svr'].
        A None sentinel avoids the shared mutable-default-argument pitfall.
    """
    # Default assigned per call, not shared across calls
    if models is None:
        models = ['linear', 'forest', 'svr']

    if update:
        postgres_to_csv()
        print('Data updated.')

    data = pd.read_csv('campaigns.csv')
    print('Data loaded.')

    data = trim(data, output)
    print('Data trimmed.')

    data, data_cat = pre.data_pipeline(data, output)
    [X, y, X_train, y_train, X_test, y_test, X_scaled, y_scaled,
     X_train_scaled, y_train_scaled, X_test_scaled, y_scaler] \
        = pre.split_pipeline(data, output, encoded=True)
    [X_cat, y_cat, X_train_cat, y_train_cat, X_test_cat, y_test_cat] = \
        pre.split_pipeline(data_cat, output, encoded=False)
    print('Data preprocessed.')

    regressors = build(X_train, y_train, X_train_scaled, y_train_scaled,
                       X_train_cat, y_train_cat, models)

    best_regressor = evaluate(regressors, X_train, y_train, X_train_scaled,
                              y_train_scaled, X_test, y_test, X_test_scaled,
                              y_scaler, X_train_cat, y_train_cat, X_test_cat,
                              y_test_cat)
    print('Regressors evaluated. Best regressor is:\n' + str(best_regressor))

    # Refit the winner on the FULL data set, using whichever representation
    # (scaled / categorical / raw) the model family expects
    if 'SVR' in str(best_regressor):
        best_regressor.fit(X_scaled, y_scaled)
    elif 'Cat' in str(best_regressor):
        best_regressor.fit(X_cat, y_cat)
    else:
        best_regressor.fit(X, y)
    print('Regressor fit.')

    print_results(best_regressor, X, X_scaled, y, y_scaler, X_cat)

    save(best_regressor, X, output)
    print('Regressor saved.')

    if upload:
        upload_to_s3(output)
        print('Regressor uploaded.')
Example #3
0
def train(output, source='pg', models=None):
    """Complete training pipeline.

    Loads campaign data from the chosen source, preprocesses and splits
    it, builds candidate regressors, evaluates them, refits the best one
    on the full data set, then saves and uploads it.

    Parameters
    ----------
    output : str
        Target metric column to train for.
    source : str
        'pg' to load from Postgres, 'csv' to load from a CSV file;
        anything else aborts with a message.
    models : list[str] | None
        Model families to build; defaults to
        ['linear', 'tree', 'forest', 'svr']. A None sentinel avoids the
        shared mutable-default-argument pitfall.
    """
    # Default assigned per call, not shared across calls
    if models is None:
        models = ['linear', 'tree', 'forest', 'svr']

    if source == 'pg':
        data = load_data_from_postgres(output)
        print('Data loaded from Postgres.')
    elif source == 'csv':
        data = load_data_from_csv(output)
        print('Data loaded from CSV.')
    else:
        print('Source not available.')
        return

    data = pre.data_pipeline(data, output)
    [X, y, X_train, y_train, X_test, y_test, X_scaled, y_scaled,
     X_train_scaled, y_train_scaled, X_test_scaled, y_scaler] \
        = pre.split_pipeline(data, output)
    print('Data preprocessed.')

    regressors = build(X_train, y_train, X_train_scaled, y_train_scaled,
                       models)

    best_regressor = evaluate(regressors, X_train, y_train, X_train_scaled,
                              y_train_scaled, X_test, y_test, X_test_scaled,
                              y_scaler)
    print('Regressors evaluated. Best regressor is:\n' + str(best_regressor))

    # Refit the winner on the FULL data set; SVR expects scaled inputs
    if 'SVR' in str(best_regressor):
        best_regressor.fit(X_scaled, y_scaled)
    else:
        best_regressor.fit(X, y)
    print('Regressor fit.')

    print_results(best_regressor, X, X_scaled, y, y_scaler)

    save(best_regressor, X, output)
    print('Regressor saved.')

    upload(output)
    print('Regressor uploaded.')
Example #4
0
def calculate(output, model):
    """Determine feature importance for the specified model family.

    Uses the drop-column method: train a benchmark model on all features,
    then retrain once per feature with that column removed; a feature's
    importance is the accuracy lost by dropping it.

    Parameters
    ----------
    output : str
        Target metric column.
    model : str
        Model family name understood by tra.build (e.g. 'forest', 'svr').
    """
    data = pd.read_csv('campaigns.csv')
    print('Data loaded.')

    data = tra.trim(data, output)
    print('Data trimmed.')

    # Add a random noise column as a baseline: genuinely useful features
    # should rank above it. Fixed seed keeps runs comparable.
    np.random.seed(seed=0)
    data['random'] = np.random.random(size=len(data))

    data, data_cat = pre.data_pipeline(data, output)
    [_, _, X_train, y_train, _, _, _, _,
     X_train_scaled, y_train_scaled, _, y_scaler] \
        = pre.split_pipeline(data, output, encoded=True)
    [_, _, X_train_cat, y_train_cat, _, _] = \
        pre.split_pipeline(data_cat, output, encoded=False)
    print('Data preprocessed.')

    regressor = tra.build(X_train, y_train, X_train_scaled, y_train_scaled,
                          X_train_cat, y_train_cat, [model])[0]

    model_clone = clone(regressor)

    # Set random_state for comparability across refits
    model_clone.random_state = 0

    # Train and score the benchmark model on the full feature set
    if 'SVR' in str(regressor):
        model_clone.fit(X_train_scaled, y_train_scaled)
        benchmark_score = hel.mean_relative_accuracy(
            y_scaler.inverse_transform(model_clone.predict(X_train_scaled)),
            y_train)
    else:
        model_clone.fit(X_train, y_train)
        benchmark_score = \
            hel.mean_relative_accuracy(model_clone.predict(X_train), y_train)

    # Drop-column importance: benchmark score minus score without the
    # feature. (Unused loop counter from the original removed.)
    # NOTE(review): the SVR branch below scores against y_train_scaled,
    # while the benchmark compared inverse-transformed predictions with
    # y_train — the two scores may not be on the same scale; confirm
    # this asymmetry is intended.
    importances = []
    for column in X_train.columns:
        model_clone = clone(regressor)
        model_clone.random_state = 0
        if 'SVR' in str(regressor):
            model_clone.fit(X_train_scaled.drop(column, axis=1),
                            y_train_scaled)
            drop_col_score = hel.mean_relative_accuracy(
                model_clone.predict(X_train_scaled.drop(column, axis=1)),
                y_train_scaled)
        else:
            model_clone.fit(X_train.drop(column, axis=1), y_train)
            drop_col_score = hel.mean_relative_accuracy(
                model_clone.predict(X_train.drop(column, axis=1)), y_train)
        importances.append(benchmark_score - drop_col_score)

    importances_df = \
        pd.DataFrame({'column': X_train.columns, 'value': importances}) \
          .sort_values('value', ascending=False).reset_index(drop=True)

    print('Importances:')
    # itertuples avoids repeated positional .iloc lookups
    for entry in importances_df.itertuples(index=False):
        print(str(entry.column) + ': ' + str(entry.value))