def train(output, update=False, upload=False, models=None):
    """Complete training pipeline.

    Loads campaign data from CSV (optionally refreshing it from Postgres
    first), preprocesses it into encoded and categorical variants, builds
    and evaluates the requested regressors, refits the best one on the
    full data set, then saves it and optionally uploads it to S3.

    Args:
        output: Name of the target variable to predict.
        update: If True, refresh 'campaigns.csv' from Postgres first.
        upload: If True, upload the saved regressor to S3 at the end.
        models: Iterable of model identifiers to build; defaults to
            ['linear', 'forest', 'svr'].
    """
    # Avoid the shared-mutable-default-argument pitfall.
    if models is None:
        models = ['linear', 'forest', 'svr']
    if update:
        postgres_to_csv()
        print('Data updated.')
    data = pd.read_csv('campaigns.csv')
    print('Data loaded.')
    data = trim(data, output)
    print('Data trimmed.')
    data, data_cat = pre.data_pipeline(data, output)
    [X, y, X_train, y_train, X_test, y_test, X_scaled, y_scaled,
     X_train_scaled, y_train_scaled, X_test_scaled, y_scaler] = \
        pre.split_pipeline(data, output, encoded=True)
    [X_cat, y_cat, X_train_cat, y_train_cat, X_test_cat, y_test_cat] = \
        pre.split_pipeline(data_cat, output, encoded=False)
    print('Data preprocessed.')
    regressors = build(X_train, y_train, X_train_scaled, y_train_scaled,
                       X_train_cat, y_train_cat, models)
    best_regressor = evaluate(regressors, X_train, y_train, X_train_scaled,
                              y_train_scaled, X_test, y_test, X_test_scaled,
                              y_scaler, X_train_cat, y_train_cat,
                              X_test_cat, y_test_cat)
    print('Regressors evaluated. Best regressor is:\n' + str(best_regressor))
    # Refit the winner on the FULL data set, in whichever representation
    # that model family requires (scaled for SVR, categorical for CatBoost).
    if 'SVR' in str(best_regressor):
        best_regressor.fit(X_scaled, y_scaled)
    elif 'Cat' in str(best_regressor):
        best_regressor.fit(X_cat, y_cat)
    else:
        best_regressor.fit(X, y)
    print('Regressor fit.')
    print_results(best_regressor, X, X_scaled, y, y_scaler, X_cat)
    save(best_regressor, X, output)
    print('Regressor saved.')
    if upload:
        upload_to_s3(output)
        print('Regressor uploaded.')
def train(output, models=None):
    """Transfer-learning training pipeline.

    Trains a secondary regressor on the primary model's predictions:
    loads the primary predictions, preprocesses them, builds and
    evaluates the requested regressors, refits the best one on the full
    data set, saves it under '<output>_transfer' and uploads it.

    Args:
        output: Name of the target variable to predict.
        models: Iterable of model identifiers to build; defaults to
            ['linear', 'tree', 'forest', 'svr', 'cat'].
    """
    # Avoid the shared-mutable-default-argument pitfall.
    if models is None:
        models = ['linear', 'tree', 'forest', 'svr', 'cat']
    data = get_predictions(output)
    print('Primary predictions loaded.')
    [X, y, X_train, y_train, X_test, y_test, X_scaled, y_scaled,
     X_train_scaled, y_train_scaled, X_test_scaled, y_scaler] = \
        pre.split_pipeline(data, output)
    print('Data preprocessed.')
    # No separate categorical variant here: the plain (X_train, y_train)
    # split is reused for the categorical-model slots.
    regressors = tra.build(X_train, y_train, X_train_scaled, y_train_scaled,
                           X_train, y_train, models)
    best_regressor = tra.evaluate(regressors, X_train, y_train,
                                  X_train_scaled, y_train_scaled, X_test,
                                  y_test, X_test_scaled, y_scaler, X_train,
                                  y_train, X_test, y_test)
    print('Regressors evaluated. Best regressor is:\n' + str(best_regressor))
    # Refit the winner on the FULL data set (scaled inputs for SVR).
    if 'SVR' in str(best_regressor):
        best_regressor.fit(X_scaled, y_scaled)
    else:
        best_regressor.fit(X, y)
    print('Regressor fit.')
    tra.print_results(best_regressor, X, X_scaled, y, y_scaler, X)
    tra.save(best_regressor, X, output + '_transfer')
    print('Regressor saved.')
    tra.upload(output + '_transfer')
    print('Regressor uploaded.')
def train(output, source='pg', models=None):
    """Complete training pipeline.

    Loads data from the requested source, preprocesses it, builds and
    evaluates the requested regressors, refits the best one on the full
    data set, then saves and uploads it.

    Args:
        output: Name of the target variable to predict.
        source: Data source, either 'pg' (Postgres) or 'csv'. Any other
            value aborts with a message.
        models: Iterable of model identifiers to build; defaults to
            ['linear', 'tree', 'forest', 'svr'].
    """
    # Avoid the shared-mutable-default-argument pitfall.
    if models is None:
        models = ['linear', 'tree', 'forest', 'svr']
    if source == 'pg':
        data = load_data_from_postgres(output)
        print('Data loaded from Postgres.')
    elif source == 'csv':
        data = load_data_from_csv(output)
        print('Data loaded from CSV.')
    else:
        print('Source not available.')
        return
    data = pre.data_pipeline(data, output)
    [X, y, X_train, y_train, X_test, y_test, X_scaled, y_scaled,
     X_train_scaled, y_train_scaled, X_test_scaled, y_scaler] = \
        pre.split_pipeline(data, output)
    print('Data preprocessed.')
    regressors = build(X_train, y_train, X_train_scaled, y_train_scaled,
                       models)
    best_regressor = evaluate(regressors, X_train, y_train, X_train_scaled,
                              y_train_scaled, X_test, y_test, X_test_scaled,
                              y_scaler)
    print('Regressors evaluated. Best regressor is:\n' + str(best_regressor))
    # Refit the winner on the FULL data set (scaled inputs for SVR).
    if 'SVR' in str(best_regressor):
        best_regressor.fit(X_scaled, y_scaled)
    else:
        best_regressor.fit(X, y)
    print('Regressor fit.')
    print_results(best_regressor, X, X_scaled, y, y_scaler)
    save(best_regressor, X, output)
    print('Regressor saved.')
    upload(output)
    print('Regressor uploaded.')
def calculate(output, model):
    """Determine feature importance for the specified model.

    Uses drop-column importance: the model is trained and scored once on
    all features (benchmark), then once per feature with that feature
    withheld; importance = benchmark score minus drop-column score. A
    seeded random column is added as a noise baseline — features scoring
    below it are likely uninformative.

    Args:
        output: Name of the target variable to predict.
        model: Single model identifier understood by `tra.build`.
    """
    data = pd.read_csv('campaigns.csv')
    print('Data loaded.')
    data = tra.trim(data, output)
    print('Data trimmed.')
    # Add a seeded random column as a noise baseline for importances.
    np.random.seed(seed=0)
    data['random'] = np.random.random(size=len(data))
    data, data_cat = pre.data_pipeline(data, output)
    [_, _, X_train, y_train, _, _, _, _, X_train_scaled, y_train_scaled,
     _, y_scaler] = pre.split_pipeline(data, output, encoded=True)
    [_, _, X_train_cat, y_train_cat, _, _] = \
        pre.split_pipeline(data_cat, output, encoded=False)
    print('Data preprocessed.')
    regressor = tra.build(X_train, y_train, X_train_scaled, y_train_scaled,
                          X_train_cat, y_train_cat, [model])[0]
    is_svr = 'SVR' in str(regressor)

    def _score(drop_column=None):
        """Train a fresh clone (optionally without one column) and return
        its training accuracy in the ORIGINAL target space."""
        model_clone = clone(regressor)
        # Fixed random_state so successive fits are comparable.
        model_clone.random_state = 0
        if is_svr:
            X_fit = X_train_scaled if drop_column is None \
                else X_train_scaled.drop(drop_column, axis=1)
            model_clone.fit(X_fit, y_train_scaled)
            # BUG FIX: the original scored drop-column fits against
            # y_train_scaled while the benchmark was scored against
            # y_train (unscaled), so the two were not comparable.
            # Score everything in the original target space instead.
            return hel.mean_relative_accuracy(
                y_scaler.inverse_transform(model_clone.predict(X_fit)),
                y_train)
        X_fit = X_train if drop_column is None \
            else X_train.drop(drop_column, axis=1)
        model_clone.fit(X_fit, y_train)
        return hel.mean_relative_accuracy(model_clone.predict(X_fit),
                                          y_train)

    benchmark_score = _score()
    # Importance of a column = accuracy lost when it is withheld.
    importances = [benchmark_score - _score(column)
                   for column in X_train.columns]
    importances_df = \
        pd.DataFrame({'column': X_train.columns, 'value': importances}) \
        .sort_values('value', ascending=False).reset_index(drop=True)
    print('Importances:')
    for row in importances_df.itertuples(index=False):
        print(str(row.column) + ': ' + str(row.value))