def get_predictions(output):
    """Predict `output` for every campaign with both saved models.

    Loads the campaign data, trims and preprocesses it, then scores every
    row with (a) the direct model for `output` and (b) the cost-per-X model
    (whose artifacts are named with the singular form of `output`, hence the
    `output[:-1]` slice — assumes `output` is a plural like 'clicks';
    TODO confirm).

    Parameters
    ----------
    output : str
        Name of the target metric column; also keys the model filenames.

    Returns
    -------
    pandas.DataFrame
        One row per campaign with columns [output, 'direct', 'cpx']:
        the actual value, the direct prediction, and the prediction
        derived from cost / predicted-cost-per-unit.
    """
    # Load campaign data
    data = pd.read_csv('campaigns.csv')
    # Trim data to only include desired output metric
    data = tra.trim(data, output)
    # Preprocess data without train/test or y/X splitting
    data, _ = pre.data_pipeline(data, output)
    # Load primary models (f-strings instead of chained '+' concatenation)
    direct_model = joblib.load(f'./models/{output}_model.pkl')
    direct_columns = joblib.load(f'./models/{output}_columns.pkl')
    cpx_name = f'cost_per_{output[:-1]}'
    cpx_model = joblib.load(f'./models/{cpx_name}_model.pkl')
    cpx_columns = joblib.load(f'./models/{cpx_name}_columns.pkl')
    # Accumulate prediction rows in plain lists and build the DataFrame
    # once at the end: repeated `df.loc[index] = ...` on a growing frame
    # is quadratic in the number of rows.
    records = []
    indices = []
    for index, row in data.iterrows():
        # Align the single-row frame to each model's training columns,
        # filling any columns unseen in this row with 0.
        direct_row = pd.DataFrame([dict(row)]) \
            .reindex(columns=direct_columns, fill_value=0)
        cpx_row = pd.DataFrame([dict(row)]) \
            .reindex(columns=cpx_columns, fill_value=0)
        direct_prediction = int(direct_model.predict(direct_row)[0])
        # The cpx model predicts cost per unit; divide total cost by it
        # to recover the predicted unit count.
        cpx_prediction = int(row['cost'] / cpx_model.predict(cpx_row)[0])
        records.append([row[output], direct_prediction, cpx_prediction])
        indices.append(index)
    predictions = pd.DataFrame(
        records, columns=[output, 'direct', 'cpx'], index=indices)
    return predictions
def train(output, update=False, upload=False, models=None):
    """Complete training pipeline.

    Loads (optionally refreshing) the campaign data, preprocesses it into
    encoded and categorical splits, builds and evaluates the candidate
    regressors, refits the best one on the full data, prints its results,
    and saves (optionally uploading) the final model.

    Parameters
    ----------
    output : str
        Target metric column to train on.
    update : bool, optional
        If True, refresh campaigns.csv from Postgres first.
    upload : bool, optional
        If True, upload the saved regressor to S3 afterwards.
    models : list of str, optional
        Model families to try; defaults to ['linear', 'forest', 'svr'].
        (A `None` sentinel is used instead of a mutable default list,
        which would be shared across calls.)
    """
    if models is None:
        models = ['linear', 'forest', 'svr']
    if update:
        postgres_to_csv()
        print('Data updated.')
    data = pd.read_csv('campaigns.csv')
    print('Data loaded.')
    data = trim(data, output)
    print('Data trimmed.')
    data, data_cat = pre.data_pipeline(data, output)
    [X, y, X_train, y_train, X_test, y_test, X_scaled, y_scaled,
     X_train_scaled, y_train_scaled, X_test_scaled, y_scaler] \
        = pre.split_pipeline(data, output, encoded=True)
    [X_cat, y_cat, X_train_cat, y_train_cat, X_test_cat, y_test_cat] = \
        pre.split_pipeline(data_cat, output, encoded=False)
    print('Data preprocessed.')
    regressors = build(X_train, y_train, X_train_scaled, y_train_scaled,
                       X_train_cat, y_train_cat, models)
    best_regressor = evaluate(regressors, X_train, y_train,
                              X_train_scaled, y_train_scaled,
                              X_test, y_test, X_test_scaled, y_scaler,
                              X_train_cat, y_train_cat,
                              X_test_cat, y_test_cat)
    print('Regressors evaluated. Best regressor is:\n' + str(best_regressor))
    # Refit the winner on the FULL dataset, using whichever representation
    # (scaled / categorical / plain) that model family was trained on.
    if 'SVR' in str(best_regressor):
        best_regressor.fit(X_scaled, y_scaled)
    elif 'Cat' in str(best_regressor):
        best_regressor.fit(X_cat, y_cat)
    else:
        best_regressor.fit(X, y)
    print('Regressor fit.')
    print_results(best_regressor, X, X_scaled, y, y_scaler, X_cat)
    save(best_regressor, X, output)
    print('Regressor saved.')
    if upload:
        upload_to_s3(output)
        print('Regressor uploaded.')
def train(output, source='pg', models=None):
    """Complete training pipeline.

    Loads the campaign data from the chosen source, preprocesses it,
    builds and evaluates the candidate regressors, refits the best one on
    the full data, prints its results, then saves and uploads it.

    Parameters
    ----------
    output : str
        Target metric column to train on.
    source : str, optional
        'pg' to load from Postgres, 'csv' to load from CSV; anything else
        prints a message and returns without training.
    models : list of str, optional
        Model families to try; defaults to
        ['linear', 'tree', 'forest', 'svr']. (A `None` sentinel is used
        instead of a mutable default list, which would be shared across
        calls.)
    """
    if models is None:
        models = ['linear', 'tree', 'forest', 'svr']
    if source == 'pg':
        data = load_data_from_postgres(output)
        print('Data loaded from Postgres.')
    elif source == 'csv':
        data = load_data_from_csv(output)
        print('Data loaded from CSV.')
    else:
        print('Source not available.')
        return
    data = pre.data_pipeline(data, output)
    [X, y, X_train, y_train, X_test, y_test, X_scaled, y_scaled,
     X_train_scaled, y_train_scaled, X_test_scaled, y_scaler] \
        = pre.split_pipeline(data, output)
    print('Data preprocessed.')
    regressors = build(X_train, y_train, X_train_scaled, y_train_scaled,
                       models)
    best_regressor = evaluate(regressors, X_train, y_train,
                              X_train_scaled, y_train_scaled,
                              X_test, y_test, X_test_scaled, y_scaler)
    print('Regressors evaluated. Best regressor is:\n' + str(best_regressor))
    # Refit the winner on the FULL dataset; SVR was trained on scaled data.
    if 'SVR' in str(best_regressor):
        best_regressor.fit(X_scaled, y_scaled)
    else:
        best_regressor.fit(X, y)
    print('Regressor fit.')
    print_results(best_regressor, X, X_scaled, y, y_scaler)
    save(best_regressor, X, output)
    print('Regressor saved.')
    upload(output)
    print('Regressor uploaded.')
def calculate(output, model):
    """Determine feature importance for the specified model.

    Uses the drop-column method: trains a benchmark clone on all features,
    then retrains one clone per feature with that column removed; a
    feature's importance is the benchmark accuracy minus the drop-column
    accuracy. A seeded random column is injected as a noise baseline —
    real features should score above it.

    Parameters
    ----------
    output : str
        Target metric column.
    model : str
        Model family to build (passed through to `tra.build`).

    Prints the importances sorted descending; returns None.
    """
    data = pd.read_csv('campaigns.csv')
    print('Data loaded.')
    data = tra.trim(data, output)
    print('Data trimmed.')
    # Add a seeded random column as a noise baseline for importance.
    np.random.seed(seed=0)
    data['random'] = np.random.random(size=len(data))
    data, data_cat = pre.data_pipeline(data, output)
    [_, _, X_train, y_train, _, _, _, _,
     X_train_scaled, y_train_scaled, _, y_scaler] \
        = pre.split_pipeline(data, output, encoded=True)
    [_, _, X_train_cat, y_train_cat, _, _] = \
        pre.split_pipeline(data_cat, output, encoded=False)
    print('Data preprocessed.')
    regressor = tra.build(X_train, y_train, X_train_scaled, y_train_scaled,
                          X_train_cat, y_train_cat, [model])[0]
    model_clone = clone(regressor)
    # Set random_state for comparability across clones.
    model_clone.random_state = 0
    # Train and score the benchmark model. For SVR the predictions are
    # inverse-transformed so the score is in the original target scale.
    if 'SVR' in str(regressor):
        model_clone.fit(X_train_scaled, y_train_scaled)
        benchmark_score = hel.mean_relative_accuracy(
            y_scaler.inverse_transform(model_clone.predict(X_train_scaled)),
            y_train)
    else:
        model_clone.fit(X_train, y_train)
        benchmark_score = \
            hel.mean_relative_accuracy(model_clone.predict(X_train), y_train)
    # Calculate each feature's deviation from the benchmark when dropped.
    importances = []
    for column in X_train.columns:
        model_clone = clone(regressor)
        model_clone.random_state = 0
        if 'SVR' in str(regressor):
            X_dropped = X_train_scaled.drop(column, axis=1)
            model_clone.fit(X_dropped, y_train_scaled)
            # BUGFIX: score in the same (unscaled) space as the benchmark.
            # Previously this compared scaled predictions against
            # y_train_scaled, so benchmark_score and drop_col_score were
            # on different scales and their difference was meaningless.
            drop_col_score = hel.mean_relative_accuracy(
                y_scaler.inverse_transform(model_clone.predict(X_dropped)),
                y_train)
        else:
            X_dropped = X_train.drop(column, axis=1)
            model_clone.fit(X_dropped, y_train)
            drop_col_score = hel.mean_relative_accuracy(
                model_clone.predict(X_dropped), y_train)
        importances.append(benchmark_score - drop_col_score)
    importances_df = \
        pd.DataFrame({'column': X_train.columns, 'value': importances}) \
        .sort_values('value', ascending=False).reset_index(drop=True)
    print('Importances:')
    # Iterate rows directly instead of indexing by position.
    for row in importances_df.itertuples(index=False):
        print(str(row.column) + ': ' + str(row.value))