def main():
    """Run the stacked-ensemble data pipeline.

    Extracts the raw job data, engineers/encodes features, splits it with the
    SAME shuffle the individual base models were trained with, generates
    stacked (out-of-fold prediction) features from the persisted base models,
    and saves the stacked train/test sets to pickle files for the meta-model.
    """
    # Data Extraction
    df = data_extract_e('e_20190609_15.pkl')

    # Data Transformation and Engineering
    df = feature_eng(df)
    df = extract_queues(df)
    dept_encoder, queue_encoder = load_labels(
        'dept_encoder.pkl', 'queue_encoder.pkl', df=df)
    df = feature_transform(df, dept_encoder=dept_encoder,
                           queue_encoder=queue_encoder)

    # Training/Test Split
    x, y = data_filter(df)
    # random_state MUST be 2468 so this split matches the shuffle used when
    # the individual base models were trained (the original 1357 contradicted
    # the comment and would leak base-model training rows into the test set).
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=2468)  # 2468 to use same shuffle as individual models

    # Load models from persistent files
    models = load_models()
    print(models)

    # Stacking
    # Produces a new set of features based on the out-of-fold predictions
    # of the base models.
    x_train_s, x_test_s = stacking(models, x_train, y_train, x_test,
                                   n_folds=10, shuffle=True, verbose=0,
                                   regression=True)

    # Persist the stacked feature sets for the meta-model stage.
    save_data(x_train_s, 'x_train_s.pkl')
    save_data(y_train, 'y_train.pkl')
    save_data(x_test_s, 'x_test_s.pkl')
    save_data(y_test, 'y_test.pkl')
"""Train an XGBoost-restricted TPOT AutoML regressor on the job data.

Extracts and transforms the dataset, splits it (random_state=2468, the shared
shuffle used by the individual models), fits a TPOTRegressor constrained to
the XGBoost config dict, and reports the training R2 score.
"""
import pandas as pd
import numpy as np
from ai_cloud_etl import data_extract_e, data_filter, feature_eng, feature_transform, extract_queues, fit_labels
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tpot import TPOTRegressor
import xgb_config

# Data Extraction
df = data_extract_e('e_20190609_15.pkl')

# Data Transformation and Engineering
df = feature_eng(df)
df = extract_queues(df)
dept_encoder, queue_encoder = fit_labels(df)
# Pass the encoders by keyword: the original positional call supplied
# (queue_encoder, dept_encoder), the reverse of the keyword order used by the
# sibling stacking script — almost certainly swapped arguments.
df = feature_transform(df, dept_encoder=dept_encoder,
                       queue_encoder=queue_encoder)

# Training/Test Split (same shuffle as the other individual-model scripts)
x, y = data_filter(df)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2468)

# Using TPOT AutoML, restricted to XGBoost pipelines via the config dict.
tpot = TPOTRegressor(n_jobs=-1, verbosity=1,
                     config_dict=xgb_config.xgb_config_dict)
tpot = tpot.fit(x_train, y_train)

# NOTE(review): this scores on the TRAINING set (y_pred from x_train), so the
# reported R2 is an optimistic training metric, not a held-out estimate.
y_pred = tpot.predict(x_train)
print('XGB TPOT training R2 score: ', r2_score(y_train, y_pred))