forked from GodelBose/Stacker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
stacker.py
138 lines (116 loc) · 5.67 KB
/
stacker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import xgboost as xgb
import numpy as np
from base_model import BaseModel
import helper
from feature_builder import FeatureBuilder
import pandas as pd
class Stacker:
    '''Stacking ensemble: base models produce predictions that feed a meta model.

    The meta model is trained on out-of-fold predictions of the base models
    (see ``generate_base_model_predictions``) and, at prediction time, consumes
    the predictions of the already fitted base models
    (see ``generate_new_base_model_predictions``).
    '''

    def __init__(self, meta_model, base_models, num_splits, feature_builder,
                 meta_model_params='', base_model_params=''):
        '''Initializes a meta stacking model:
        -----------
        meta_model: str
            Name of the meta model to be used. Follows a strict convention of TYPE-NAME
            where TYPE must be either "c" for classification or "r" for regression and NAME represents either
            xgb for XGBoost, rf for randomforest or dt for decisiontrees
        base_models: iterable of str
            Names of the base models, following the same TYPE-NAME convention.
        num_splits: int
            The amount of splits that have to be created to create the meta training data.
        feature_builder: FeatureBuilder
            Instance of FeatureBuilder class already initialized with all feature functions.
        meta_model_params: str
            Path to the params file of the meta model
        base_model_params: str or iterable of str
            Params argument handed to each base model. A single string is
            passed unchanged to every base model; any other iterable is
            paired element-wise with ``base_models``.
        Returns:
        --------
        -
        '''
        self.meta_model = BaseModel(meta_model, meta_model_params)
        base_models = list(base_models)
        if isinstance(base_model_params, str):
            # zip() over a string iterates its characters: the '' default
            # would create no base models at all and a real path would hand
            # every model a single character. Broadcast the string instead.
            base_model_params = [base_model_params] * len(base_models)
        # NOTE(review): zip() stops at the shorter input, so a params list
        # shorter than base_models still drops the surplus models silently.
        self.base_models = [
            BaseModel(model, params)
            for model, params in zip(base_models, base_model_params)
        ]
        self.num_splits = num_splits
        self.feature_builder = feature_builder

    @staticmethod
    def _predict_with_base_model(model, X):
        '''Return the prediction of one base model for X.

        ``predict_proba`` is used only for classifiers (name starting with
        "c") that are not xgb models; every other model uses ``predict``.
        '''
        is_xgb = 'xgb' in model.name
        is_classifier = model.name[0] == 'c'
        if is_xgb or not is_classifier:
            return model.predict(X)
        return model.predict_proba(X)

    def generate_base_model_predictions(self, X, y, df=None):
        '''Split the training data and create predictions for each model to create the complete meta training data.
        -----------
        X: numpy.array
            Data matrix
        y:
            labels
        df: pandas.DataFrame
            Raw DataFrame to be used if historical features need to be created.
            Anything that is not a DataFrame (e.g. None) disables them.
        Returns:
        --------
        Meta training data that contains all base model predictions for each training instance
        '''
        use_history = isinstance(df, pd.DataFrame)
        model_predictions = []
        for model in self.base_models:
            split_predictions = []
            folds = helper.split_train_validation_data(X, y, self.num_splits)
            # Each fold carries its train/validation arrays plus the
            # [index1:index2) row range that forms the validation part.
            for X_train, y_train, X_valid, y_valid, index1, index2 in folds:
                if use_history:
                    # NOTE(review): the validation slice is what gets passed
                    # as historical_df here - confirm this is intended.
                    historical_df = df[index1:index2]
                    historical_features = self.feature_builder.create_historical_features(df, historical_df)
                    train_historical_features = np.concatenate(
                        [historical_features[:index1], historical_features[index2:]],
                        axis=0,
                    )
                    validation_historical_features = historical_features[index1:index2]
                    X_train = np.hstack([X_train, train_historical_features])
                    X_valid = np.hstack([X_valid, validation_historical_features])
                model.fit(X_train, y_train)
                split_predictions.append(self._predict_with_base_model(model, X_valid))
            # Fold predictions are stacked row-wise, models column-wise.
            # NOTE(review): presumably base models return 2-D predictions of
            # shape (n_samples, k) - TODO confirm against BaseModel.
            model_predictions.append(np.vstack(split_predictions))
        return np.hstack(model_predictions)

    def generate_new_base_model_predictions(self, X, df=None, historical_df=None):
        '''Create the meta features for new data from the already fitted base models.
        -----------
        X: numpy.array
            Data matrix
        df: pandas.DataFrame
            Raw DataFrame used to create the historical features. Anything
            that is not a DataFrame (e.g. None) means X is used as is.
        historical_df: pandas.DataFrame
            Raw DataFrame containing the historical knowledge.
        Returns:
        --------
        All base model predictions for X, stacked horizontally
        '''
        if isinstance(df, pd.DataFrame):
            # The historical features do not depend on the model, so they are
            # built once instead of once per base model.
            historical_features = self.feature_builder.create_historical_features(df, historical_df)
            X_model = np.hstack([X, historical_features])
        else:
            # Without a DataFrame there is nothing to append; the model input
            # used to be left undefined in this case, which made predict(X)
            # crash.
            X_model = X
        model_predictions = [
            self._predict_with_base_model(model, X_model)
            for model in self.base_models
        ]
        return np.hstack(model_predictions)

    def fit(self, X, y, df=None):
        '''Fit the meta model on the predictions made by all base models.
        -----------
        X: numpy.array
            Data matrix
        y: numpy.array
            labels
        df: pd.DataFrame
            Raw DataFrame to be used for creating the features that have to be created using historical knowledge.
        Returns:
        --------
        None. The base models and the meta model are fitted in place.
        '''
        X_train_meta = self.generate_base_model_predictions(X, y, df=df)
        self.meta_model.fit(X_train_meta, y)

    def predict(self, X, df=None, historical_df=None):
        '''Predict with the meta model on the predictions made by all base models.
        -----------
        X: numpy.array
            Data matrix
        df: pd.DataFrame
            Raw DataFrame to be used for creating the features that have to be created using historical knowledge.
        historical_df: pd.DataFrame
            Raw DataFrame containing the historical knowledge to create the predictions.
        Returns:
        --------
        Predictions of the meta model for X
        '''
        X_meta = self.generate_new_base_model_predictions(X, df, historical_df)
        return self.meta_model.predict(X_meta)