def ImputeVoteClassifier(self, data, target_name):
    """Impute the missing values of one feature by majority vote.

    Rows where ``target_name`` is not null form the training set; rows where
    it is null are the ones to impute. Copies (via ``make_copy``) of
    ``self.impute_etc``, ``self.impute_dtc`` and ``self.impute_rfc`` are
    fitted on the encoded training rows, and for every row to impute the
    value returned by ``mode`` over the three predictions is kept.

    Parameters
    ----------
    data : pandas DataFrame
        Holds the predictors and the ``target_name`` column.
    target_name : str
        Column whose null entries are predicted.

    Returns
    -------
    numpy.ndarray
        One voted prediction per row with a null target, in row order.
        An empty float array when no row needs imputing.
    """
    print("*" * 100 + "\n")
    print("Start imputing missing values for feature: {} \n".format(
        target_name))
    start = time.time()

    # Training set: rows where the target is known.
    print("Generating training set...")
    train_data = data[data[target_name].notnull()].copy()
    train_target = train_data[target_name]
    train_data.drop(columns=[target_name], inplace=True)
    encoded_train = self.OnehotEncode(
        train_data, train_data.select_dtypes("category").columns)
    print("Done generating training set \n")

    # Testing set: rows where the target is missing.
    print("Generating testing set...")
    test_data = data[data[target_name].isnull()].copy()
    n_missing = len(test_data)
    # Drop target var in testing set
    test_data.drop(columns=[target_name], inplace=True)
    encoded_test = self.OnehotEncode(
        test_data, test_data.select_dtypes("category").columns)
    print("Done generating testing set \n")

    # Fit a private copy of each base classifier so the templates stored on
    # ``self`` are never fitted themselves.
    predictions = []
    for template in (self.impute_etc, self.impute_dtc, self.impute_rfc):
        clf = make_copy(template)
        print("Fitting data into {}...".format(clf.__class__.__name__))
        clf.fit(encoded_train, train_target)
        predictions.append(clf.predict(encoded_test))
    etc_pred, dtc_pred, rfc_pred = predictions

    # Vote row by row. Votes are gathered in a list and concatenated once
    # instead of re-allocating the result array on every np.append call.
    # Indexing by position keeps the original failure mode (IndexError) if a
    # classifier returns fewer predictions than there are rows to impute.
    print("Voting final predictions...")
    votes = [
        np.ravel(mode([etc_pred[i], dtc_pred[i], rfc_pred[i]])[0])
        for i in range(n_missing)
    ]
    # The leading empty array reproduces the dtype promotion (and the empty
    # result) of the original "start from np.array([]) and append" loop.
    final_pred = np.concatenate([np.array([])] + votes)

    print(
        "Done voting and dumping final predictions into feature: {}. Time taken = {:.1f}(s) \n"
        .format(target_name, time.time() - start))
    print("\n" + "*" * 100)

    return final_pred
def hold_out_predict(clf, X, y, cv):
    """Build out-of-fold class-probability features for stacking.

    For every (train, hold-out) split produced by ``cv``, a copy of ``clf``
    (via ``make_copy``) is fitted on the training rows and its
    ``predict_proba`` output is written into the hold-out rows of the result.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``.
    X : feature matrix; a NumPy array or a pandas object (anything exposing
        ``.iloc`` is indexed positionally through it).
    y : targets aligned with the rows of ``X``.
    cv : splitter exposing ``get_n_splits(X, y)`` and ``split(X, y)``.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_classes)
        Rows that never appear in a hold-out fold stay at zero.
    """

    def _rows(values, idx):
        # cv.split yields positional indices; plain [] on a DataFrame would
        # select columns and on a Series would look up labels.
        if hasattr(values, "iloc"):
            return values.iloc[idx]
        return values[idx]

    # Assuming that training data contains all classes
    n_classes = len(np.unique(y))
    meta_features = np.zeros((X.shape[0], n_classes))
    n_splits = cv.get_n_splits(X, y)

    print("Starting hold out prediction with {} splits for {}.".format(
        n_splits, clf.__class__.__name__))

    for fold, (train_idx, hold_out_idx) in enumerate(cv.split(X, y)):
        # Fit a fresh copy on K-1 parts and predict on the held-out part.
        est = make_copy(clf)
        est.fit(_rows(X, train_idx), _rows(y, train_idx))
        y_hold_out_pred = est.predict_proba(_rows(X, hold_out_idx))
        print("Loop nb " + str(fold))

        # Fill in meta features
        meta_features[hold_out_idx] = y_hold_out_pred

    return meta_features
def __init__(self,
             base_estimators=[Regressor(strategy="XGBoost"),
                              Regressor(strategy="RandomForest"),
                              Regressor(strategy="ExtraTrees")],
             level_estimator=LinearRegression(),
             n_folds=5,
             copy=False,
             random_state=1,
             verbose=True):
    """Validate and store the stacking configuration.

    Parameters
    ----------
    base_estimators : list
        Base estimators. Each element is duplicated with ``make_copy`` and
        the copies are stored in a new list.
    level_estimator : object
        Stored as given (no copy is made).
    n_folds : int
    copy : bool
    random_state : int or None
    verbose : bool

    Raises
    ------
    ValueError
        If an argument does not have exactly the expected type. The checks
        compare ``type(...)`` directly, so e.g. a bool is rejected for
        ``n_folds``.
    """
    if type(base_estimators) is not list:
        raise ValueError("base_estimators must be a list")
    # Build a fresh list rather than overwriting the argument in place: the
    # default list is created once at definition time, so writing into it
    # would swap out the estimators held by every instance that was built
    # with the defaults (and would mutate any list passed by a caller).
    self.base_estimators = [make_copy(est) for est in base_estimators]

    self.level_estimator = level_estimator

    self.n_folds = n_folds
    if type(n_folds) is not int:
        raise ValueError("n_folds must be an integer")

    self.copy = copy
    if type(copy) is not bool:
        raise ValueError("copy must be a boolean")

    self.random_state = random_state
    if type(self.random_state) is not int and self.random_state is not None:
        raise ValueError("random_state must be either None or an integer")

    self.verbose = verbose
    if type(self.verbose) is not bool:
        raise ValueError("verbose must be a boolean")

    # Nothing has been fitted yet.
    self.__fitOK = False
    self.__fittransformOK = False
def __init__(self,
             base_estimators=[Classifier(strategy="XGBoost"),
                              Classifier(strategy="RandomForest"),
                              Classifier(strategy="ExtraTrees")],
             level_estimator=LogisticRegression(n_jobs=-1),
             n_folds=5,
             copy=False,
             drop_first=True,
             random_state=1,
             verbose=True):
    """Validate and store the stacking configuration.

    Parameters
    ----------
    base_estimators : list
        Base estimators. Each element is duplicated with ``make_copy`` and
        the copies are stored in a new list.
    level_estimator : object
        Stored as given (no copy is made).
    n_folds : int
    copy : bool
    drop_first : bool
    random_state : int or None
    verbose : bool

    Raises
    ------
    ValueError
        If an argument does not have exactly the expected type. The checks
        compare ``type(...)`` directly, so e.g. a bool is rejected for
        ``n_folds``.
    """
    if type(base_estimators) is not list:
        raise ValueError("base_estimators must be a list")
    # Build a fresh list rather than overwriting the argument in place: the
    # default list is created once at definition time, so writing into it
    # would swap out the estimators held by every instance that was built
    # with the defaults (and would mutate any list passed by a caller).
    self.base_estimators = [make_copy(est) for est in base_estimators]

    self.level_estimator = level_estimator

    self.n_folds = n_folds
    if type(self.n_folds) is not int:
        raise ValueError("n_folds must be an integer")

    self.copy = copy
    if type(self.copy) is not bool:
        raise ValueError("copy must be a boolean")

    self.drop_first = drop_first
    if type(self.drop_first) is not bool:
        raise ValueError("drop_first must be a boolean")

    self.random_state = random_state
    # Logical `and` with an identity test against None (instead of bitwise
    # `&` with `!= None`), matching the regressor counterpart.
    if type(self.random_state) is not int and self.random_state is not None:
        raise ValueError("random_state must be either None or an integer")

    self.verbose = verbose
    if type(self.verbose) is not bool:
        raise ValueError("verbose must be a boolean")

    # Nothing has been fitted yet.
    self.__fitOK = False
    self.__fittransformOK = False
def set_params(self, **params):
    """Update stacking-level parameters, then delegate the rest to the parent.

    Cached meta features and the fit/transform flags are reset first. The
    stacking-specific keys (``base_estimators``, ``base_cv``,
    ``base_scoring``, ``base_copy_idx``, ``base_save``, ``base_save_files``,
    ``stacking_verbose``) are consumed here; whatever remains in ``params``
    is forwarded to the parent class's ``set_params``.

    Raises
    ------
    ValueError
        If one of the stacking-level values fails its type or size check.
    """
    # Any parameter change invalidates previously computed meta features.
    self.__X_meta_test = None
    self.__X_meta_train = None
    self.__fittransformOK = False
    self.__transformOK = False

    if 'base_estimators' in params:
        self.base_estimators = params.pop('base_estimators')
        if type(self.base_estimators) != list:
            raise ValueError("base_estimators must be a list.")
        for i, est in enumerate(self.base_estimators):
            if type(est) == tuple:
                # Tuples are kept as they are.
                continue
            if isinstance(est, (Regressor, RegressorStacking)):
                self.base_estimators[i] = make_copy(est)
            else:
                raise ValueError("Elements of base_estimators must be either Regressor, RegressorStacking or tuple.")

    if 'base_cv' in params:
        self.base_cv = params.pop('base_cv')

    if 'base_scoring' in params:
        self.base_scoring = params.pop('base_scoring')

    if 'base_copy_idx' in params:
        self.base_copy_idx = params.pop('base_copy_idx')
        if type(self.base_copy_idx) != list and self.base_copy_idx is not None:
            raise ValueError("base_copy_idx must be either None or a list of integers.")

    if 'base_save' in params:
        self.base_save = params.pop('base_save')
        if type(self.base_save) != bool:
            raise ValueError("base_save must be a boolean.")

    if 'base_save_files' in params:
        # Consume the key like every other stacking-level parameter so it is
        # not forwarded to the parent as well.
        self.base_save_files = params.pop('base_save_files')
        if type(self.base_save_files) != list and self.base_save_files is not None:
            raise ValueError("base_save_files must be either None or a list of tuples.")
        if self.base_save_files is not None:
            # Was `self.base_save_file` (missing "s"), which raised
            # AttributeError instead of performing the size check.
            if len(self.base_save_files) != len(self.base_estimators):
                raise ValueError("base_save_files must be the same size as base_estimators.")

    if 'stacking_verbose' in params:
        self.stacking_verbose = params.pop('stacking_verbose')
        if type(self.stacking_verbose) != bool:
            raise ValueError("stacking_verbose must be a boolean.")

    super(RegressorStacking, self).set_params(**params)
def update(self, other: 'GraphElement') -> 'GraphElement':
    """Overwrite this element's properties with those defined on ``other``.

    Only the properties reported by ``other.defined()`` are taken over;
    everything else on ``self`` is left untouched. Each value is duplicated
    with ``make_copy`` (per the original note, a shallow copy only) before
    being stored.

    Both elements must be of exactly the same type. Returns ``self``.
    """
    assert type(self) is type(other)

    replacements = {
        prop_name: make_copy(other.property_value(prop_name))
        for prop_name in other.defined()
    }
    self.props = self.props._replace(**replacements)
    return self
def feature_select(self, best_iteration, X_train, y_train, X_test, y_test):
    """Report test accuracy/AUC for each feature-importance threshold.

    ``self.model`` is fitted on the full training set and its feature
    importances are written to ``feature_importance.xlsx``. Then, for every
    non-zero importance value (largest first), the features at or above that
    threshold are selected with ``SelectFromModel``, a copy of the model
    (taken before fitting, via ``make_copy``) is refitted on the reduced
    training set, and its accuracy and AUC on the reduced test set are
    printed.

    Parameters
    ----------
    best_iteration : value assigned to the model's ``n_estimators`` when the
        model exposes that parameter.
    X_train : training features; must expose ``.columns``.
    y_train, X_test, y_test : remaining train/test data.

    Returns
    -------
    None. Results are printed and the importance table is written to disk.
    """
    if "n_estimators" in self.model.get_params().keys():
        self.model.set_params(n_estimators=best_iteration)

    # Copy taken before fitting; refitted inside the loop on each subset.
    selection_model = make_copy(self.model)

    # Fit the reference model on all features.
    self.model.fit(X_train, y_train)
    importances = self.model.feature_importances_

    # Dump feature importance df
    feature_importance = pd.DataFrame(
        importances,
        columns=["gain_score"],
        index=X_train.columns,
    )
    feature_importance.to_excel("feature_importance.xlsx")

    # Thresholds: the non-zero importances, in descending order.
    sorted_importances = np.sort(importances)
    thresholds = sorted_importances[np.nonzero(sorted_importances)][::-1]

    # `.get` keeps the banner from raising KeyError for models without an
    # `n_estimators` parameter, a case the guard at the top already allows.
    print(
        "Test model performance on original dataset with n_estimators = {} \n".format(
            self.model.get_params().get("n_estimators")
        )
    )

    for thresh in thresholds:
        selection = SelectFromModel(self.model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)

        # Refit on the reduced feature set.
        selection_model.fit(select_X_train, y_train)

        # Evaluate on the equally reduced test set.
        select_X_test = selection.transform(X_test)
        y_pred_select = selection_model.predict_proba(select_X_test)[:, 1]
        auc_select = metrics.roc_auc_score(y_test, y_pred_select)
        accuracy_select = metrics.accuracy_score(
            y_test, selection_model.predict(select_X_test)
        )
        print(
            "Thresh={:.9f}, n={}, Accuracy: {:.2f}%, AUC: {:.2f}%".format(
                thresh,
                select_X_train.shape[1],
                accuracy_select * 100.0,
                auc_select * 100.0,
            )
        )
def data_merge(self, level_list, merge=True):
    """Collect per-level interest data and label every UID with its level.

    Each dict in ``level_list`` is filled in place with ``'data'`` and
    ``'fbid'`` obtained from ``self.get_interest(level_dic['link'])``.
    The code indexes entries 0..4, so five levels are expected.

    Parameters
    ----------
    level_list : list of dict
        One dict per level, each with a ``'link'`` key.
    merge : bool, default True
        If True, the ``'data'`` frames of levels 1..4 are left-merged onto a
        copy of level 0's frame on ``'UID'``. If False, only the last
        level's frame is used.

    Returns
    -------
    interest_strength : DataFrame
        The (merged) data with NaN replaced by 0 and an ``INTEREST_LEVEL``
        column: 5 if the UID is in the last level's ``fbid``, then 4, 3, 2
        for the preceding levels, otherwise 1.
    ids : Index
        Labels of the columns whose column-wise sum is non-zero.
    """
    print('Start getting interest data for training...')
    start = time.time()
    for level_dic in level_list:
        level_dic['data'], level_dic['fbid'] = self.get_interest(
            level_dic['link'])
    print(
        'Done getting interest data for training. Time taken = {:.1f}(s) \n'
        .format(time.time() - start))

    if merge:
        data_final = make_copy(level_list[0]['data'])
        for df in (level_list[i]['data'] for i in range(1, 5)):
            data_final = data_final.merge(df, on=['UID'], how='left')
        interest_strength = data_final.fillna(0)
    else:
        interest_strength = level_list[4]['data'].fillna(0)

    # Build each membership set once. The original rebuilt them inside the
    # mapper, i.e. for every single UID.
    levels_high_to_low = [
        (level, set(level_list[level - 1]['fbid'])) for level in (5, 4, 3, 2)
    ]

    def level_convert(fbid):
        # Highest level containing the id wins; level 1 is the fallback.
        for level, members in levels_high_to_low:
            if fbid in members:
                return level
        return 1

    interest_strength['INTEREST_LEVEL'] = interest_strength['UID'].map(
        level_convert)

    # Get ids: columns with a non-zero total.
    sum_ids = interest_strength.sum(axis=0)
    ids = sum_ids[sum_ids != 0].index

    return interest_strength, ids
def __init__(self, modelname='Linear',
             base_estimators=[Regressor(modelname="Linear"),
                              Regressor(modelname="RandomForest"),
                              Regressor(modelname="ExtraTrees")],
             num_bagged_est=None, random_state=None,
             base_cv=None, base_scoring = None,
             base_copy_idx=None, base_save=False, base_save_files=None,
             stacking_verbose=True, **kwargs):
    """Construct a stacking regressor

    A stacking regressor is a regressor that uses the predictions of
    base layer estimators (generated with a cross validation method).

    Parameters
    ----------
    modelname : str, model name to be used as stacking regressor
        Available models:
        - "XGBoost",
        - "LightGBM",
        - "Keras",
        - "RandomForest",
        - "ExtraTrees",
        - "Tree",
        - "Bagging",
        - "AdaBoost"
        - "Linear"

    base_estimators : list of estimators objects/tuples
        List of estimators to fit in the stacking level using a cross
        validation. The items of list could be:
        - Regressor/RegressorStacking objects (copied with ``make_copy``)
        - tuples with hold out and test predictions of base estimators
          (kept as given)

    num_bagged_est: int or None
        Number of estimators to be averaged after bagged fitting.
        If None then bagged fitting is not performed.

    random_state: int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance
        used by models.

    base_cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy used in hold out
        predictions. Possible inputs for cv are:
        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a StratifiedKFold,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

    base_scoring : callable, default: None
        A callable to evaluate the predictions on the cv set in hold out
        predictions. None, accuracy score

    base_copy_idx : list, default = None
        The list of original features added to meta features

    base_save : bool, default = False
        Saves hold out and test predictions of each base estimator to
        pickle files

    base_save_files : list of tuples, default = None
        File refs of saved hold out and test predictions; when given it
        must have one entry per base estimator.

    stacking_verbose : bool, default = True
        Verbose mode.

    **kwargs : default = None
        Parameters of the corresponding stacking regressor.
        Examples : n_estimators, max_depth, ...

    Raises
    ------
    ValueError
        If ``base_estimators``, ``base_copy_idx``, ``base_save``,
        ``base_save_files`` or ``stacking_verbose`` fails its type or size
        check.
    """
    super(RegressorStacking, self).__init__(modelname,
                                            num_bagged_est=num_bagged_est,
                                            random_state=random_state,
                                            **kwargs)

    self.base_estimators = base_estimators
    if type(self.base_estimators) != list:
        raise ValueError("base_estimators must be a list.")
    # NOTE(review): elements are replaced inside the list that was passed
    # in, so the caller's list, and the shared default list when the
    # argument is omitted, is modified. Left unchanged here because the
    # parent class's parameter handling is not visible; consider storing a
    # new list instead.
    for i, est in enumerate(self.base_estimators):
        if type(est) == tuple:
            # Tuples are kept as they are.
            continue
        if isinstance(est, (Regressor, RegressorStacking)):
            self.base_estimators[i] = make_copy(est)
        else:
            raise ValueError("Elements of base_estimators must be either Regressor, RegressorStacking or tuple.")

    self.base_cv = base_cv
    self.base_scoring = base_scoring

    self.base_copy_idx = base_copy_idx
    if type(self.base_copy_idx) != list and self.base_copy_idx is not None:
        raise ValueError("base_copy_idx must be either None or a list of integers.")

    self.base_save = base_save
    if type(self.base_save) != bool:
        raise ValueError("base_save must be a boolean.")

    self.base_save_files = base_save_files
    if type(self.base_save_files) != list and self.base_save_files is not None:
        raise ValueError("base_save_files must be either None or a list of tuples.")
    if self.base_save_files is not None:
        # Was `self.base_save_file` (missing "s"), which raised
        # AttributeError instead of performing the size check.
        if len(self.base_save_files) != len(self.base_estimators):
            raise ValueError("base_save_files must be the same size as base_estimators.")

    self.stacking_verbose = stacking_verbose
    if type(self.stacking_verbose) != bool:
        raise ValueError("stacking_verbose must be a boolean.")

    # No meta features computed yet.
    self.__X_meta_test = None
    self.__X_meta_train = None
    self.__fittransformOK = False
    self.__transformOK = False
def get_estimator_copy(self):
    """Return a ``make_copy`` duplicate of the private ``__classifier`` attribute."""
    estimator = self.__classifier
    return make_copy(estimator)