예제 #1
0
    def ImputeVoteClassifier(self, data, target_name):
        """Impute the missing values of one feature by a three-classifier vote.

        Rows of ``data`` where ``target_name`` is not null form the training
        set; rows where it is null are the ones to predict. Copies of
        ``self.impute_etc``, ``self.impute_dtc`` and ``self.impute_rfc`` are
        fitted on the one-hot encoded training rows, each predicts the
        missing rows, and ``mode`` of the three predictions is kept per row.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame containing ``target_name``. The subsets used here are
            copied before the target column is dropped.
        target_name : str
            Column whose null entries are to be predicted.

        Returns
        -------
        numpy.ndarray
            One voted prediction per row whose ``target_name`` is null.

        Notes
        -----
        Progress messages and the elapsed time are printed to stdout.
        ``mode`` is presumably ``scipy.stats.mode`` -- confirm against the
        file's imports.
        """
        print("*" * 100 + "\n")
        print("Start imputing missing values for feature: {} \n".format(
            target_name))
        start = time.time()

        # Training set: rows where the target is known.
        print("Generating training set...")
        train_data = data[data[target_name].notnull()].copy()
        train_target = train_data[target_name]
        train_data.drop(columns=[target_name], inplace=True)
        encoded_train = self.OnehotEncode(
            train_data,
            train_data.select_dtypes("category").columns)
        print("Done generating training set \n")

        # Testing set: rows where the target is missing.
        print("Generating testing set...")
        test_data = data[data[target_name].isnull()].copy()
        n_missing = len(test_data)
        test_data.drop(columns=[target_name], inplace=True)
        encoded_test = self.OnehotEncode(
            test_data,
            test_data.select_dtypes("category").columns)
        print("Done generating testing set \n")

        # Fit a private copy of every base classifier so the configured
        # estimators stored on the instance are left untouched.
        predictions = []
        for base in (self.impute_etc, self.impute_dtc, self.impute_rfc):
            clf = make_copy(base)
            print("Fitting data into {}...".format(clf.__class__.__name__))
            clf.fit(encoded_train, train_target)
            predictions.append(clf.predict(encoded_test))

        # Majority vote per row.
        print("Voting final predictions...")
        votes = [
            mode([pred[i] for pred in predictions])[0]
            for i in range(n_missing)
        ]
        # One append onto an empty array keeps the dtype promotion of the
        # former per-row np.append while copying the data only once.
        final_pred = np.append(np.array([]), votes)
        print(
            "Done voting and dumping final predictions into feature: {}. Time taken = {:.1f}(s) \n"
            .format(target_name,
                    time.time() - start))
        print("\n" + "*" * 100)
        return final_pred
예제 #2
0
    def hold_out_predict(clf, X, y, cv):
        """Build out-of-fold class-probability features for stacking.

        For each split yielded by ``cv.split(X, y)`` a copy of ``clf`` is
        fitted on the training rows and its ``predict_proba`` output for the
        hold-out rows is written into the matching rows of the result.

        Returns an array with ``X.shape[0]`` rows and one column per distinct
        value in ``y``.
        """
        # The column count assumes every class is present in y.
        class_count = len(np.unique(y))
        oof_proba = np.zeros((X.shape[0], class_count))
        fold_total = cv.get_n_splits(X, y)

        print("Starting hold out prediction with {} splits for {}.".format(
            fold_total, clf.__class__.__name__))

        for fold_no, (fit_rows, held_rows) in enumerate(cv.split(X, y)):
            # Slice out this fold's training and hold-out parts.
            X_fit = X[fit_rows]
            y_fit = y[fit_rows]
            X_held = X[held_rows]

            # Train a fresh copy on K-1 parts, score the remaining part.
            fold_model = make_copy(clf)
            fold_model.fit(X_fit, y_fit)
            held_proba = fold_model.predict_proba(X_held)

            print("Loop nb {}".format(fold_no))

            oof_proba[held_rows] = held_proba

        return oof_proba
예제 #3
0
    def __init__(self, base_estimators=[Regressor(strategy="XGBoost"),
                                        Regressor(strategy="RandomForest"),
                                        Regressor(strategy="ExtraTrees")],
                 level_estimator=LinearRegression(), n_folds=5,
                 copy=False, random_state=1, verbose=True):
        """Validate and store the stacking configuration.

        Parameters
        ----------
        base_estimators : list
            Estimators to stack. Each element is copied with ``make_copy``
            into a new list, so neither the caller's list nor the shared
            default list is modified.
        level_estimator : object
            Stored as given (presumably the meta-level estimator -- confirm
            against the fitting code).
        n_folds : int
            Stored as given; must be exactly ``int``.
        copy : bool
            Stored as given.
        random_state : int or None
            Stored as given.
        verbose : bool
            Stored as given.

        Raises
        ------
        ValueError
            If any argument has the wrong type.
        """
        if type(base_estimators) is not list:
            raise ValueError("base_estimators must be a list")
        # Build a fresh list instead of overwriting elements in place: the
        # default argument is a single list object shared by every call.
        self.base_estimators = [make_copy(est) for est in base_estimators]

        self.level_estimator = level_estimator

        if type(n_folds) is not int:
            raise ValueError("n_folds must be an integer")
        self.n_folds = n_folds

        if type(copy) is not bool:
            raise ValueError("copy must be a boolean")
        self.copy = copy

        if type(random_state) is not int and random_state is not None:
            raise ValueError("random_state must be either None or an integer")
        self.random_state = random_state

        if type(verbose) is not bool:
            raise ValueError("verbose must be a boolean")
        self.verbose = verbose

        # Nothing has been fitted yet.
        self.__fitOK = False
        self.__fittransformOK = False
예제 #4
0
    def __init__(self,
                 base_estimators=[Classifier(strategy="XGBoost"),
                                  Classifier(strategy="RandomForest"),
                                  Classifier(strategy="ExtraTrees")],
                 level_estimator=LogisticRegression(n_jobs=-1),
                 n_folds=5, copy=False, drop_first=True,
                 random_state=1, verbose=True):
        """Validate and store the stacking configuration.

        Parameters
        ----------
        base_estimators : list
            Estimators to stack. Each element is copied with ``make_copy``
            into a new list, so neither the caller's list nor the shared
            default list is modified.
        level_estimator : object
            Stored as given (presumably the meta-level estimator -- confirm
            against the fitting code).
        n_folds : int
            Stored as given; must be exactly ``int``.
        copy : bool
            Stored as given.
        drop_first : bool
            Stored as given.
        random_state : int or None
            Stored as given.
        verbose : bool
            Stored as given.

        Raises
        ------
        ValueError
            If any argument has the wrong type.
        """
        if type(base_estimators) is not list:
            raise ValueError("base_estimators must be a list")
        # Build a fresh list instead of overwriting elements in place: the
        # default argument is a single list object shared by every call.
        self.base_estimators = [make_copy(est) for est in base_estimators]

        self.level_estimator = level_estimator

        if type(n_folds) is not int:
            raise ValueError("n_folds must be an integer")
        self.n_folds = n_folds

        if type(copy) is not bool:
            raise ValueError("copy must be a boolean")
        self.copy = copy

        if type(drop_first) is not bool:
            raise ValueError("drop_first must be a boolean")
        self.drop_first = drop_first

        # Logical `and` with an identity test replaces the former bitwise
        # `&` combined with `!= None`.
        if type(random_state) is not int and random_state is not None:
            raise ValueError("random_state must be either None or an integer")
        self.random_state = random_state

        if type(verbose) is not bool:
            raise ValueError("verbose must be a boolean")
        self.verbose = verbose

        # Nothing has been fitted yet.
        self.__fitOK = False
        self.__fittransformOK = False
예제 #5
0
    def set_params(self, **params):
        """Update stacking-specific parameters and forward the rest.

        The cached meta features and the fit/transform flags are reset on
        every call. The keys ``base_estimators``, ``base_cv``,
        ``base_scoring``, ``base_copy_idx``, ``base_save``,
        ``base_save_files`` and ``stacking_verbose`` are validated and stored
        on this instance; all remaining keys are passed to the parent class's
        ``set_params``.

        Each value is validated before it is assigned, so a rejected value
        leaves the previous setting in place.

        Raises
        ------
        ValueError
            If one of the handled parameters has an invalid type, or if
            ``base_save_files`` differs in length from ``base_estimators``.
        """
        # Any parameter change invalidates previously computed state.
        self.__X_meta_test = None
        self.__X_meta_train = None
        self.__fittransformOK = False
        self.__transformOK = False

        if 'base_estimators' in params:
            base_estimators = params.pop('base_estimators')
            if type(base_estimators) is not list:
                raise ValueError("base_estimators must be a list.")
            # Collect into a new list so the caller's list is not modified.
            prepared = []
            for est in base_estimators:
                if type(est) is tuple:
                    prepared.append(est)
                elif isinstance(est, (Regressor, RegressorStacking)):
                    prepared.append(make_copy(est))
                else:
                    raise ValueError("Elements of base_estimators must be either Regressor, RegressorStacking or tuple.")
            self.base_estimators = prepared

        if 'base_cv' in params:
            self.base_cv = params.pop('base_cv')

        if 'base_scoring' in params:
            self.base_scoring = params.pop('base_scoring')

        if 'base_copy_idx' in params:
            base_copy_idx = params.pop('base_copy_idx')
            if type(base_copy_idx) is not list and base_copy_idx is not None:
                raise ValueError("base_copy_idx must be either None or a list of integers.")
            self.base_copy_idx = base_copy_idx

        if 'base_save' in params:
            base_save = params.pop('base_save')
            if type(base_save) is not bool:
                raise ValueError("base_save must be a boolean.")
            self.base_save = base_save

        if 'base_save_files' in params:
            # Removed from params like every other handled key, so it is not
            # forwarded to the parent class.
            base_save_files = params.pop('base_save_files')
            if type(base_save_files) is not list and base_save_files is not None:
                raise ValueError("base_save_files must be either None or a list of tuples.")
            # Compared against base_estimators as updated earlier in this
            # same call, if it was supplied.
            if (base_save_files is not None
                    and len(base_save_files) != len(self.base_estimators)):
                raise ValueError("base_save_files must be the same size as base_estimators.")
            self.base_save_files = base_save_files

        if 'stacking_verbose' in params:
            stacking_verbose = params.pop('stacking_verbose')
            if type(stacking_verbose) is not bool:
                raise ValueError("stacking_verbose must be a boolean.")
            self.stacking_verbose = stacking_verbose

        super(RegressorStacking, self).set_params(**params)
예제 #6
0
    def update(self, other: 'GraphElement') -> 'GraphElement':
        """Overwrite this element's properties with those defined on ``other``.

        Properties that ``other`` leaves undefined are not touched. Each
        value is duplicated with ``make_copy`` (described by the original
        author as a shallow copy). Both elements must be of exactly the same
        type.

        Returns ``self``.
        """
        assert type(self) is type(other)
        copied = {
            prop_name: make_copy(other.property_value(prop_name))
            for prop_name in other.defined()
        }
        self.props = self.props._replace(**copied)
        return self
예제 #7
0
 def feature_select(self, best_iteration, X_train, y_train, X_test, y_test):
     """Report test accuracy and AUC for every feature-importance threshold.

     ``self.model`` is fitted on the full training data (after setting
     ``n_estimators`` to ``best_iteration`` when the model has that
     parameter). Its feature importances are written to
     ``feature_importance.xlsx``. Then, for each non-zero importance value
     taken as a threshold (largest first), a copy of the model is refitted
     on the features selected by ``SelectFromModel`` and its accuracy and
     ROC AUC on the test data are printed.

     Parameters
     ----------
     best_iteration : int
         Value assigned to ``n_estimators`` when the model exposes it.
     X_train, y_train : training features (needs ``.columns``) and labels.
     X_test, y_test : evaluation features and labels.

     Notes
     -----
     AUC uses column 1 of ``predict_proba``, so this presumably targets
     binary classification -- confirm with callers. Nothing is returned.
     """
     if "n_estimators" in self.model.get_params().keys():
         self.model.set_params(n_estimators=best_iteration)
     # Copy taken before the fit below; it is refitted on each feature subset.
     selection_model = make_copy(self.model)
     self.model.fit(X_train, y_train)

     # Dump feature importance df
     importances = self.model.feature_importances_
     feature_importance = pd.DataFrame(
         importances,
         columns=["gain_score"],
         index=X_train.columns,
     )
     feature_importance.to_excel("feature_importance.xlsx")

     # Candidate thresholds: the non-zero importances, in descending order.
     ascending = np.sort(importances)
     thresholds = ascending[np.nonzero(ascending)][::-1]
     print(
         "Test model performance on original dataset with n_estimators = {} \n".format(
             # .get(): the model may have no n_estimators (see guard above).
             self.model.get_params().get("n_estimators")
         )
     )
     for thresh in thresholds:
         selection = SelectFromModel(self.model, threshold=thresh, prefit=True)
         select_X_train = selection.transform(X_train)
         # Refit on the reduced feature set
         selection_model.fit(select_X_train, y_train)
         # Evaluate on the identically reduced test set
         select_X_test = selection.transform(X_test)
         y_pred_select = selection_model.predict_proba(select_X_test)[:, 1]
         auc_select = metrics.roc_auc_score(y_test, y_pred_select)
         accuracy_select = metrics.accuracy_score(
             y_test, selection_model.predict(select_X_test)
         )
         print(
             "Thresh={:.9f}, n={}, Accuracy:  {:.2f}%, AUC: {:.2f}%".format(
                 thresh,
                 select_X_train.shape[1],
                 accuracy_select * 100.0,
                 auc_select * 100.0,
             )
         )
    def data_merge(self, level_list, merge=True):
        """Fetch interest data per level and label every UID with a level.

        Parameters
        ----------
        level_list : list of dict
            Each dict must have a ``'link'`` key. Its ``'data'`` and
            ``'fbid'`` entries are filled in place from
            ``self.get_interest(link)``. At least five entries are required,
            since indices 0 to 4 are read.
        merge : bool, default True
            If True, the frames of entries 1-4 are left-merged on ``'UID'``
            onto a copy of entry 0's frame. If False, only entry 4's frame
            is used.

        Returns
        -------
        interest_strength : DataFrame
            The NaN-filled frame with an added ``'INTEREST_LEVEL'`` column.
            The level is 5, 4, 3 or 2 for a UID found in the ``'fbid'`` of
            entry 4, 3, 2 or 1 (checked in that order), and 1 otherwise.
        ids : Index
            Labels of the columns whose column sum is non-zero.
        """
        print('Start getting interest data for training...')
        start = time.time()
        for level_dic in level_list:
            level_dic['data'], level_dic['fbid'] = self.get_interest(
                level_dic['link'])
        print(
            'Done getting interest data for training. Time taken = {:.1f}(s) \n'
            .format(time.time() - start))

        dfs = [level_list[i]['data'] for i in range(1, 5)]
        if merge:
            data_final = make_copy(level_list[0]['data'])
            for df in dfs:
                data_final = data_final.merge(df, on=['UID'], how='left')
            # Fill once after all merges instead of after every merge.
            interest_strength = data_final.fillna(0)
        else:
            interest_strength = level_list[4]['data'].fillna(0)

        # Build each membership set once (highest level first) rather than
        # rebuilding all of them for every UID.
        level_members = [
            (level, set(level_list[level - 1]['fbid']))
            for level in (5, 4, 3, 2)
        ]

        def level_convert(fbid):
            for level, members in level_members:
                if fbid in members:
                    return level
            return 1

        interest_strength['INTEREST_LEVEL'] = interest_strength['UID'].map(
            level_convert)
        # Get ids
        sum_ids = interest_strength.sum(axis=0)
        ids = sum_ids[sum_ids != 0].index
        return interest_strength, ids
예제 #9
0
    def __init__(self, modelname='Linear',
                 base_estimators=[Regressor(modelname="Linear"),
                                  Regressor(modelname="RandomForest"),
                                  Regressor(modelname="ExtraTrees")],
                 num_bagged_est=None, random_state=None,
                 base_cv=None, base_scoring=None,
                 base_copy_idx=None, base_save=False, base_save_files=None,
                 stacking_verbose=True, **kwargs):
        """Construct a stacking regressor.

        A stacking regressor is a regressor that uses the predictions of
        base layer estimators (generated with a cross validation method).

        Parameters
        ----------
        modelname : str, model name to be used as stacking regressor
            Available models:
            - "XGBoost",
            - "LightGBM",
            - "Keras",
            - "RandomForest",
            - "ExtraTrees",
            - "Tree",
            - "Bagging",
            - "AdaBoost"
            - "Linear"
        base_estimators : list of estimators objects/tuples
            List of estimators to fit in the stacking level using a cross
            validation. The items of list could be:
            - Regressor/RegressorStacking objects (each one is copied)
            - tuples with hold out and test predictions of base estimators
            The list passed in is not modified; a new list is stored.
        num_bagged_est : int or None
            Number of estimators to be averaged after bagged fitting.
            If None then bagged fitting is not performed.
        random_state : int, RandomState instance or None, default = None
            If int, random_state is the seed used by the random number
            generator; if RandomState instance, random_state is the random
            number generator; if None, the random number generator is the
            RandomState instance used by models.
        base_cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy used in hold
            out predictions. Possible inputs for cv are:
            - None, to use the default 3-fold cross validation,
            - integer, to specify the number of folds,
            - An object to be used as a cross-validation generator.
            - An iterable yielding train, test splits.
        base_scoring : callable, default = None
            A callable to evaluate the predictions on the cv set in hold out
            predictions.
        base_copy_idx : list, default = None
            The list of original features added to meta features.
        base_save : bool, default = False
            Saves hold out and test predictions of each base estimator to
            pickle files.
        base_save_files : list of tuples, default = None
            File refs of saved hold out and test predictions. Must have the
            same length as ``base_estimators``.
        stacking_verbose : bool, default = True
            Verbose mode.
        **kwargs : default = None
            Parameters of the corresponding stacking regressor.
            Examples : n_estimators, max_depth, ...

        Raises
        ------
        ValueError
            If an argument has an invalid type, or ``base_save_files`` and
            ``base_estimators`` differ in length.
        """
        super(RegressorStacking, self).__init__(
            modelname, num_bagged_est=num_bagged_est,
            random_state=random_state, **kwargs)

        if type(base_estimators) is not list:
            raise ValueError("base_estimators must be a list.")
        # Collect into a new list: the default argument is one list object
        # shared by every call, so it must not be modified in place.
        prepared = []
        for est in base_estimators:
            if type(est) is tuple:
                prepared.append(est)
            elif isinstance(est, (Regressor, RegressorStacking)):
                prepared.append(make_copy(est))
            else:
                raise ValueError("Elements of base_estimators must be either Regressor, RegressorStacking or tuple.")
        self.base_estimators = prepared

        self.base_cv = base_cv
        self.base_scoring = base_scoring

        if type(base_copy_idx) is not list and base_copy_idx is not None:
            raise ValueError("base_copy_idx must be either None or a list of integers.")
        self.base_copy_idx = base_copy_idx

        if type(base_save) is not bool:
            raise ValueError("base_save must be a boolean.")
        self.base_save = base_save

        if type(base_save_files) is not list and base_save_files is not None:
            raise ValueError("base_save_files must be either None or a list of tuples.")
        if (base_save_files is not None
                and len(base_save_files) != len(self.base_estimators)):
            raise ValueError("base_save_files must be the same size as base_estimators.")
        self.base_save_files = base_save_files

        if type(stacking_verbose) is not bool:
            raise ValueError("stacking_verbose must be a boolean.")
        self.stacking_verbose = stacking_verbose

        # No meta features computed yet.
        self.__X_meta_test = None
        self.__X_meta_train = None
        self.__fittransformOK = False
        self.__transformOK = False
예제 #10
0
    def get_estimator_copy(self):
        """Return a ``make_copy`` duplicate of the wrapped classifier."""
        wrapped = self.__classifier
        return make_copy(wrapped)