Example No. 1
def bagging_metaestimator(X, Y, vrbl_names, n_estimators, p_smpl, p_feat, n_jobs, base_estim):
    # Note: vrbl_names is accepted but never used inside this function.

    import numpy as np
    from sklearn.ensemble import BaggingRegressor


    # Accept pandas objects as well as plain numpy arrays
    if hasattr(X, 'values'):
        X = X.values
    if hasattr(Y, 'values'):
        Y = Y.values
    

    # At least one feature per member; e.g. p_feat=1/3 selects ~N/3 variables
    max_feats = np.max([int(X.shape[1] * p_feat), 1])
    # Overshoot the pool size: unsuccessful fits are filtered out below
    max_n_estim = n_estimators * 5

    fitted_ensemble = BaggingRegressor(
                    base_estimator=base_estim,
                    n_estimators=max_n_estim,   # Number of fittings
                    max_samples=p_smpl,         # Select e.g. 50% of training data per random sample
                    max_features=max_feats,     # Select e.g. N/3 variables randomly
                    bootstrap=False,            # Draw samples without replacement
                    bootstrap_features=False,   # Draw features without replacement
                    oob_score=False,
                    n_jobs=n_jobs,              # Number of parallel jobs
                    random_state=70,
                    verbose=1).fit(X, Y)
    

    # Keep only the members that selected at least one predictor.
    # (Reading estimator.coef_ assumes a linear base estimator.)
    final_ensemble_idx = np.zeros(max_n_estim, dtype=bool)
    for i, estimator in enumerate(fitted_ensemble.estimators_):
        true_indices = np.abs(estimator.coef_) > 0

        # Definition of success in fitting: at least one predictor
        # needs to be found
        if true_indices.sum() > 0:
            final_ensemble_idx[i] = True
    
    # Drop the failed members, then truncate the pool to the requested size
    fitted_ensemble.estimators_features_ = list(
        np.array(fitted_ensemble.estimators_features_)[final_ensemble_idx])[:n_estimators]
    fitted_ensemble.estimators_ = list(
        np.array(fitted_ensemble.estimators_)[final_ensemble_idx])[:n_estimators]
    
    return fitted_ensemble
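
A minimal usage sketch for bagging_metaestimator. Everything here is illustrative: the Lasso base estimator is an assumption (the function reads estimator.coef_, so it expects a linear model), the data is synthetic, and it assumes a scikit-learn version where BaggingRegressor still accepts the base_estimator keyword (renamed to estimator in newer releases).

import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(200, 12)),
                      columns=['x%d' % i for i in range(12)])
y_demo = pd.Series(2.0 * X_demo['x0'] + rng.normal(scale=0.1, size=200))

ensemble = bagging_metaestimator(
    X_demo, y_demo, vrbl_names=list(X_demo.columns),
    n_estimators=20,            # size of the final, filtered ensemble
    p_smpl=0.5,                 # each member sees 50% of the rows
    p_feat=1 / 3,               # each member sees ~N/3 features
    n_jobs=1, base_estim=Lasso(alpha=0.05))
print(len(ensemble.estimators_), 'members retained')

The class methods below (bagging_test, fit, predict) come from a meta-learning class whose module-level imports are not shown. A plausible set, where the deslib paths for KNORAE/OLA and the statistics.mean source are assumptions:

import sys
import time
from statistics import mean

import numpy as np
import pandas as pd
import xgboost as xgb
from deslib.dcs.ola import OLA
from deslib.des.knora_e import KNORAE
from sklearn.ensemble import BaggingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder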
    def bagging_test(self, dataset, target):
        bagging_workflow_ranks = []
        if self._validateDataset(dataset, target):
            # Drop categorical (object-dtype) columns
            for f in dataset.columns:
                if dataset[f].dtype == 'object':
                    dataset = dataset.drop(columns=f, axis=1)

            meta_features_estematic = self._metafeatures(
                dataset, target, self.meta_functions,
                self.post_processing_steps)
            simpleImputer = SimpleImputer()
            X = simpleImputer.fit_transform(dataset.drop(target, axis=1))
            y = dataset[target]

            for params in self.bagging_grid:  # Parameter combinations
                for DS in self.DStechique:  # Dynamic-selection combinations
                    for pruning in self.pruning:  # Pruning-method combinations
                        for base_estimator in self.base_estimators:  # Base-algorithm combinations
                            # Skip Useless Combinations
                            if (self._skipCombination(pruning)):
                                continue

                            t = time.time()
                            # 4-fold cross-validation
                            Ranks = []
                            kf = KFold(n_splits=4)
                            for train_index, test_index in kf.split(X):
                                # Split the dataset for the current fold
                                X_train, X_test = X[train_index], X[test_index]
                                y_train, y_test = y[train_index], y[test_index]
                                y_train = y_train.reset_index(drop=True)
                                y_test = y_test.reset_index(drop=True)

                                # Create the BaggingRegressor model
                                bagging_workflow = BaggingRegressor(
                                    base_estimator=self.
                                    base_estimators[base_estimator],
                                    random_state=0,
                                    n_jobs=-1,
                                    **params)
                                # Apply Learning Algorithm
                                bagging_workflow.fit(X_train, y_train)

                                predictions = []
                                # PRUNING METHODS
                                if pruning['pruning_method'] == 1 and pruning[
                                        'pruning_cp'] != 0:
                                    # RE method
                                    # Generate predictions from each base estimator
                                    for estimator, features in zip(
                                            bagging_workflow.estimators_,
                                            bagging_workflow.
                                            estimators_features_):
                                        predictions.append(
                                            estimator.predict(
                                                X_train[:, features]))
                                    # Compute the indices of the base estimators that will remain in the bagging
                                    re_index = self._re(
                                        y_train, predictions, X_train,
                                        pruning['pruning_cp'])
                                    # The actual pruning of the bagging
                                    estimators = []
                                    for i in re_index.values():
                                        estimators.append(
                                            bagging_workflow.estimators_[i])
                                    bagging_workflow.estimators_ = estimators
                                else:
                                    # No pruning method
                                    pruning['pruning_cp'] = 0

                                # Dynamic selection
                                if DS['ds'] == -1:
                                    # KNORA-Eliminate
                                    bagging_workflow = KNORAE(bagging_workflow,
                                                              random_state=0)
                                    bagging_workflow.fit(X_train, y_train)
                                elif DS['ds'] == 1:
                                    # Overall Local Accuracy
                                    bagging_workflow = OLA(
                                        bagging_workflow, random_state=0)
                                    bagging_workflow.fit(X_train, y_train)

                                Rank_fold = mean_squared_error(
                                    bagging_workflow.predict(X_test), y_test)
                                # Save the current bagging rank
                                Ranks.append(Rank_fold)

                            # The mean of Ranks below is this bagging workflow's score
                            sys.stdout.write('\r' + "Elapsed: %.2f seconds\n" %
                                             (time.time() - t))
                            bagging_workflow_ranks.append(float(mean(Ranks)))
        return bagging_workflow_ranks
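
The pruning branch above calls self._re, whose implementation is not part of this snippet. From the call sites one can only infer that it takes the training targets, the per-estimator predictions, the training matrix and a cut point, and returns a dict whose values index the estimators to keep. A hypothetical reconstruction of such a reduced-error (RE) pruning helper, with the greedy strategy and the meaning of the cut point as assumptions:

import numpy as np

def _re_sketch(y_train, predictions, X_train, cut_point):
    """Greedy reduced-error pruning: repeatedly add the estimator that most
    lowers the sub-ensemble's mean squared error, keeping a cut_point
    fraction of the pool. X_train is accepted only to mirror the call sites."""
    y = np.asarray(y_train, dtype=float)
    preds = np.asarray(predictions, dtype=float)  # (n_estimators, n_samples)
    n_keep = max(1, int(round(len(preds) * cut_point)))
    kept, running_sum = [], np.zeros_like(y)
    candidates = list(range(len(preds)))
    while len(kept) < n_keep:
        # Error of the sub-ensemble if candidate i were added next
        errors = [np.mean((y - (running_sum + preds[i]) / (len(kept) + 1)) ** 2)
                  for i in candidates]
        best = candidates.pop(int(np.argmin(errors)))
        running_sum += preds[best]
        kept.append(best)
    # The calling code iterates re_index.values(), so return a dict
    return {rank: idx for rank, idx in enumerate(kept)}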
    def fit(
        self,
        datasets,  # List of datasets
        target_names):  # Target name for each dataset

        # For each file, open the CSV and load it into an array of DataFrames
        x_meta = [
        ]  # Holds all meta-features; each row is one example of an algorithm with a given set of parameters
        y_meta = [
        ]  # Holds the meta-target; each row has the 1-n evaluation of each algorithm
        # + the bagging workflow's parameters
        ndataset = 0
        t = time.time()
        for dataset, target in zip(
                datasets, target_names
        ):  # Iterate over every dataset to train the meta-model
            if self._validateDataset(dataset, target):
                ndataset = ndataset + 1
                if not self.silence:
                    print(
                        "________________________________________________________________________"
                    )
                    print("Dataset nº ", ndataset)
                    print("Shape: {}(examples, features)".format(
                        np.shape(dataset)))
                # Number of Bagging Workflows
                indexBagging = 1
                indexMaxBagging = 0
                for params in self.bagging_grid:  # Parameter combinations
                    for DS in self.DStechique:  # Dynamic-selection combinations
                        for pruning in self.pruning:  # Pruning-method combinations
                            for base_estimator in self.base_estimators:  # Base-algorithm combinations
                                # Skip Useless Combinations
                                if (self._skipCombination(pruning)):
                                    continue
                                indexMaxBagging = indexMaxBagging + 1
                # Time
                t = time.time()
                # Drop categorical features; sklearn's DecisionTree does not accept them
                for f in dataset.columns:
                    if dataset[f].dtype == 'object':
                        if type(dataset[f]) != pd.core.series.Series:
                            dataset = dataset.drop(columns=f, axis=1)
                        else:
                            dataset[f] = pd.to_numeric(dataset[f],
                                                       errors='coerce')

                # MetaFeatures
                meta_features_estematic = self._metafeatures(
                    dataset, target, self.meta_functions,
                    self.post_processing_steps)

                # Convert +/- Inf to NaN (replace is not in-place, so assign back)
                dataset = dataset.replace([np.inf, -np.inf], np.nan)
                # Drop columns where every value is NaN
                dataset = dataset.dropna(axis=1, how='all')
                # Drop examples with any NaN value
                dataset = dataset.dropna(axis=0, how='any')
                dataset = dataset.reset_index(drop=True)

                # Split the dataset into examples and targets
                simpleImputer = SimpleImputer()
                X = simpleImputer.fit_transform(dataset.drop(target, axis=1))
                y = dataset[target]

                # Create the base models
                for params in self.bagging_grid:  # Parameter combinations
                    for DS in self.DStechique:  # Dynamic-selection combinations
                        for pruning in self.pruning:  # Pruning-method combinations
                            for base_estimator in self.base_estimators:  # Base-algorithm combinations
                                # Skip Useless Combinations
                                if (self._skipCombination(pruning)):
                                    continue
                                sys.stdout.write(
                                    '\r' +
                                    "Creating Baggings Workflows... [{}/{}]".
                                    format(indexBagging, indexMaxBagging))
                                meta_features = meta_features_estematic.copy(
                                )  # The dataset's meta-features are computed only once

                                # 4-fold cross-validation
                                Ranks = []
                                kf = KFold(n_splits=4)
                                for train_index, test_index in kf.split(X):
                                    # Split the set for the current fold
                                    X_train, X_test = X[train_index], X[
                                        test_index]
                                    y_train, y_test = y[train_index], y[
                                        test_index]
                                    y_train = y_train.reset_index(drop=True)
                                    y_test = y_test.reset_index(drop=True)

                                    # Create the model
                                    bagging_workflow = BaggingRegressor(
                                        base_estimator=self.
                                        base_estimators[base_estimator],
                                        random_state=0,
                                        n_jobs=-1,
                                        **params)
                                    # Train the model
                                    bagging_workflow.fit(X_train, y_train)

                                    predictions = []
                                    # PRUNING METHODS
                                    if pruning[
                                            'pruning_method'] == 1 and pruning[
                                                'pruning_cp'] != 0:
                                        # Generate predictions from every base estimator
                                        for estimator, features in zip(
                                                bagging_workflow.estimators_,
                                                bagging_workflow.
                                                estimators_features_):
                                            predictions.append(
                                                estimator.predict(
                                                    X_train[:, features]))
                                        re_index = self._re(
                                            y_train, predictions, X_train,
                                            pruning['pruning_cp'])
                                        # Pruning the bagging_workflow
                                        estimators = []
                                        for i in re_index.values():
                                            estimators.append(bagging_workflow.
                                                              estimators_[i])
                                        bagging_workflow.estimators_ = estimators
                                    else:
                                        pruning['pruning_cp'] = 0
                                    # Dynamic selection
                                    if DS['ds'] == -1:
                                        # KNORA-Eliminate
                                        bagging_workflow = KNORAE(
                                            bagging_workflow, random_state=0)
                                        bagging_workflow.fit(X_train, y_train)
                                        Rank_fold = mean_squared_error(
                                            bagging_workflow.predict(X_test),
                                            y_test)
                                    elif DS['ds'] == 1:
                                        # Overall Local Accuracy
                                        bagging_workflow = OLA(
                                            bagging_workflow, random_state=0)
                                        bagging_workflow.fit(X_train, y_train)
                                        Rank_fold = mean_squared_error(
                                            bagging_workflow.predict(X_test),
                                            y_test)
                                    else:
                                        # Landmark score of the current bagging workflow
                                        Rank_fold = mean_squared_error(
                                            bagging_workflow.predict(X_test),
                                            y_test)
                                    Ranks.append(Rank_fold)

                                # Append this bagging workflow's characteristics to the meta-features
                                meta_features['n_estimators'] = params[
                                    'n_estimators']
                                meta_features['pruning_method'] = pruning[
                                    'pruning_method']
                                meta_features['pruning_cp'] = pruning[
                                    'pruning_cp']
                                meta_features['ds'] = DS['ds']
                                meta_features[
                                    'Algorithm'] = self.estimators_switcher[
                                        base_estimator]
                                # This array is the meta-target: the score of each workflow
                                y_meta.append(float(mean(Ranks)))

                                # This array holds the dataset's meta-features plus the base algorithm/parameters under test
                                x_meta.append(meta_features)
                                indexBagging = indexBagging + 1
                sys.stdout.write(
                    '\r' + "                                                ")
                sys.stdout.write('\r' + "Elapsed: %.2f seconds\n" %
                                 (time.time() - t))
                # Backup Data
                pd.DataFrame(x_meta).to_csv(
                    "./metadata/Meta_Data_Regressor_backup.csv")
                pd.DataFrame(y_meta).to_csv(
                    "./metadata/Meta_Target_Regressor_backup.csv")
        if not self.silence:
            print(
                "________________________________________________________________________"
            )
        # The meta-data joins all meta-features with the scores of the respective base algorithms
        self.meta_data = pd.DataFrame(x_meta)
        self.meta_target = np.array(y_meta)
        # Save the meta-data to a .CSV file
        self.meta_data.to_csv('./metadata/Meta_Data_Regressor.csv')
        pd.DataFrame(
            self.meta_target).to_csv('./metadata/Meta_Target_Regressor.csv')
        if not self.silence:
            print("Meta-Data Created and Saved.")
        # Prepare the data for XGBoost (label-encode object columns)
        for f in self.meta_data.columns:
            if self.meta_data[f].dtype == 'object':
                lbl = LabelEncoder()
                lbl.fit(list(self.meta_data[f].values))
                self.meta_data[f] = lbl.transform(
                    list(self.meta_data[f].values))

        self.meta_data.fillna((-999), inplace=True)
        self.meta_data = np.array(self.meta_data)
        self.meta_data = self.meta_data.astype(float)
        if not self.silence:
            print("Constructing Meta-Model:")
        # Create the XGBoost meta-model
        self.meta_model = xgb.XGBRegressor(objective="reg:squarederror",
                                           colsample_bytree=0.3,
                                           learning_rate=0.1,
                                           max_depth=6,
                                           alpha=1,
                                           n_estimators=100,
                                           n_jobs=-1)

        # Apply the learning algorithm
        self.meta_model.fit(self.meta_data, self.meta_target)
        self.is_fitted = True
        return self
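
A sketch of driving fit(). The enclosing class is never named in this snippet, so MetaRegressor, the CSV file names and the target column names below are placeholders; fit() writes its meta-data under ./metadata/ and trains the XGBoost meta-model as shown above.

import pandas as pd

datasets = [pd.read_csv('housing.csv'), pd.read_csv('energy.csv')]  # placeholder files
targets = ['median_value', 'load']                                  # one target per dataset

meta = MetaRegressor()       # placeholder for the class these methods belong to
meta.fit(datasets, targets)  # builds ./metadata/Meta_Data_Regressor.csv and the meta-model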
    def predict(self, dataset, target):
        if self._validateDataset(dataset, target):
            # Drop categorical (object-dtype) columns
            for f in dataset.columns:
                if dataset[f].dtype == 'object':
                    dataset = dataset.drop(columns=f, axis=1)

            meta_features_estematic = self._metafeatures(
                dataset, target, self.meta_functions,
                self.post_processing_steps)
            simpleImputer = SimpleImputer()
            X = simpleImputer.fit_transform(dataset.drop(target, axis=1))
            y = dataset[target]
            bagging_combination = []  # One meta-feature row per candidate workflow
            for params in self.bagging_grid:  # Parameter combinations
                for DS in self.DStechique:  # Dynamic-selection combinations
                    for pruning in self.pruning:  # Pruning-method combinations
                        for base_estimator in self.base_estimators:  # Base-algorithm combinations
                            meta_features = meta_features_estematic.copy()
                            meta_features['n_estimators'] = params[
                                'n_estimators']
                            meta_features['pruning_method'] = pruning[
                                'pruning_method']
                            meta_features['pruning_cp'] = pruning['pruning_cp']
                            meta_features['ds'] = DS['ds']
                            meta_features[
                                'Algorithm'] = self.estimators_switcher[
                                    base_estimator]
                            meta_features = pd.DataFrame([meta_features])
                            meta_features = meta_features[[
                                'Features.Entropy.Mean',
                                'Features.Entropy.StandardDeviation',
                                'Features.Entropy.Skew',
                                'Features.Entropy.Kurtosis',
                                'Features.MutualInformation.Mean',
                                'Features.MutualInformation.StandardDeviation',
                                'Features.MutualInformation.Skew',
                                'Features.MutualInformation.Kurtosis',
                                'Features.SpearmanCorrelation.Mean',
                                'Features.SpearmanCorrelation.StandardDeviation',
                                'Features.SpearmanCorrelation.Skew',
                                'Features.SpearmanCorrelation.Kurtosis',
                                'FeaturesLabels.SpearmanCorrelation.Mean',
                                'FeaturesLabels.SpearmanCorrelation.StandardDeviation',
                                'FeaturesLabels.SpearmanCorrelation.Skew',
                                'FeaturesLabels.SpearmanCorrelation.Kurtosis',
                                'Features.Mean.Mean',
                                'Features.Mean.StandardDeviation',
                                'Features.Mean.Skew', 'Features.Mean.Kurtosis',
                                'Features.StandardDeviation.Mean',
                                'Features.StandardDeviation.StandardDeviation',
                                'Features.StandardDeviation.Skew',
                                'Features.StandardDeviation.Kurtosis',
                                'Features.Skew.Mean',
                                'Features.Skew.StandardDeviation',
                                'Features.Skew.Skew', 'Features.Skew.Kurtosis',
                                'Features.Kurtosis.Mean',
                                'Features.Kurtosis.StandardDeviation',
                                'Features.Kurtosis.Skew',
                                'Features.Kurtosis.Kurtosis',
                                'FeaturesLabels.MutualInformation.Mean',
                                'FeaturesLabels.MutualInformation.StandardDeviation',
                                'FeaturesLabels.MutualInformation.Skew',
                                'FeaturesLabels.MutualInformation.Kurtosis',
                                'Number of Examples', 'Number of Features',
                                'n_estimators', 'pruning_method', 'pruning_cp',
                                'ds', 'Algorithm'
                            ]]
                            meta_features.replace([np.inf, -np.inf],
                                                  np.nan,
                                                  inplace=True)
                            meta_features.fillna((-999), inplace=True)
                            bagging_combination.append(
                                np.array(meta_features.copy()))
            bagging_combination = np.squeeze(np.array(bagging_combination))
            scores = self.meta_model.predict(bagging_combination)
            # The meta-model predicts MSE, so the smallest (absolute) score wins
            BestScore = (np.abs(scores)).argmin()
            RecommendedBagging = bagging_combination[BestScore]
            # Build the recommended bagging workflow. The last five entries of
            # the row are, in order: n_estimators, pruning_method, pruning_cp,
            # ds, Algorithm.
            npSize = RecommendedBagging.size
            n_estimators = int(RecommendedBagging[npSize - 5])
            pruning_method = int(RecommendedBagging[npSize - 4])
            pruning_cp = float(RecommendedBagging[npSize - 3] / 100)
            ds = int(RecommendedBagging[npSize - 2])
            base_estimator = int(RecommendedBagging[npSize - 1])

            # Strings for display. In fit() and in the construction below,
            # ds == -1 selects KNORAE and ds == 1 selects OLA, so the labels
            # follow that mapping.
            if pruning_method == 1:
                pruning_method_str = 'RE'
            else:
                pruning_method_str = 'None'
            if ds > 0.5:
                ds_str = 'OLA'
            elif ds < -0.5:
                ds_str = 'KNORAE'
            else:
                ds_str = 'None'
            if not self.silence:
                print("Recommended Bagging workflow: ")
                print("\tNumber of models: ", n_estimators)
                if pruning_method != 0:
                    print("\tPruning Method: ", pruning_method_str)
                    print("\tPruning CutPoint: ", pruning_cp * 100)
                else:
                    print("\tPruning: ", pruning_method_str)
                print("\tDynamic Selection: ", ds_str)
                print("\tAlgorithm: ", base_estimator)

            # Build the recommended BaggingRegressor
            bagging_workflow = BaggingRegressor(
                base_estimator=self.base_estimators[
                    self.estimators_switcher2[base_estimator]],
                n_estimators=n_estimators,
                random_state=0,
                n_jobs=-1)

            # Split the dataset into examples and targets
            X = SimpleImputer().fit_transform(dataset.drop(target, axis=1))
            y = dataset[target]
            X_train = X
            y_train = y
            # Train the model
            bagging_workflow.fit(X_train, y_train)
            predictions = []
            if pruning_method == 1 and pruning_cp != 0:
                if not self.silence:
                    print("Waiting for RE")
                for estimator, features in zip(
                        bagging_workflow.estimators_,
                        bagging_workflow.estimators_features_):
                    predictions.append(estimator.predict(X_train[:, features]))
                re_index = self._re(y_train, predictions, X_train, pruning_cp)
                # Pruning the bagging_workflow
                estimators = []
                for i in re_index.values():
                    estimators.append(bagging_workflow.estimators_[i])
                bagging_workflow.estimators_ = estimators
            # Dynamic selection
            if ds == -1:
                bagging_workflow = KNORAE(bagging_workflow, random_state=0)
                bagging_workflow.fit(X_train, y_train)
            elif ds == 1:
                bagging_workflow = OLA(bagging_workflow, random_state=0)
                bagging_workflow.fit(X_train, y_train)

            return bagging_workflow
        else:
            print("Error: not a regression problem")