Example #1
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.ensemble import BaggingClassifier


def decision_tree_bagging(Xtrain, Xtest, ytrain, ytest, ensemble_size=60):
    # bagging: accuracy as a function of ensemble size
    accuracies = []
    ensemble_sizes = []

    for i in range(1, ensemble_size):
        bagging = BaggingClassifier(
            base_estimator=tree.DecisionTreeClassifier(),
            n_estimators=i,
            bootstrap=True,
            max_samples=1.0,
            max_features=1.0)

        bagging.fit(Xtrain, ytrain)

        ypred = bagging.predict(Xtest)
        accuracy = np.mean(ypred == ytest)

        ensemble_sizes.append(i)
        accuracies.append(accuracy)

    plt.plot(ensemble_sizes, accuracies)
    plt.xlabel('number of estimators')
    plt.ylabel('accuracy')
    plt.grid(True)
    plt.title('Decision tree (bagging)')
    plt.show()

    print('Highest accuracy of bagging = %f' % np.max(accuracies))
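A minimal usage sketch for the function above (an illustration, not part of the original snippet), using a scikit-learn toy dataset:

# usage sketch: evaluate the bagging accuracy curve on a toy dataset
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)
decision_tree_bagging(Xtrain, Xtest, ytrain, ytest, ensemble_size=30)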
Example #2
# wrapper around scikit-learn's BaggingClassifier; SKLModel and
# make_sklearn_compat are assumed to come from the enclosing library
class BaggingClassifierImpl():
    def __init__(self, base_estimator=None, n_estimators=10, max_samples=1.0,
                 max_features=1.0, bootstrap=True, bootstrap_features=False,
                 oob_score=False, warm_start=False, n_jobs=None,
                 random_state=None, verbose=0):
        self._hyperparams = {
            'base_estimator': make_sklearn_compat(base_estimator),
            'n_estimators': n_estimators,
            'max_samples': max_samples,
            'max_features': max_features,
            'bootstrap': bootstrap,
            'bootstrap_features': bootstrap_features,
            'oob_score': oob_score,
            'warm_start': warm_start,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
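A usage sketch for the wrapper, assuming SKLModel aliases sklearn.ensemble.BaggingClassifier and make_sklearn_compat resolves in the enclosing library; the wrapper simply delegates to the underlying model:

# usage sketch (assumptions as above)
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = BaggingClassifierImpl(base_estimator=DecisionTreeClassifier(), n_estimators=10)
clf.fit(X, y)
print(clf.predict(X[:5]))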
Example #3
            # 3.1. End ###################################################################################################

            # 3.2 Instantiating the classifiers ############################################################################

            # 3.2.1. Bagging with DecisionTree ###########################################################

            # model number in the results table
            num_model = 0

            # model
            bg = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                   max_samples=pct_trainamento[i],
                                   max_features=1.0,
                                   n_estimators=qtd_modelos)
            # training the model
            bg.fit(x_train, y_train)

            # computing the prediction
            pred = bg.predict(x_test)

            # printing the results
            acuracia, auc, f1measure, gmean = printar_resultados(
                y_test, pred,
                nome_datasets[h] + '-pct-' + str(pct_trainamento[i]) +
                '- Bagging com DecisionTree [' + str(j) + ']')

            # writing out the results
            tabela.Adicionar_Sheet_Linha(num_model, j,
                                         [acuracia, auc, f1measure, gmean])

            # 3.2.1. End ###################################################################################
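printar_resultados is a project-local helper; below is a hedged sketch of the four metrics it appears to return (accuracy, AUC, F-measure, G-mean), assuming binary labels and imbalanced-learn for the G-mean. The name report_results is illustrative, not the project's:

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from imblearn.metrics import geometric_mean_score

def report_results(y_true, y_pred, label):
    # illustrative stand-in for printar_resultados
    acc = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    gmean = geometric_mean_score(y_true, y_pred)
    print('%s: acc=%.3f auc=%.3f f1=%.3f gmean=%.3f' % (label, acc, auc, f1, gmean))
    return acc, auc, f1, gmean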
Example #4
    # 3.3. End ################################################################################################

    # 3.4. Instantiating the classifiers ############################################################

    ########## instantiating the Bagging+REP model #########################################
    # model number in the results table
    num_model = 0

    # instantiating the classifier
    ensemble = BaggingClassifier(base_estimator=Perceptron(),
                                 max_samples=qtd_amostras,
                                 max_features=1.0,
                                 n_estimators=qtd_modelos)

    # training the model
    ensemble.fit(x_train, y_train)

    # pruning the ensemble (reduced-error pruning on the validation set)
    ensemble = REP(x_val, y_val, ensemble)

    # computing the ensemble diversity (note: the 'disagreement' measure is
    # requested here, despite the variable name double_fault)
    q_statistic = MedidasDiversidade('q', x_val, y_val, ensemble)
    double_fault = MedidasDiversidade('disagreement', x_val, y_val, ensemble)

    # computing the prediction
    pred = ensemble.predict(x_test)

    # printing the results
    qtd_modelos, acuracia, auc, f1measure, gmean = printar_resultados(
        y_test, pred, ensemble,
        nome_datasets[h] + '-Bagging-REP-' + validacao[k] + '[' + str(j) + ']')

    # writing out the results
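MedidasDiversidade is also project-local; here is a minimal sketch of the pairwise disagreement measure requested above, under the assumption that it averages the measure over all classifier pairs (mean_disagreement is an illustrative name):

from itertools import combinations
import numpy as np

def mean_disagreement(ensemble, x_val):
    # fraction of samples on which two members disagree, averaged over all pairs
    preds = [est.predict(x_val) for est in ensemble.estimators_]
    return np.mean([np.mean(preds[i] != preds[j])
                    for i, j in combinations(range(len(preds)), 2)])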
Example #5
class Arquitetura:
    def __init__(self, n_vizinhos):
        '''
        :n_vizinhos: number of nearest neighbors used to define the region of competence
        '''

        self.n_vizinhos = n_vizinhos

    def kDN(self, x, y):
        '''
        Computes the k-Disagreeing Neighbors (kDN) hardness of each observation in a dataset
        :param: x: data patterns
        :param: y: corresponding labels
        :return: dificuldades: vector with the hardness estimate of each instance
        '''

        # instantiating the nearest-neighbors search
        nbrs = NearestNeighbors(n_neighbors=self.n_vizinhos + 1,
                                algorithm='ball_tree').fit(x)

        # list to store the hardness values
        dificuldades = []

        # loop over every instance in the dataset
        for i in range(len(x)):

            # computing the nearest neighbors of this instance
            _, indices = nbrs.kneighbors([x[i]])

            # counting neighbors whose label differs
            cont = 0
            for j in indices[0]:
                if (j != i and y[j] != y[i]):
                    cont += 1

            # computing the fraction of disagreeing neighbors
            dificuldades.append(cont / (self.n_vizinhos + 1))

        return dificuldades

    def neighbors(self, dsel, x_query):
        '''
        Returns only the indices of the nearest neighbors
        '''

        # instantiating the nearest-neighbors search
        nbrs = NearestNeighbors(n_neighbors=self.n_vizinhos + 1,
                                algorithm='ball_tree').fit(dsel)

        # computing the nearest neighbors of the query
        _, indices = nbrs.kneighbors([x_query])

        return indices

    def hardInstances(self, x, y, limiar):
        '''
        Returns a subset containing only the hard instances (hardness above the threshold)
        :param: x: data patterns
        :param: y: corresponding labels
        :return: x_new, y_new
        '''

        # computing the hardness of each instance
        dificuldades = self.kDN(x, y)

        # lists to store the selected instances
        x_new = []
        y_new = []

        # keeping only the hard instances
        for i in range(len(dificuldades)):
            if (dificuldades[i] > limiar):
                x_new.append(x[i])
                y_new.append(y[i])

        return np.asarray(x_new), np.asarray(y_new)

    def neighborhoodDifficulty(self, dsel, x_query, H):
        '''
        Computes the hardness of the neighborhood of a query
        :dsel: dataset in which to search for neighbors
        :x_query: instance to look up
        :H: hardness values of the dataset dsel
        '''

        # obtaining the neighborhood of the example
        indices = self.neighbors(dsel, x_query)[0]

        # hardness of the region
        dificuldades = [H[i] for i in indices]

        # minimum hardness over the region
        return np.min(dificuldades)

    def defineThreshold(self, indices):
        '''
        Defines the hardness threshold
        :indices: indices of the instances that were misclassified
        '''

        # computing the neighborhood hardness of each misclassified example
        lista = []
        for i in indices:
            lista.append(
                self.neighborhoodDifficulty(self.x_train, self.x_train[i],
                                            self.H))

        return np.mean(lista)

    def fit(self, x, y):
        '''
        Trains the two-level architecture
        :x: training data
        :y: data labels
        '''

        # storing the training data
        self.x_train = x
        self.y_train = y

        # storing the hardness of each instance
        self.H = self.kDN(x, y)

        # training level 1 ###########################################
        self.levelone = KNeighborsClassifier(self.n_vizinhos)
        self.levelone.fit(x, y)

        # predicting on the training set
        y_pred = self.levelone.predict(x)

        # storing the indices of the misclassified instances
        indices = [i for i in range(len(y)) if y_pred[i] != y[i]]

        # obtaining the hardness threshold of the problem
        self.limiar = self.defineThreshold(indices)
        ###############################################################

        # training level 2 ###########################################
        # obtaining the hard instances
        x_dificeis, y_dificeis = self.hardInstances(x, y, self.limiar)

        # creating the ensemble
        self.ensemble = BaggingClassifier(base_estimator=Perceptron(),
                                          max_samples=0.9,
                                          max_features=1.0,
                                          n_estimators=100)
        self.ensemble.fit(x_dificeis, y_dificeis)

        # training the level-2 model (dynamic selection over the pool)
        self.leveltwo = KNORAU(self.ensemble.estimators_, self.n_vizinhos)
        self.leveltwo.fit(x_dificeis, y_dificeis)
        ###############################################################

    def predict_svm(self, x):
        # predicts with the level-1 model (the method name is historical)
        # to predict multiple examples
        if (len(x.shape) > 1):
            # returning all labels
            return [
                self.levelone.predict(np.array([pattern]))[0] for pattern in x
            ]

        # to predict only one example
        else:
            return self.levelone.predict(np.array([x]))[0]

    def predict_ola(self, x):
        # predicts with the level-2 model (the method name is historical)
        # to predict multiple examples
        if (len(x.shape) > 1):
            # returning all labels
            return [
                self.leveltwo.predict(np.array([pattern]))[0] for pattern in x
            ]

        # to predict only one example
        else:
            return self.leveltwo.predict(np.array([x]))[0]

    def predict_one(self, x):
        '''
        Computes the prediction for a single example
        :x: pattern to be predicted
        '''

        # neighborhood hardness of the example
        media = self.neighborhoodDifficulty(self.x_train, x, self.H)

        # routing by the hardness of the instance
        if (media >= self.limiar):
            return self.leveltwo.predict(np.array([x]))[0]
        else:
            return self.levelone.predict(np.array([x]))[0]

    def predict(self, x):
        '''
        Computes predictions for one or more examples
        :x: pattern(s) to be predicted
        '''

        # to predict multiple examples
        if (len(x.shape) > 1):
            # returning all labels
            return [self.predict_one(pattern) for pattern in x]

        # to predict only one example
        else:
            return self.predict_one(x)
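A usage sketch for the class above on synthetic data; it assumes the KNORAU used in fit follows deslib's interface (pool first, then k) and that the module's own imports resolve:

# usage sketch (assumptions as above)
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
arq = Arquitetura(n_vizinhos=7)
arq.fit(X[:400], y[:400])
print('accuracy: %.3f' % np.mean(arq.predict(X[400:]) == y[400:]))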
Example #6
# from sklearn.ensemble import AdaBoostClassifier as Boost
from sklearn.ensemble import BaggingClassifier as Boost
from sklearn.naive_bayes import GaussianNB

from csxdata import CData

from SciProjects.grapes import path, indepsn

if __name__ == '__main__':

    data = CData(path,
                 indepsn,
                 feature="evjarat",
                 headers=1,
                 cross_val=0.2,
                 lower=True)
    data.transformation = "std"
    model = Boost(GaussianNB(), n_estimators=100)

    model.fit(data.learning, data.lindeps)
    preds = model.predict(data.testing)
    eq = [left == right for left, right in zip(preds, data.tindeps)]
    print("Acc:", sum(eq) / len(eq))
Example #7
# assumed imports; Tabela_excel, printar_resultados, executar_modelo and the
# Arquitetura class are project-local helpers from the surrounding module
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import Perceptron


def main():

    # 1. Defining the experiment variables ##############################################################################
    qtd_modelos = 100
    qtd_execucoes = 30
    qtd_amostras = 0.9
    qtd_folds = 10
    n_vizinhos = 7
    nome_datasets = ['kc1', 'kc2']
    # 1. End ############################################################################################################

    # loop over the datasets
    for h in range(len(nome_datasets)):
    
        # 2. Reading the datasets ##########################################################################################
        # reading the dataset
        data = pd.read_csv('dataset/'+nome_datasets[h]+'.csv')
        
        # obtaining the patterns and their respective labels
        df_x = np.asarray(data.iloc[:,0:-1])
        df_y = np.asarray(data.iloc[:,-1])
        
        
        # 2.1. Creating the table to store the results ################################################
        # creating the table that will hold the results
        tabela = Tabela_excel()
        tabela.Criar_tabela(nome_tabela='arquivos_lista03/'+nome_datasets[h], 
                            folhas=['OLA', 'LCA', 'KNORA-E', 'KNORA-U', 'Arquitetura'], 
                            cabecalho=['acuracy', 'auc', 'fmeasure', 'gmean'], 
                            largura_col=5000)
        # 2.1. End #####################################################################################
        # 2. End ############################################################################################################
        
        # running the algorithms qtd_execucoes times
        for j in range(qtd_execucoes):
            
            # 3. Splitting the data into training and test sets #############################################################
            # splitting the dataset without overlap: 90% for training, 10% for testing
            skf = StratifiedKFold(n_splits=qtd_folds, shuffle=True)

            # taking the indices for training and testing
            train_index, test_index = next(iter(skf.split(df_x, df_y)))
                        
            # obtaining the training and test sets
            x_train = df_x[train_index]
            y_train = df_y[train_index]
            x_test = df_x[test_index]
            y_test = df_y[test_index]
            # 3. End #########################################################################################################
            
            
            # 4. Generating the pool of classifiers #########################################################################
            # instantiating the classifier
            ensemble = BaggingClassifier(base_estimator=Perceptron(),
                                         max_samples=qtd_amostras,
                                         max_features=1.0,
                                         n_estimators=qtd_modelos)
                    
            # training the model
            ensemble.fit(x_train, y_train)
            # 4. End  ########################################################################################################
            
            # 5. Instantiating the classifiers ##########################################################
            
            ################################### OLA ########################################################
            executar_modelo('OLA', x_train, y_train, x_test, y_test, ensemble.estimators_, n_vizinhos, nome_datasets, h, j, tabela)
            ################################################################################################
            
            ################################### LCA ########################################################
            executar_modelo('LCA', x_train, y_train, x_test, y_test, ensemble.estimators_, n_vizinhos, nome_datasets, h, j, tabela)
            ################################################################################################
            
            ################################### KNORAE #####################################################
            executar_modelo('KNORAE', x_train, y_train, x_test, y_test, ensemble.estimators_, n_vizinhos, nome_datasets, h, j, tabela)
            ################################################################################################
            
            ################################### KNORAU #####################################################
            executar_modelo('KNORAU', x_train, y_train, x_test, y_test, ensemble.estimators_, n_vizinhos, nome_datasets, h, j, tabela)
            ################################################################################################
            
            ################################### Arquitetura ################################################
            # instantiating the method
            arq = Arquitetura(n_vizinhos)
            # training the method
            arq.fit(x_train, y_train)
            # computing the prediction
            pred = arq.predict(x_test)
            # printing the results
            nome = 'Arquitetura'
            acuracia, auc, f1measure, gmean = printar_resultados(
                y_test, pred, nome_datasets[h] + '-' + nome + '-[' + str(j) + ']')
            # writing out the results
            tabela.Adicionar_Sheet_Linha(4, j, [acuracia, auc, f1measure, gmean])
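executar_modelo is project-local; the sketch below shows a comparable runner built on deslib's published API (an assumption about what the helper does, minus the spreadsheet bookkeeping; run_ds_method is an illustrative name):

from deslib.dcs.ola import OLA
from deslib.dcs.lca import LCA
from deslib.des.knora_e import KNORAE
from deslib.des.knora_u import KNORAU

def run_ds_method(nome, x_train, y_train, x_test, y_test, pool, k):
    # dynamic-selection runner; deslib methods take the pool first, then k
    metodo = {'OLA': OLA, 'LCA': LCA, 'KNORAE': KNORAE, 'KNORAU': KNORAU}[nome]
    ds = metodo(pool, k=k)
    ds.fit(x_train, y_train)
    return ds.score(x_test, y_test)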
Example #8
import pandas as pd
from sklearn import tree
from sklearn.ensemble import BaggingClassifier

train = pd.read_csv("train.csv")

train.drop(columns=['Cabin'], inplace=True)

# impute Age, then drop any remaining rows with missing values
train = train.fillna({'Age': 30})
train = train.dropna()
y = train['Survived']
train.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket'], inplace=True)
X = pd.get_dummies(train)

bag_clf = BaggingClassifier(
    tree.DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=200,
    bootstrap=True,  # True => bagging, False => pasting
    n_jobs=-1  # use all cores
)

bag_clf.fit(X, y)

test = pd.read_csv('test.csv')
ids = test[['PassengerId']]
test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)
test.fillna(2, inplace=True)
test = pd.get_dummies(test)
# align the test columns with the training matrix (get_dummies may differ)
test = test.reindex(columns=X.columns, fill_value=0)
predictions = bag_clf.predict(test)
results = ids.assign(Survived=predictions)
results.to_csv('titanic_result_bagging.csv', index=False)
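An optional sanity check before submitting the predictions (illustrative; standard scikit-learn cross-validation on the training matrix):

from sklearn.model_selection import cross_val_score
print('CV accuracy: %.3f' % cross_val_score(bag_clf, X, y, cv=5).mean())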
Example #9
                plt.xlabel('Relative Importance')
                plt.title('Variable Importance based on bagging method')
                plt.show()
            except Exception:
                print('feature importances not available to display')

    if 0:
        print(' '.join(['*' * 25, 'Bagging with SVC', '*' * 25, '\n']))
        from sklearn.ensemble import BaggingClassifier
        clf_svm0 = SVC(C=10, kernel='rbf', gamma=0.1, probability=True,
                       decision_function_shape='ovr', random_state=seed,
                       class_weight='balanced')
        pipe_svm0 = Pipeline([('scaler', Scaler()), ('clf', clf_svm0)])
        clf_bg = BaggingClassifier(base_estimator=pipe_svm0, n_estimators=10,
                                   max_samples=1.0, max_features=1.0,
                                   random_state=seed)
        start = time.time()
        clf_bg = clf_bg.fit(X_train, Y_train)
        print('Total running time is {}s'.format(time.time() - start))
        judge = cross_val_score(clf_bg,
                                X,
                                Y,
                                groups=None,
                                scoring=Evaluate(score_func),
                                cv=5)
        #print('Cross-validation score is {}'.format(judge))
        print('Mean cross-validation score is {}'.format(judge.mean()))

    # Boosting method: GradientBoosting
    if 1:
        print(' '.join(
            ['*' * 25, 'GradientBoostingClassifier', '*' * 25, '\n']))
        from sklearn.ensemble import GradientBoostingClassifier