Example #1
# imports required by this snippet
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.ensemble import BaggingClassifier


def decision_tree_bagging(Xtrain, Xtest, ytrain, ytest, ensemble_size=60):
    # bagging
    accuracies = []
    ensemble_sizes = []

    for i in range(1, ensemble_size):
        bagging = BaggingClassifier(
            base_estimator=tree.DecisionTreeClassifier(),
            n_estimators=i,
            bootstrap=True,
            max_samples=1.0,
            max_features=1.0)

        bagging.fit(Xtrain, ytrain)

        ypred = bagging.predict(Xtest)
        accuracy = np.mean(ypred == ytest)

        ensemble_sizes.append(i)
        accuracies.append(accuracy)

    plt.plot(ensemble_sizes, accuracies)
    plt.xlabel('number of estimators')
    plt.ylabel('accuracy')
    plt.grid(True)
    plt.title('Decision tree (bagging)')
    plt.show()

    print('Highest accuracy of bagging = %f' % np.max(accuracies))
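A minimal way to exercise this helper (not part of the original snippet, and assuming the imports added above): load scikit-learn's built-in breast-cancer data, split it, and call the function. Note that the base_estimator argument used above assumes scikit-learn < 1.2; later releases renamed it to estimator.

# Hedged usage sketch -- the dataset choice and variable names are illustrative only.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
Xtrain, Xtest, ytrain, ytest = train_test_split(
    data.data, data.target, test_size=0.3, random_state=0)
decision_tree_bagging(Xtrain, Xtest, ytrain, ytest, ensemble_size=30)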
Example #2
 def __init__(self,
              base_estimator=None,
              n_estimators=10,
              max_samples=1.0,
              max_features=1.0,
              bootstrap=True,
              bootstrap_features=False,
              oob_score=False,
              warm_start=False,
              n_jobs=None,
              random_state=None,
              verbose=0):
     self._hyperparams = {
         'base_estimator': make_sklearn_compat(base_estimator),
         'n_estimators': n_estimators,
         'max_samples': max_samples,
         'max_features': max_features,
         'bootstrap': bootstrap,
         'bootstrap_features': bootstrap_features,
         'oob_score': oob_score,
         'warm_start': warm_start,
         'n_jobs': n_jobs,
         'random_state': random_state,
         'verbose': verbose
     }
     self._wrapped_model = SKLModel(**self._hyperparams)
Example #3
def bagging_cv(X_train, y_train, seed, verbose=3):

    # Results:
    #                       DEFAULT      Z-SCORE      OUTLIERS
    # n_estimators          250          150          150
    # warm_start            True         True         True
    # max_samples           0.6          0.6          0.6
    # --------------------------------------------------------
    # f1-micro              0.9220       0.9268       0.9403

    clf = BaggingClassifier(n_estimators=140, random_state=seed)

    params = {
        'n_estimators': list(range(100, 1500, 50)),
        'warm_start': [True, False],
        'max_samples': [0.6, 0.8, 1.0]
    }

    gCV = GridSearchCV(estimator=clf,
                       param_grid=params,
                       scoring='f1_micro',
                       n_jobs=-1,
                       refit=True,
                       cv=3,
                       verbose=verbose,
                       return_train_score='warn')

    return gCV.fit(X_train.values, y_train)
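A hedged usage sketch (not from the original source): X_train is assumed to be a pandas DataFrame and y_train an aligned label array, since the function calls X_train.values before fitting.

# Hypothetical call -- the data-loading step is assumed, not shown in the original.
gcv = bagging_cv(X_train, y_train, seed=42, verbose=1)
print(gcv.best_params_)   # best n_estimators / warm_start / max_samples found
print(gcv.best_score_)    # mean cross-validated f1_micro for that setting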
Example #4
class BaggingClassifierImpl():
    def __init__(self, base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):
        self._hyperparams = {
            'base_estimator': make_sklearn_compat(base_estimator),
            'n_estimators': n_estimators,
            'max_samples': max_samples,
            'max_features': max_features,
            'bootstrap': bootstrap,
            'bootstrap_features': bootstrap_features,
            'oob_score': oob_score,
            'warm_start': warm_start,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
Example #5
    def fit(self, x, y):
        '''
        Method to train the two-level architecture
        :x: training data
        :y: data labels
        :dsel_x: patterns of the validation window
        :dsel_y: labels of the validation window
        '''

        # saving the training data
        self.x_train = x
        self.y_train = y

        # saving the difficulty of each instance
        self.H = self.kDN(x, y)

        # training level 1 ###########################################
        self.levelone = KNeighborsClassifier(self.n_vizinhos)
        self.levelone.fit(x, y)

        # predicting on the training set
        y_pred = self.levelone.predict(x)

        # saving the indices of the misclassified instances
        indices = [i for i in range(len(y)) if y_pred[i] != y[i]]

        # obtaining the difficulty threshold of the problem
        self.limiar = self.defineThreshold(indices)
        ###############################################################

        # training level 2 ###########################################
        # obtaining the hard instances
        x_dificeis, y_dificeis = self.hardInstances(x, y, self.limiar)

        # creating the ensemble
        self.ensemble = BaggingClassifier(base_estimator=Perceptron(),
                                          max_samples=0.9,
                                          max_features=1.0,
                                          n_estimators=100)
        self.ensemble.fit(x_dificeis, y_dificeis)

        # training model 2
        self.leveltwo = KNORAU(self.ensemble.estimators_, self.n_vizinhos)
        self.leveltwo.fit(x_dificeis, y_dificeis)
Example #6
def ensemble():
    pipeline = Pipeline([
        ('count_vectorizer',
         CountVectorizer(binary=True,
                         ngram_range=(1, 2),
                         max_features=15000,
                         stop_words=stopwords)),
        (
            'clf',
            VotingClassifier(
                estimators=[
                    ('nb', BaggingClassifier(MultinomialNB(alpha=0.2))),
                    ('lr',
                     BaggingClassifier(
                         LogisticRegression(class_weight='balanced',
                                            C=10,
                                            n_jobs=2))),
                    # ('rf', RandomForestClassifier(n_estimators=200, max_features='log2', class_weight='balanced', n_jobs=2))
                ],
                n_jobs=2,
                voting='soft',
                weights=[1, 1]))
    ])
    train_report(pipeline)
Example #7
def performClassification(dataset, split, symbol, output_dir, forecast_out):
    """
        Performing Classification on 
        Various algorithms
    """

    predicted_values = []

    features = dataset.columns[:-1]

    index = int(np.floor(dataset.shape[0] * split))
    train, test, test_forecast = dataset[:index], dataset[
        index:-forecast_out], dataset[-forecast_out:]
    #dataset_all, test_forecast = dataset[:-forecast_out], dataset[-forecast_out:]
    #test = dataset_all.sample(frac=0.025)
    #train = dataset_all.loc[~dataset_all.index.isin(test.index)]

    log.info('-' * 80)
    log.info('%s train set: %s, test set: %s', symbol, train.shape, test.shape)
    predicted_values.append(str(symbol))
    predicted_values.append(str(train.shape))
    predicted_values.append(str(test.shape))

    #train, test = getFeatures(train[features], \
    #    train[output], test[features], 16)

    out_params = (symbol, output_dir)

    output = dataset.columns[-1]

    classifiers = [
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        SVC(degree=100, C=10000),
        BaggingClassifier(),
        AdaBoostClassifier(),
        neighbors.KNeighborsClassifier(),
        GradientBoostingClassifier(n_estimators=100),
        #QDA(),
    ]

    for classifier in classifiers:
        model_name, forecast_set, accuracy = benchmark_classifier(classifier, \
            train, test, test_forecast, features, symbol, output, out_params)
        log.info('%s, %s, %s, %s', symbol, model_name, forecast_set, accuracy)
        predicted_values.append(str(round(forecast_set.ravel()[0], 3)))
        predicted_values.append(str(round(accuracy, 3)))

    return predicted_values
Example #8
def defaultModels(df_xmat, df_ymat_cat):

    #### representative common classifiers in sklearn ####
    classifiers = [
        GaussianNB(),
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(kernel='rbf'),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
    ]

    cv = StratifiedKFold(n_splits=10)

    res = []

    for clf in classifiers:

        print('processing...' + str(clf)[:10])

        metrics_cv = []

        for train_index, test_index in cv.split(df_xmat.values, df_ymat_cat):

            X_train = df_xmat.iloc[train_index, :].values
            X_test = df_xmat.iloc[test_index, :].values
            y_train = [df_ymat_cat[i] for i in train_index]
            y_test = [df_ymat_cat[i] for i in test_index]

            clf.fit(X_train, y_train)

            metrics_cv.append(clf.score(X_test, y_test))

        res.append([
            str(clf)[:10],
            np.array(metrics_cv).mean(axis=0),
            np.array(metrics_cv).std(axis=0)
        ])

    return res
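A minimal sketch of how defaultModels might be called (illustrative only, not from the original): the iris data as a DataFrame plus a plain list of labels, with the returned rows tabulated.

# Hedged usage sketch -- the dataset and column names here are assumptions for illustration.
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
df_xmat = pd.DataFrame(iris.data, columns=iris.feature_names)
df_ymat_cat = list(iris.target)
summary = pd.DataFrame(defaultModels(df_xmat, df_ymat_cat),
                       columns=['classifier', 'mean_accuracy', 'std_accuracy'])
print(summary)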
Example #9
    def __init__(self):
        self.random_rate=33
        clf1=SVC(C=1.0,random_state=33)
        clf2=XGBClassifier(n_estimators=220,learning_rate=0.2,min_child_weight=2.3)
        clf3=RandomForestClassifier(n_estimators=80,random_state=330,n_jobs=-1)
        clf4=BaggingClassifier(n_estimators=40,random_state=101)
        clf5=AdaBoostClassifier(n_estimators=70,learning_rate=1.5,random_state=33)
        clf6=GradientBoostingClassifier(n_estimators=250,learning_rate=0.23,random_state=33)

        clf7=XGBClassifier(n_estimators=100,learning_rate=0.12,min_child_weight=1)


        base_model=[
            ['svc',clf1],
            ['xgbc',clf2],
            ['rfc',clf3],
            ['bgc',clf4],
            ['adbc',clf5],
            ['gdbc',clf6]
        ]

        self.base_models=base_model
        self.XGB=clf7
Example #10
    tuned_parameters = [{'n_neighbors':[3, 5, 7],
                         'weights':['uniform', 'distance'],
                         'algorithm':['ball_tree', 'kd_tree', 'brute'],
                         'p':[1, 2, 3]
                     }]
    algo = KNeighborsClassifier()
    
elif choice=='g' or choice=='G':
    print("\n**********************************\n")
    print("  \t Bagging")
    tuned_parameters = [{'n_estimators':[5, 10, 100, 200],
                         'max_features':[1, 3, 9],
                         'max_samples':[1, 5, 9, 21],
                         'random_state':[1, 2, 3, 5]
                     }]
    algo = BaggingClassifier()
    
elif choice=='h' or choice=='H':
    print("\n**********************************\n")
    print("  \t Random Forest")
    tuned_parameters = [{'n_estimators':[5, 10, 100, 200],
                         'criterion':['gini', 'entropy'],
                         'max_features':['log2', 'sqrt'],
                         'max_depth':[10, 100]
                     }]
    algo = RandomForestClassifier()

elif choice=='i' or choice=='I':
    print("\n**********************************\n")
    print("  \t AdaBoost Classifier")
    tuned_parameters = [{'n_estimators':[5, 10, 50, 100, 200],
Example #11
def all_classifier_models():
    models = []
    metrix = []
    c_report = []
    train_accuracy = []
    test_accuracy = []
    
    models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma = 'auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),random_state=0)))
    models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier()))
    estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(random_state=42)))]
    models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())))
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    
    test_accuracy= []
    names = []
    for name, model in models:
        try:
            m = model
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)
            train_acc = round(m.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test,y_pred) *100
            c_report.append(classification_report(y_test, y_pred))
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception as e:
            print("Exception occurred for", name, ":", e)
    return metrix,test_accuracy,names
Example #12
     elif(validacao[k] == validacao[1]):
         x_val, y_val = validacaoInstanciasFaceis(x_train, y_train, n_vizinhos)
     elif(validacao[k] == validacao[2]):
         x_val, y_val = validacaoInstanciasDificeis(x_train, y_train, n_vizinhos)
 
     # 3.3. End ################################################################################################
 
     # 3.4. Instantiating the classifiers ##########################################################
         
     ########## instantiating the Bagging+REP model #########################################
     # defining the model's number in the table
     num_model = 0
     
     # instantiating the classifier
     ensemble = BaggingClassifier(base_estimator=Perceptron(), 
                                 max_samples=qtd_amostras, 
                                 max_features=1.0, 
                                 n_estimators = qtd_modelos)
         
     # training the model
     ensemble.fit(x_train, y_train)
         
     # performing the pruning
     ensemble = REP(x_val, y_val, ensemble)
                 
     # computing the prediction
     pred = ensemble.predict(x_test)
                 
     # computing the ensemble diversity
     q_statistic = MedidasDiversidade('q', x_val, y_val, ensemble)
     double_fault = MedidasDiversidade('disagreement', x_val, y_val, ensemble)
         
Example #13
# Normalization (L1 & L2):
# NOTE:  Change 'normtype' value to 'l1' / 'l2' to change normalization type:
normtype = 'l2'#'l1'


# model_selection is used for manually enabling the individual models.
# NOTE:  The boolean value enables/disables the model.
model_selection = {
    'ExtraTrees': ( True, ExtraTreesClassifier(n_estimators='warn', criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None) ),
    'RandomForest': ( True, RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1) ),
    'AdaBoost': ( True, AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None) ),
    'DecisionTree': ( True, DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False) ),
    'GradientBoosting': (True, GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1, n_iter_no_change=None, tol=0.0001) ),
    'BernoulliNB': (True, BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None) ),
    'BaggingClassifier': (True, BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0) ),
    'NearestNeighbors': (True, KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None) ), # (n_neighbors=4) ),
    'LogisticRegressionCV': (True, LogisticRegressionCV(Cs=10, fit_intercept=True, cv='warn', dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='warn', random_state=None, l1_ratios=None) ),
    'LDA': (True, LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001) ),
    'LogisticRegression': (True, LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None) ),
    'CalibratedClassifierCV': (True, CalibratedClassifierCV(base_estimator=None, method='sigmoid', cv='warn') ),
    'LinearSVC': (True, LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000) ),
    'LinearSVM': ( True, SVC(kernel='linear', C=0.025) ),  # (C=0.01, penalty='l1', dual=False) ),
    'RBF_SVM': (True, SVC(gamma='auto') ),#gamma=2, C=1) ), #
    'Nu_SVM': (True, NuSVC(gamma='auto') ),
    'GaussianProcess': (False, GaussianProcessClassifier() ), #(1.0 * RBF(1.0)) ),
    'NeuralNet': (True, MLPClassifier(alpha=1, max_iter=1000) ),
    'QDA': (True, QuadraticDiscriminantAnalysis() ),
    'NaiveBayes': (True,  GaussianNB() ),
    'RadiusNeighborsClassifier': (True, RadiusNeighborsClassifier() ),
    'SGDClassifier': (True, SGDClassifier() ),
Example #14
    classifier.fit(audit_X, audit_y)
    store_pkl(classifier, name + ".pkl")
    adjusted = DataFrame(classifier.predict(audit_X), columns=["Adjusted"])
    if (with_proba == True):
        adjusted_proba = DataFrame(classifier.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")


build_audit(DecisionTreeClassifier(random_state=13, min_samples_leaf=5),
            "DecisionTreeAudit")
build_audit(
    BaggingClassifier(DecisionTreeClassifier(random_state=13,
                                             min_samples_leaf=5),
                      random_state=13,
                      n_estimators=3,
                      max_features=0.5), "DecisionTreeEnsembleAudit")
build_audit(ExtraTreesClassifier(random_state=13, min_samples_leaf=5),
            "ExtraTreesAudit")
build_audit(
    GradientBoostingClassifier(random_state=13, loss="exponential", init=None),
    "GradientBoostingAudit")
build_audit(LinearDiscriminantAnalysis(solver="lsqr"),
            "LinearDiscriminantAnalysisAudit")
build_audit(LogisticRegressionCV(), "LogisticRegressionAudit")
build_audit(
    BaggingClassifier(LogisticRegression(),
                      random_state=13,
                      n_estimators=3,
                      max_features=0.5), "LogisticRegressionEnsembleAudit")
Example #15
from sklearn.manifold.t_sne import TSNE
from sklearn.linear_model.theil_sen import TheilSenRegressor
from sklearn.mixture.dpgmm import VBGMM
from sklearn.feature_selection.variance_threshold import VarianceThreshold

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


clf_dict = {'ARDRegression':ARDRegression(),
			'AdaBoostClassifier':AdaBoostClassifier(),
			'AdaBoostRegressor':AdaBoostRegressor(),
			'AdditiveChi2Sampler':AdditiveChi2Sampler(),
			'AffinityPropagation':AffinityPropagation(),
			'AgglomerativeClustering':AgglomerativeClustering(),
			'BaggingClassifier':BaggingClassifier(),
			'BaggingRegressor':BaggingRegressor(),
			'BayesianGaussianMixture':BayesianGaussianMixture(),
			'BayesianRidge':BayesianRidge(),
			'BernoulliNB':BernoulliNB(),
			'BernoulliRBM':BernoulliRBM(),
			'Binarizer':Binarizer(),
			'Birch':Birch(),
			'CCA':CCA(),
			'CalibratedClassifierCV':CalibratedClassifierCV(),
			'DBSCAN':DBSCAN(),
			'DPGMM':DPGMM(),
			'DecisionTreeClassifier':DecisionTreeClassifier(),
			'DecisionTreeRegressor':DecisionTreeRegressor(),
			'DictionaryLearning':DictionaryLearning(),
			'ElasticNet':ElasticNet(),
Example #16
                       verbose=0,
                       warm_start=False),
    'DecisionTreeClassifier':
    DecisionTreeClassifier(max_depth=9,
                           random_state=123,
                           splitter="best",
                           criterion="gini"),
    'KNeighborsClassifier':
    KNeighborsClassifier(algorithm='auto',
                         leaf_size=30,
                         metric='minkowski',
                         metric_params=None,
                         n_jobs=1,
                         n_neighbors=5,
                         p=2,
                         weights='uniform'),
    'RandomForestClassifier':
    RandomForestClassifier(n_estimators=100,
                           random_state=123,
                           max_depth=9,
                           criterion="gini"),
    'GaussianNB':
    GaussianNB(priors=None),
    'SVC':
    SVC(C=1.0, kernel='linear', probability=True, random_state=124),
    'MLPClassifier':
    MLPClassifier(alpha=1, max_iter=1000, random_state=124),
    'BaggingClassifier':
    BaggingClassifier(random_state=124)
}
Example #17
    PowerTransformer(method='yeo-johnson'),
    # PowerTransformer(method='box-cox'),
    QuantileTransformer(output_distribution='normal'),
    QuantileTransformer(output_distribution='uniform'),
    Normalizer()
]

#=================Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
    GradientBoostingClassifier(random_state=0),
    BaggingClassifier(base_estimator=SVC(), n_estimators=10,
                      random_state=0).fit(features, target),
    ExtraTreesClassifier(n_estimators=100, random_state=0),
    HistGradientBoostingClassifier(),
    MLPClassifier(random_state=1, max_iter=300),
    OneVsOneClassifier(LinearSVC(random_state=0)),
    OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),
                         random_state=0)
]
print('Importacao OK')

# %%
# =================Looping here

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
Example #18
def main():
    
    # 1. Defining the variables for the experiment ######################################################################
    qtd_modelos = 100
    qtd_execucoes = 30
    qtd_amostras = 0.9
    qtd_folds = 10
    n_vizinhos = 7
    nome_datasets = ['kc1', 'kc2']
    # 1. End ############################################################################################################

    # loop over the datasets
    for h in range(len(nome_datasets)):
    
        # 2. Reading the datasets  ########################################################################################
        # reading the dataset
        data = pd.read_csv('dataset/'+nome_datasets[h]+'.csv')
        
        # obtaining the patterns and their respective labels
        df_x = np.asarray(data.iloc[:,0:-1])
        df_y = np.asarray(data.iloc[:,-1])
        
        
        # 2.1. Creating the table to save the data  ###################################################
        # creating the table that will hold the model's results
        tabela = Tabela_excel()
        tabela.Criar_tabela(nome_tabela='arquivos_lista03/'+nome_datasets[h], 
                            folhas=['OLA', 'LCA', 'KNORA-E', 'KNORA-U', 'Arquitetura'], 
                            cabecalho=['acuracy', 'auc', 'fmeasure', 'gmean'], 
                            largura_col=5000)
        # 2.1. End #####################################################################################
        # 2. End ############################################################################################################
        
        # running the algorithms x times
        for j in range(qtd_execucoes):
            
            # 3. Splitting the data into training and test sets #############################################################
            # splitting the dataset without overlap into 90% for training and 10% for testing
            skf = StratifiedKFold(df_y, n_folds=qtd_folds)
                
            # taking the training and test indices
            train_index, test_index = next(iter(skf))
                        
            # obtaining the training and test sets
            x_train = df_x[train_index]
            y_train = df_y[train_index]
            x_test = df_x[test_index]
            y_test = df_y[test_index]
            # 3. End #########################################################################################################
            
            
            # 4. Generating the pool of classifiers  ########################################################################
            # instantiating the classifier
            ensemble = BaggingClassifier(base_estimator=Perceptron(), 
                                            max_samples=qtd_amostras, 
                                            max_features=1.0, 
                                            n_estimators = qtd_modelos)
                    
            # training the model
            ensemble.fit(x_train, y_train)
            # 4. End  ########################################################################################################
            
            # 5. Instantiating the classifiers ############################################################
            
            ################################### OLA ########################################################
            executar_modelo('OLA', x_train, y_train, x_test, y_test, ensemble.estimators_, n_vizinhos, nome_datasets, h, j, tabela)
            ################################################################################################
            
            ################################### LCA ########################################################
            executar_modelo('LCA', x_train, y_train, x_test, y_test, ensemble.estimators_, n_vizinhos, nome_datasets, h, j, tabela)
            ################################################################################################
            
            ################################### KNORAE #####################################################
            executar_modelo('KNORAE', x_train, y_train, x_test, y_test, ensemble.estimators_, n_vizinhos, nome_datasets, h, j, tabela)
            ################################################################################################
            
            ################################### KNORAU #####################################################
            executar_modelo('KNORAU', x_train, y_train, x_test, y_test, ensemble.estimators_, n_vizinhos, nome_datasets, h, j, tabela)
            ################################################################################################
            
            ################################### Arquitetura ################################################
            # instantiating the method
            arq = Arquitetura(n_vizinhos)
            # training the method
            arq.fit(x_train, y_train)
            # making the prediction
            pred = arq.predict(x_test)
            # printing the results
            nome = 'Arquitetura'
            acuracia, auc, f1measure, gmean = printar_resultados(y_test, pred, nome_datasets[h]+'-'+nome+'-['+str(j)+']')
            # writing the results obtained
            tabela.Adicionar_Sheet_Linha(4, j, [acuracia, auc, f1measure, gmean])
Example #19
# imports required by this snippet
import pandas as pd
from sklearn import tree
from sklearn.ensemble.bagging import BaggingClassifier

train = pd.read_csv("train.csv")

train.drop(['Cabin'], 1, inplace=True)

train = train.dropna()
y = train['Survived']
train.drop(['Survived', 'PassengerId', 'Name', 'Ticket'], 1, inplace=True)
train = train.fillna({'Age': 30})
X = pd.get_dummies(train)

bag_clf = BaggingClassifier(
    tree.DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=200,
    bootstrap=True,  # True => bagging, False => pasting
    n_jobs=-1  # use all cores
)

bag_clf.fit(X, y)

test = pd.read_csv('test.csv')
ids = test[['PassengerId']]
test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], 1, inplace=True)
test.fillna(2, inplace=True)
test = pd.get_dummies(test)
predictions = bag_clf.predict(test)
results = ids.assign(Survived=predictions)
results.to_csv('titanic_result_bagging.csv', index=False)
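One caveat worth noting (an editorial addition, not in the original): pd.get_dummies is applied to the train and test frames independently, so the two can end up with different column sets. A reindex against the training columns keeps them aligned, as in this hedged sketch.

# Hedged sketch -- align the test dummies with the columns seen during training
# before calling predict.
test = test.reindex(columns=X.columns, fill_value=0)
predictions = bag_clf.predict(test)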
Example #20
    (SGDRegressor(), ['predict'], create_regression_problem_1()),
    (Lasso(), ['predict'], create_regression_problem_1()),
    (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]),
     ['predict', 'predict_proba'], create_weird_classification_problem_1()),
    (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))],
                  transformer_weights={
                      'earth': 1,
                      'earth2': 2
                  }), ['transform'], create_weird_classification_problem_1()),
    (RandomForestRegressor(), ['predict'], create_regression_problem_1()),
    (CalibratedClassifierCV(LogisticRegression(),
                            'isotonic'), ['predict_proba'],
     create_weird_classification_problem_1()),
    (AdaBoostRegressor(), ['predict'], create_regression_problem_1()),
    (BaggingRegressor(), ['predict'], create_regression_problem_1()),
    (BaggingClassifier(), ['predict_proba'],
     create_weird_classification_problem_1()),
    (GradientBoostingRegressor(verbose=True), ['predict'],
     create_regression_problem_1(m=100000, n=200)),
    (XGBRegressor(), ['predict'], create_regression_problem_for_xgb_1())
]


# Create tests for numpy_flat language
def create_case_numpy_flat(estimator, methods, fit_data, predict_data,
                           export_predict_data):
    def test_case(self):
        model = clone(estimator)
        model.fit(**fit_data)
        for method in methods:
            pred = getattr(model, method)(**predict_data)
Example #21
    QuantileTransformer(output_distribution='normal'),
    QuantileTransformer(output_distribution='uniform'),
    Normalizer()
]

# %%

#=================Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
    GradientBoostingClassifier(random_state=0),
    BaggingClassifier(base_estimator=SVC(), n_estimators=10,
                      random_state=0).fit(X, y),
    ExtraTreesClassifier(n_estimators=100, random_state=0),
    HistGradientBoostingClassifier(),
    MLPClassifier(random_state=1, max_iter=300),
    OneVsOneClassifier(LinearSVC(random_state=0)),
    OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),
                         random_state=0)
]
print('Importacao OK')
#%%
count = 0
dict_test = {}
dict_all = {}
for i in range(len(scaler)):
    scaler_i = scaler[i]
    for j in range(len(classifier_test)):
Example #22
pred_nb = gc_clf_nb.predict(X_test)
accuracy_nb = accuracy_score(y_test, pred_nb)
precision_nb = precision_score(y_test, pred_nb, average='weighted')
f1_score_nb = f1_score(y_test, pred_nb, average='weighted')
recall_score_nb = recall_score(y_test, pred_nb, average='weighted')
print("####FOR NB######")
print("Accuracy: ", accuracy_nb)
print("Precision:", precision_nb)
print("F1 Score:", f1_score_nb)
print("Recall Score:", recall_score_nb)

# In[14]:

pipe_bag = Pipeline([
    ('vect', CountVectorizer()), ('tfdf', TfidfTransformer()),
    ('boost', BaggingClassifier(base_estimator=naive_bayes.MultinomialNB()))
])

parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfdf__use_idf': (True, False),
}
gc_clf_bc = GridSearchCV(pipe_bag, parameters, n_jobs=1)
gc_clf_bc = gc_clf_bc.fit(X_train, y_train)
print(gc_clf_bc.best_score_)
print(gc_clf_bc.best_params_)

# In[15]:

pred_bc = gc_clf_bc.predict(X_test)
accuracy_bc = accuracy_score(y_test, pred_bc)
Example #23
                                          n_estimators=10,
                                          max_features=1)))
if ",MLPC," in Functions:
    models.append(('MLPC', MLPClassifier(alpha=0.1)))
if ",ABC," in Functions:
    models.append(('ABC', AdaBoostClassifier()))
if ",GNB," in Functions:
    models.append(('GNB', GaussianNB()))
if ",QDA," in Functions:
    models.append(('QDA', QuadraticDiscriminantAnalysis()))
if ",GBC," in Functions:
    models.append(('GBC', GradientBoostingClassifier()))
if ",ETC," in Functions:
    models.append(('ETC', ExtraTreeClassifier()))
if ",BC," in Functions:
    models.append(('BC', BaggingClassifier()))
if ",SGDC," in Functions:
    models.append(('SGDC', SGDClassifier()))
if ",RC," in Functions:
    models.append(('RC', RidgeClassifier()))
if ",PAC," in Functions:
    models.append(('PAC', PassiveAggressiveClassifier()))
if ",ETSC," in Functions:
    models.append(('ETSC', ExtraTreesClassifier()))
if ",BNB," in Functions:
    models.append(('BNB', BernoulliNB()))
if ",GM," in Functions:
    models.append(('GM', GaussianMixture()))

from sklearn.model_selection import KFold
from collections import Counter
Example #24
class Arquitetura:
    def __init__(self, n_vizinhos):
        '''
        :n_vizinhos: number of nearest neighbors used to form the region of competence
        '''

        self.n_vizinhos = n_vizinhos

    def kDN(self, x, y):
        '''
        Method to compute the degree of difficulty (kDN) of each observation in a dataset
        :param: x: data patterns
        :param: y: respective labels
        :return: dificuldades: vector with the difficulty score of each instance
        '''

        # instantiating the nearest-neighbors search
        nbrs = NearestNeighbors(n_neighbors=self.n_vizinhos + 1,
                                algorithm='ball_tree').fit(x)

        # variable to store the difficulty values
        dificuldades = []

        # loop over each instance of the dataset
        for i in range(len(x)):

            # computing the nearest neighbors of this instance
            _, indices = nbrs.kneighbors([x[i]])

            # checking the neighbors' labels
            cont = 0
            for j in indices[0]:
                if (j != i and y[j] != y[i]):
                    cont += 1

            # computing the percentage
            dificuldades.append(cont / (self.n_vizinhos + 1))

        return dificuldades

    def neighbors(self, dsel, x_query):
        '''
        method to return only the indices of the neighbors
        '''

        # instantiating the nearest-neighbors search
        nbrs = NearestNeighbors(n_neighbors=self.n_vizinhos + 1,
                                algorithm='ball_tree').fit(dsel)

        # computing the nearest neighbors of the query instance
        _, indices = nbrs.kneighbors([x_query])

        return indices

    def hardInstances(self, x, y, limiar):
        '''
        Method to return a subset containing only the hard instances (difficulty above the threshold)
        :param: x: data patterns
        :param: y: respective labels
        :return: x_new, y_new:
        '''

        # computing the difficulty of each instance
        dificuldades = self.kDN(x, y)

        # variables to store the new instances
        x_new = []
        y_new = []

        # keeping only the hard instances
        for i in range(len(dificuldades)):
            if (dificuldades[i] > limiar):
                x_new.append(x[i])
                y_new.append(y[i])

        return np.asarray(x_new), np.asarray(y_new)

    def neighborhoodDifficulty(self, dsel, x_query, H):
        '''
        method to compute the degree of difficulty of the neighborhood
        :dsel: dataset in which to search for the neighbors
        :x_query: instance to be queried
        :H: difficulty of the dsel dataset
        '''

        # obtaining the neighborhood of the example
        indices = self.neighbors(dsel, x_query)[0]

        # difficulty of the region
        dificuldades = [H[i] for i in indices]

        # minimum difficulty over the region
        return np.min(dificuldades)

    def defineThreshold(self, indices):
        '''
        Method to define the threshold
        :indices: indices of the instances that were classified incorrectly
        '''

        # obtaining the neighborhood difficulty of each example
        lista = []
        for i in indices:
            lista.append(
                self.neighborhoodDifficulty(self.x_train, self.x_train[i],
                                            self.H))

        return np.mean(lista)

    def fit(self, x, y):
        '''
        Method to train the two-level architecture
        :x: training data
        :y: data labels
        :dsel_x: patterns of the validation window
        :dsel_y: labels of the validation window
        '''

        # saving the training data
        self.x_train = x
        self.y_train = y

        # saving the difficulty of each instance
        self.H = self.kDN(x, y)

        # training level 1 ###########################################
        self.levelone = KNeighborsClassifier(self.n_vizinhos)
        self.levelone.fit(x, y)

        # predicting on the training set
        y_pred = self.levelone.predict(x)

        # saving the indices of the misclassified instances
        indices = [i for i in range(len(y)) if y_pred[i] != y[i]]

        # obtaining the difficulty threshold of the problem
        self.limiar = self.defineThreshold(indices)
        ###############################################################

        # training level 2 ###########################################
        # obtaining the hard instances
        x_dificeis, y_dificeis = self.hardInstances(x, y, self.limiar)

        # creating the ensemble
        self.ensemble = BaggingClassifier(base_estimator=Perceptron(),
                                          max_samples=0.9,
                                          max_features=1.0,
                                          n_estimators=100)
        self.ensemble.fit(x_dificeis, y_dificeis)

        # training model 2
        self.leveltwo = KNORAU(self.ensemble.estimators_, self.n_vizinhos)
        self.leveltwo.fit(x_dificeis, y_dificeis)
        # checking whether OLA gets right the examples the svm got wrong
        ###############################################################

    def predict_svm(self, x):
        # to predict multiple examples
        if (len(x.shape) > 1):
            # returning all labels
            return [
                self.levelone.predict(np.array([pattern]))[0] for pattern in x
            ]

        # to predict only one example
        else:
            return self.levelone.predict(np.array([x]))[0]

    def predict_ola(self, x):
        # to predict multiple examples
        if (len(x.shape) > 1):
            # returning all labels
            return [
                self.leveltwo.predict(np.array([pattern]))[0] for pattern in x
            ]

        # to predict only one example
        else:
            return self.leveltwo.predict(np.array([x]))[0]

    def predict_one(self, x):
        '''
        method to compute the prediction for one example
        :x: pattern to be predicted
        '''

        # difficulty of the region (minimum over the neighborhood)
        media = self.neighborhoodDifficulty(self.x_train, x, self.H)

        # checking the difficulty of the instance
        if (media >= self.limiar):
            return self.leveltwo.predict(np.array([x]))[0]
        else:
            return self.levelone.predict(np.array([x]))[0]

    def predict(self, x):
        '''
        method to compute the prediction for one or more examples
        :x: pattern(s) to be predicted
        '''

        # to predict multiple examples
        if (len(x.shape) > 1):
            # returning all labels
            return [self.predict_one(pattern) for pattern in x]

        # to predict only one example
        else:
            return self.predict_one(x)
Example #25
kernel_pca = KernelPCA(n_components=150)  # Costs huge amounts of ram
randomized_pca = RandomizedPCA(n_components=500)

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()

# CLASSIFIERS
support_vector_classifier = svm.SVC(probability=True, verbose=True)
linear_support_vector_classifier = svm.LinearSVC(dual=False)
nearest_neighbor_classifier = KNeighborsClassifier()
extra_trees_classifier = ExtraTreesClassifier(n_estimators=256)
bagging_classifier = BaggingClassifier(
    base_estimator=GradientBoostingClassifier(n_estimators=200,
                                              max_features=4),
    max_features=0.5,
    n_jobs=2,
    verbose=1)
gradient_boosting_classifier = GradientBoostingClassifier(n_estimators=200,
                                                          max_features=4,
                                                          learning_rate=0.3,
                                                          verbose=0)
random_forest_classifier = RandomForestClassifier(n_estimators=2)
logistic_regression = LogisticRegression(C=0.5)
ridge_classifier = RidgeClassifier(alpha=0.1, solver='svd')
bayes = MultinomialNB()
sgd = SGDClassifier()
boundary_forest = BoundaryForestClassifier(num_trees=4)

# FEATURE UNION
feature_union = FeatureUnion(transformer_list=[('PCA', pca)])
Example #26
x = dfbalanceado.iloc[:, 1:6]
y = dfbalanceado.iloc[:, 6:7]
#Data split with 80% dedicated to training and 20% to test.
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)
#Load a new DataFrame with the data we need to predict
datosFinal = pd.read_csv('data/nuevos_individuos_credito.csv',
                         delimiter=',',
                         decimal='.')
#Select the feature columns of the new data as X_test
X_test = datosFinal.iloc[:, 1:6]
#Configure the different level-1 classifiers for the stacking methodology.
models = [
    BaggingClassifier(),
    SVC(),
    ExtraTreeClassifier(),
    KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=100),
    XGBClassifier(random_state=0,
                  n_jobs=-1,
                  learning_rate=0.1,
                  n_estimators=100,
                  max_depth=3)
]
S_train, S_test = stacking(models,
                           X_train,
                           y_train.values.ravel(),
                           X_test,
                           regression=False,
Example #27
# from sklearn.ensemble import AdaBoostClassifier as Boost
from sklearn.ensemble.bagging import BaggingClassifier as Boost
from sklearn.naive_bayes import GaussianNB

from csxdata import CData

from SciProjects.grapes import path, indepsn

if __name__ == '__main__':

    data = CData(path,
                 indepsn,
                 feature="evjarat",
                 headers=1,
                 cross_val=0.2,
                 lower=True)
    data.transformation = "std"
    model = Boost(GaussianNB(), n_estimators=100)

    model.fit(data.learning, data.lindeps)
    preds = model.predict(data.testing)
    eq = [left == right for left, right in zip(preds, data.tindeps)]
    print("Acc:", sum(eq) / len(eq))
Example #28
            y_train = df_y[train_index]
            x_test = df_x[test_index]
            y_test = df_y[test_index]

            # 3.1. End ###################################################################################################

            # 3.2. Instantiating the classifiers  #########################################################################

            # 3.2.1. Bagging with DecisionTree ############################################################

            # model number in the table
            num_model = 0

            # model
            bg = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                   max_samples=pct_trainamento[i],
                                   max_features=1.0,
                                   n_estimators=qtd_modelos)
            # training the model
            bg.fit(x_train, y_train)

            # computing the prediction
            pred = bg.predict(x_test)

            # printing the results
            acuracia, auc, f1measure, gmean = printar_resultados(
                y_test, pred,
                nome_datasets[h] + '-pct-' + str(pct_trainamento[i]) +
                '- Bagging com DecisionTree [' + str(j) + ']')

            # writing the results obtained
            tabela.Adicionar_Sheet_Linha(num_model, j,
Example #29
                plt.figure()
                plt.barh(pos, feature_importance[sorted_idx], align='center')
                plt.yticks(pos, BigFeaturenames[sorted_idx])
                plt.xlabel('Relative Importance')
                plt.title('Variable Importance based on bagging method')
                plt.show()
            except:
                print('Feature importance not displayed')

    if 0:
        print(' '.join(['*' * 25, 'RandomForestClassifier', '*' * 25, '\n']))
        from sklearn.ensemble.bagging import BaggingClassifier
        clf_svm0=SVC(C=10,kernel='rbf',gamma=0.1,probability=True,\
                decision_function_shape='ovr',random_state=seed,class_weight='balanced')
        pipe_svm0 = Pipeline([('scaler', Scaler()), ('clf', clf_svm0)])
        clf_bg = BaggingClassifier(base_estimator=pipe_svm0, n_estimators=10, max_samples=1.0, \
                                   max_features=1.0, random_state=seed)
        start = time.time()
        clf_bg = clf_bg.fit(X_train, Y_train)
        print('Total running time is {}s'.format(time.time() - start))
        judge = cross_val_score(clf_bg,
                                X,
                                Y,
                                groups=None,
                                scoring=Evaluate(score_func),
                                cv=5)
        #print('Cross-validation score is {}'.format(judge))
        print('Mean cross-validation score is {}'.format(judge.mean()))

    # Boosting method:GradBoost
    if 1:
        print(' '.join(