def runKNN(data, target):
    """Evaluate a default KNeighborsClassifier with cross-validation on the
    raw, min-max-scaled and standardized versions of *data*, printing the
    scores for each variant.

    BUG FIX: the banner printed "NB" inside the KNN runner (copy-paste from
    runNB) -- corrected to "KNN".  The dead `depths` loop (its variable was
    never used) is removed, and `cv` now follows the `folds` entry (still 10,
    so the splits are unchanged).
    """
    folds = [10]
    print("------------ KNN ------------")
    mms = MinMaxScaler()
    stdsc = StandardScaler()
    datamms = mms.fit_transform(data)
    datastdsc = stdsc.fit_transform(data)
    for fold in folds:
        print('fold = %d ' % fold)
        knn = KNeighborsClassifier()
        # same estimator, three preprocessing variants
        for dataset in (data, datamms, datastdsc):
            testpredict, testtarget = cross_val_pred2ict(knn, dataset, target,
                                                         cv=fold, n_jobs=-1)
            print_scores(testpredict, testtarget)
def runNB(data, target):
    """Evaluate GaussianNB with cross-validation on the raw, min-max-scaled
    and standardized versions of *data*, printing the scores for each variant.

    The original had a dead `depths` loop whose variable was never used and
    three copy-pasted CV blocks; they are consolidated into a single loop over
    the three dataset variants.  `cv` follows the `folds` entry (still 10).
    """
    folds = [10]
    print("------------ NB ------------")
    mms = MinMaxScaler()
    stdsc = StandardScaler()
    datamms = mms.fit_transform(data)
    datastdsc = stdsc.fit_transform(data)
    for fold in folds:
        print('fold = %d ' % fold)
        for dataset in (data, datamms, datastdsc):
            # fresh estimator per run, as in the original
            nvb = GaussianNB()
            testpredict, testtarget = cross_val_pred2ict(nvb, dataset, target,
                                                         cv=fold, n_jobs=-1)
            print_scores(testpredict, testtarget)
def runvoting(data, target):
    """Compare three base classifiers with hard- and soft-voting ensembles
    using stratified cross-validation, printing scores for each.

    BUG FIX: `eclf2` (the soft-voting ensemble) was constructed but never
    evaluated -- the original zip paired FOUR classifiers with FIVE labels, so
    'ESEMBLE SOFT' was silently dropped.  `eclf2` is now included.

    NOTE(review): the estimator keys and labels ('Ada', 'RandomForest',
    'SVM'/'SVM RBF') do not match the actual classifiers (KNN, decision tree,
    GaussianNB); kept byte-identical to preserve output, but worth renaming.
    """
    folds = [10]
    for fold in folds:
        print('fold = %d ' % fold)
        clf1 = KNeighborsClassifier(n_neighbors=5)
        clf2 = tree.DecisionTreeClassifier(random_state=1)
        clf3 = GaussianNB()
        skf = StratifiedKFold(n_splits=fold, random_state=2)
        eclf1 = VotingClassifier(estimators=[('Ada', clf1),
                                             ('RandomForest', clf2),
                                             ('SVM', clf3)],
                                 voting='hard')
        eclf2 = VotingClassifier(estimators=[('Ada', clf1),
                                             ('RandomForest', clf2),
                                             ('SVM', clf3)],
                                 voting='soft')
        for clf, label in zip(
                [clf1, clf2, clf3, eclf1, eclf2],
                ['Ada', 'RandomForest', 'SVM RBF', 'ESEMBLE HARD',
                 'ESEMBLE SOFT']):
            testpredict, testtarget = cross_val_pred2ict(
                clf, data, target,
                cv=skf.get_n_splits(data, target), n_jobs=-1)
            print("--------------------------")
            print(label)
            print_scores(testpredict, testtarget)
        eclf1.fit(data, target)
def _flipped(matrix):
    """Return the 2x2 confusion matrix with positive/negative classes swapped."""
    return np.array([[matrix[1, 1], matrix[1, 0]],
                     [matrix[0, 1], matrix[0, 0]]])


def _print_metrics(matrices):
    """Print accuracy, precision, recall and the three F1 variants for the
    given list of per-fold confusion matrices."""
    prec = precision(matrices)
    sens = sensitivity(matrices)
    print("Accuracy: %r" % str(accuracy(matrices)))
    print("Precision: %r" % str(prec))
    print("Recall: %r" % str(sens))
    print("f1")
    print(f1tpfp(matrices))
    print(f1prre(prec, sens))
    print(f1avg(matrices))


def runadatree(data, target):
    """Grid over tree depths and AdaBoost sizes, evaluating each combination
    with stratified cross-validation and printing metrics for both class
    orientations of the confusion matrix.

    Cleanup: the original built an unused bare DecisionTreeClassifier (`clf`)
    each iteration and duplicated the metric-printing code for the flipped
    matrices; both are factored out.
    """
    folds = [10]
    depths = [10, 100, 1000]
    estimators = [100, 1000]
    for fold in folds:
        print('fold = %d ' % fold)
        for depth in depths:
            for estimator in estimators:
                print('depth = %d ' % depth)
                print('estimators = %d ' % estimator)
                skf = StratifiedKFold(n_splits=fold, random_state=5)
                adaboosting = AdaBoostClassifier(
                    tree.DecisionTreeClassifier(max_depth=depth),
                    n_estimators=estimator)
                testpredict, testtarget = cross_val_pred2ict(
                    adaboosting, data, target, cv=skf, n_jobs=-1)
                if len(testpredict) != len(testtarget):
                    raise ValueError('length score and target are different!')
                matrices1 = [confusion_matrix(tar, pr)
                             for pr, tar in zip(testpredict, testtarget)]
                _print_metrics(matrices1)
                # same metrics with the minority/majority roles swapped
                matrices2 = [_flipped(m) for m in matrices1]
                _print_metrics(matrices2)
def runstacking(data, target):
    """Tune a StackingClassifier (KNN + tree + NB with a logistic-regression
    meta-learner) via grid search, report the CV grid, then compare the base
    classifiers and the stacking ensemble with stratified cross-validation.

    Cleanup: the original zip paired FOUR classifiers with FIVE labels, so the
    dead 'ESEMBLE SOFT' label never printed -- it is removed (output is
    unchanged).  NOTE(review): the remaining labels ('Ada', 'RandomForest',
    'SVM RBF') do not match the actual classifiers; kept byte-identical to
    preserve output, but worth renaming.
    """
    folds = [10]
    for fold in folds:
        print('fold = %d ' % fold)
        clf1 = KNeighborsClassifier(n_neighbors=5)
        clf2 = tree.DecisionTreeClassifier(random_state=1)
        clf3 = GaussianNB()
        lr = LogisticRegression(C=10.0)
        sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                  meta_classifier=lr)
        # parameter names follow mlxtend's lower-cased estimator naming
        params = {
            'kneighborsclassifier__n_neighbors': [1, 5],
            'decisiontreeclassifier__max_depth': [1, 10, 50],
            'meta-logisticregression__C': [0.1, 10.0]
        }
        grid = GridSearchCV(estimator=sclf, param_grid=params, cv=10,
                            refit=True)
        grid.fit(data, target)
        cv_keys = ('mean_test_score', 'std_test_score', 'params')
        for r, _ in enumerate(grid.cv_results_['mean_test_score']):
            print("%0.3f +/- %0.2f %r" %
                  (grid.cv_results_[cv_keys[0]][r],
                   grid.cv_results_[cv_keys[1]][r] / 2.0,
                   grid.cv_results_[cv_keys[2]][r]))
        print('Best parameters: %s' % grid.best_params_)
        print('Accuracy: %.2f' % grid.best_score_)
        skf = StratifiedKFold(n_splits=fold, random_state=2)
        eclf1 = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                   meta_classifier=lr)
        for clf, label in zip(
                [clf1, clf2, clf3, eclf1],
                ['Ada', 'RandomForest', 'SVM RBF', 'ESEMBLE HARD']):
            testpredict, testtarget = cross_val_pred2ict(
                clf, data, target,
                cv=skf.get_n_splits(data, target), n_jobs=-1)
            print("--------------------------")
            print(label)
            print_scores(testpredict, testtarget)
def runsvcn(data, target):
    """Evaluate an SVC per kernel with cross-validation on the raw,
    min-max-scaled and standardized versions of *data*, printing scores.

    Cleanup: the original contained a no-op `svc.set_params()` call and dead
    locals (`matrices1`, `matrices2`, `skf`) that were never used; all removed.
    The three copy-pasted CV blocks are consolidated, with `cv` following the
    `folds` entry (still 10).
    """
    folds = [10]
    kernels = ['rbf']
    print("------------ SVM ------------")
    mms = MinMaxScaler()
    stdsc = StandardScaler()
    datamms = mms.fit_transform(data)
    datastdsc = stdsc.fit_transform(data)
    for fold in folds:
        print('fold = %d ' % fold)
        for kernel in kernels:
            print('----- KERNEL = %s -----' % kernel)
            svc = svm.SVC(C=1, kernel=kernel)
            for dataset in (data, datamms, datastdsc):
                testpredict, testtarget = cross_val_pred2ict(
                    svc, dataset, target, cv=fold, n_jobs=-1)
                print_scores(testpredict, testtarget)
def runforest(data, target):
    """Run a RandomForestClassifier through stratified cross-validation for
    every (fold count, forest size) combination and print the scores."""
    fold_options = [10]
    estimator_options = [100]
    print("------------ RANDOM FOREST ------------")
    for n_splits in fold_options:
        print('fold = %d ' % n_splits)
        for n_trees in estimator_options:
            print('estimators = %d ' % n_trees)
            forest = RandomForestClassifier(n_estimators=n_trees)
            splitter = StratifiedKFold(n_splits=n_splits, random_state=5)
            predictions, truths = cross_val_pred2ict(
                forest, data, target,
                cv=splitter.get_n_splits(data, target), n_jobs=-1)
            print_scores(predictions, truths)
def runbaggingtree(data, target):
    """Evaluate a KNN-based BaggingClassifier with stratified cross-validation
    for every (fold, depth, estimator-count) combination and print the scores.

    BUG FIX: a StratifiedKFold was built from `fold` (=3) but `cv` was
    hard-coded to 10, contradicting the printed fold count -- the splitter is
    now actually used.  NOTE(review): despite the function name, the base
    estimator is KNeighborsClassifier, not a tree, and `depth` is only
    printed, never applied -- confirm intent with the author.
    """
    folds = [3]
    depths = [5]
    estimators = [50]
    for fold in folds:
        print('fold = %d ' % fold)
        for depth in depths:
            for estimator in estimators:
                print('depth = %d ' % depth)
                print('estimators = %d ' % estimator)
                skf = StratifiedKFold(n_splits=fold, random_state=5)
                bagging = BaggingClassifier(KNeighborsClassifier(),
                                            n_estimators=estimator)
                testpredict, testtarget = cross_val_pred2ict(
                    bagging, data, target, cv=skf, n_jobs=-1)
                print_scores(testpredict, testtarget)
# Top-level experiment fragment: repeatedly cross-validates each classifier in
# `clfs` on the dataset `db`, averages the scores, and appends one formatted
# row per metric to LaTeX tables.  Depends on names defined elsewhere in the
# file: `data`, `db`, `clfs`, `iterations`, `folds`, `tables`, plus the
# project helpers.  NOTE(review): nesting reconstructed from a flattened
# source line -- confirm loop boundaries against the original file.
print('Klasa: %s' % data)
importdata.print_info(db.target)
rows = []
# one row per reported metric (5 metrics assumed), each starting with the label
for i in range(5):
    rows.append([data])
# computations for every classifier
for clf in clfs:
    scores = []
    # repeat the classification `iterations` times
    for iteration in range(iterations):
        clf_ = clone(clf)  # fresh, unfitted copy each repetition
        # cross-validation
        testpredict, testtarget = cross_val_pred2ict(clf_, db.data, db.target,
                                                     cv=folds,
                                                     n_jobs=-1)
        scores.append(accsespf1g(testpredict, testtarget))
        print(str(clf))
        print_scores(testpredict, testtarget)
    # average the collected scores over all repetitions
    avgscores = avgaccsespf1g(scores)
    to_decimal = print_to_latex_two_decimal(avgscores)
    # append this classifier's column to each metric row
    for i, score in enumerate(to_decimal):
        rows[i].append(score)
# emit one row into each LaTeX table
for table, row in zip(tables, rows):
    print(row)
    table.add_row(row)
    table.add_hline()
def fit(self, X, y):
    """Fit the ensemble: cross-validate every base estimator, pick the best
    'expert' for each of the two class orientations of the confusion matrix
    (compared via ``self.function_compare``), then fit all estimators on the
    full data in parallel.

    BUG FIX: in the class-2 branch the tie comparison tested ``class1``
    instead of ``class2`` (copy-paste defect); it now compares ``class2``
    against ``max_rating[1]``.

    Parameters
    ----------
    X : array-like feature matrix.
    y : 1-D label vector (multilabel / multi-output is rejected).

    Returns
    -------
    self
    """
    if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
        raise NotImplementedError('Multilabel and multi-output'
                                  ' classification is not supported.')
    if self.estimators is None or len(self.estimators) == 0:
        raise AttributeError('Invalid `estimators` attribute, `estimators`'
                             ' should be a list of (string, estimator)'
                             ' tuples')
    # clone the configured estimators so the originals stay unfitted
    self.estimators_ = [
        clone(estimator) for _, estimator in self.estimators
    ]
    cv_predictions = []
    targets = []
    self.groups = np.unique(y)
    # cross-validated evaluation of every base estimator
    for estimator in self.estimators_:
        testpredict, testtarget = cross_val_pred2ict(estimator, X, y,
                                                     cv=self.n_folds,
                                                     n_jobs=1)
        cv_predictions.append(testpredict)
        targets.append(testtarget)
    # elect the expert for each class orientation
    for idx, (prediction, target) in enumerate(zip(cv_predictions, targets)):
        matrixes1 = []
        matrixes2 = []
        for pred, tar in zip(prediction, target):
            matrixes1.append(simplefunctions.confusion_matrix(tar, pred))
        # matrixes2 is matrixes1 with the positive/negative roles swapped
        for matrix in matrixes1:
            matrixes2.append(
                np.array([[matrix[1, 1], matrix[1, 0]],
                          [matrix[0, 1], matrix[0, 0]]]))
        class1 = getattr(simplefunctions, self.function_compare)(matrixes1)
        if class1 > self.max_rating[0]:
            self.max_rating[0] = class1
            self.experts[0] = idx
            self.g_mean[0] = simplefunctions.g_meantpfp(matrixes1)
        elif class1 == self.max_rating[0]:
            # tie: later estimator wins the slot
            self.experts[0] = idx
            self.g_mean[0] = simplefunctions.g_meantpfp(matrixes1)
        class2 = getattr(simplefunctions, self.function_compare)(matrixes2)
        if class2 > self.max_rating[1]:
            self.max_rating[1] = class2
            self.experts[1] = idx
            # NOTE(review): g-mean computed from matrixes1 here as in the
            # original -- presumably equivalent under the class swap; confirm.
            self.g_mean[1] = simplefunctions.g_meantpfp(matrixes1)
        elif class2 == self.max_rating[1]:  # BUG FIX: originally compared class1
            self.experts[1] = idx
            self.g_mean[1] = simplefunctions.g_meantpfp(matrixes1)
    # final fit of every estimator on the full data, in parallel
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_parallel_fit_estimator)(clone(clf), X, y)
        for _, clf in self.estimators)
    return self
def cross_val_oversampling_before(data, target): print("Testowanie CV bez oversamplingu") # skalowanie dla SVM i kNN stdsc = StandardScaler() datastdsc = stdsc.fit_transform(data) rows_normal = [] rows_stand = [] # klasyfikator NB i tree for clf in clfs_normal: print('Klasyfikator: %s' % clf[0]) clf_ = clone(clf[1]) # CV testpredict, testtarget = cross_val_pred2ict(clf_, data, target, cv=folds, n_jobs=-1) print_scores(testpredict, testtarget) row = [] row.extend(print_to_latex_sespf1g(testpredict, testtarget)) # roc testroc = cross_val_predict(clf_, data, target, cv=folds, n_jobs=-1, method='predict_proba') row.append( float("{0:.2f}".format( roc_auc_score(y_true=target, y_score=testroc[:, 1])))) rows_normal.extend(row) # klasyfikator SVM i kNN for clf in clfs_stand: print('Klasyfikator: %s' % clf[0]) clf_ = clone(clf[1]) # CV testpredict, testtarget = cross_val_pred2ict(clf_, datastdsc, target, cv=folds, n_jobs=-1) print_scores(testpredict, testtarget) row = [] row.extend(print_to_latex_sespf1g(testpredict, testtarget)) # roc testroc = cross_val_predict(clf_, datastdsc, target, cv=folds, n_jobs=-1, method='predict_proba') row.append( float("{0:.2f}".format( roc_auc_score(y_true=target, y_score=testroc[:, 1])))) rows_stand.extend(row) return rows_normal, rows_stand
def runtree(data, target):
    """Compare three ways of combining SMOTEENN resampling with
    cross-validation: (1) plain CV, (2) the WRONG way -- CV on data resampled
    before splitting (overestimates scores), (3) resampling on a held-out
    split, and (4) the correct way -- resampling inside each CV training fold
    only.  Prints scores and ROC AUC for each.

    NOTE(review): despite the name/banner ("TREE") the CV classifier is
    GaussianNB and the fold-wise classifier is KNeighborsClassifier -- confirm
    intent.  `skf` is created in the depth loop and reused by the final
    fold-wise loop below.
    """
    sm = SMOTEENN()
    # stratified hold-out split used for the "resample train, test on raw" case
    X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                        test_size=0.10,
                                                        random_state=5,
                                                        stratify=target)
    print(y_test.size)
    print(np.bincount(y_test))
    # resample ONLY the training part of the hold-out split
    X_resampled, y_resampled = sm.fit_sample(X_train, y_train)
    folds = [10]
    depths = [10]
    print("------------ TREE ------------")
    for fold in folds:
        print('fold = %d ' % fold)
        for depth in depths:
            print('depth = %d ' % depth)
            clf = GaussianNB()
            skf = StratifiedKFold(n_splits=fold, random_state=5)
            # (1) baseline: plain CV on the raw data
            testpredict, testtarget = cross_val_pred2ict(clf, data, target,
                                                         cv=fold,
                                                         n_jobs=-1)
            print_scores(testpredict, testtarget)
            # (2) CV on globally resampled data -- scores are overestimated
            print('smotens - przecenione')
            testpredict, testtarget = cross_val_pred2ict(clf, X_resampled,
                                                         y_resampled, cv=fold,
                                                         n_jobs=-1)
            print_scores(testpredict, testtarget)
            # ROC AUC for both setups from cross-validated probabilities
            testpredict = cross_val_predict(clf, data, target, cv=fold,
                                            n_jobs=-1,
                                            method='predict_proba')
            print(roc_auc_score(y_true=target, y_score=testpredict[:, 1]))
            testpredict = cross_val_predict(clf, X_resampled, y_resampled,
                                            cv=fold, n_jobs=-1,
                                            method='predict_proba')
            print(roc_auc_score(y_true=y_resampled,
                                y_score=testpredict[:, 1]))
            # (3) train on the resampled training split, test on raw hold-out
            print('smotens - na czesci')
            clf.fit(X_resampled, y_resampled)
            print_scores([clf.predict(X_test)], [y_test])
            # print(roc_auc_score(y_true=y_test, y_score=clf.predict_proba(X_test)[:, 1]))
            # (4) correct protocol: resample inside each CV training fold only
            print('smotens - wlasciwe')
            clf_train = KNeighborsClassifier()
            predict_re = []
            targets_re = []
            proba_re = []
            target_proba_re = []
            for train_index, test_index in skf.split(data, target):
                clf_train_ = clone(clf_train)
                data_re, tar_re = sm.fit_sample(data[train_index],
                                                target[train_index])
                clf_train_.fit(data_re, tar_re)
                predict_re.append(clf_train_.predict(data[test_index]))
                targets_re.append(target[test_index])
                # accumulate class-1 probabilities for a pooled ROC AUC
                proba_re.extend(
                    clf_train_.predict_proba(data[test_index])[:, 1])
                target_proba_re.extend(target[test_index])
            print_scores(predict_re, targets_re)
            # print(test_re)
            # print(proba_re)
            print(roc_auc_score(y_true=target_proba_re, y_score=proba_re))
def fit(self, X, y):
    """Build a heterogeneous candidate pool (AdaBoost, Bagging, Stacking,
    expert classifier), evaluate every candidate by CV on the raw data and on
    SMOTE-/NCR-resampled training folds, keep the top-ranked candidates as
    the ensemble, and fit the meta-classifier on their meta-features.

    BUG FIXES:
    * ``self.clfs[(clf_id - 7) / 2]`` -- true division yields a float under
      Python 3 and raises ``TypeError`` when used as a list index; now ``//``.
    * the second ranking branch wrote ``self.max_g[0] = fun_cmp``, clobbering
      the best score instead of recording the runner-up; now ``max_g[1]``.
    Cleanup: the unused ``matrixes2`` computation is removed and ``return
    self`` is added per scikit-learn convention.

    NOTE(review): the first ranking branch does not cascade the old runner-up
    into slot 2, and the ``clf_id > len(ada) + len(bag)`` threshold and the
    magic ``- 7`` offset into the resampled-candidate index space look
    fragile -- confirm against the candidate-ordering logic above.

    Returns
    -------
    self
    """
    if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
        raise NotImplementedError('Multilabel and multi-output'
                                  ' classification is not supported.')
    if self.estimators is None or len(self.estimators) == 0:
        raise AttributeError('Invalid `estimators` attribute, `estimators`'
                             ' should be a list of (string, estimator)'
                             ' tuples')
    cv_predictions = []
    targets = []
    # clone the configured estimators so the originals stay unfitted
    self.estimators_ = [
        clone(estimator) for _, estimator in self.estimators
    ]
    # AdaBoost candidates
    for clf in self.estimators_ada:
        self.clfs.append(
            AdaBoostClassifier(clone(clf), n_estimators=self.n_estimators))
    # Bagging candidates
    for clf in self.estimators_bag:
        self.clfs.append(
            BaggingClassifier(clone(clf), n_estimators=100, max_samples=0.9))
    self.clfs.append(
        StackingClassifier(classifiers=self.estimators_,
                           meta_classifier=LogisticRegression()))
    self.clfs.append(clf_expert(self.estimators))
    # CV evaluation of every candidate on the raw data
    for clf in self.clfs:
        testpredict, testtarget = cross_val_pred2ict(clf, X, y,
                                                     cv=self.n_folds,
                                                     n_jobs=1)
        cv_predictions.append(testpredict)
        targets.append(testtarget)
    skf = StratifiedKFold(n_splits=2, random_state=self.random_st)
    # train and evaluate every candidate on SMOTE- and NCR-resampled folds
    for clf in self.clfs:
        for method, name in zip(self.methoda, self.name_met):
            metodaa = SMOTE(k_neighbors=3, random_state=self.random_st)
            metodaj = NeighbourhoodCleaningRule(
                n_neighbors=3, random_state=self.random_st)
            predict_re = []
            targets_re = []
            for train_index, test_index in skf.split(X, y):
                # resample only the training fold, never the test fold
                if method == 0:
                    data_re, tar_re = metodaa.fit_sample(
                        np.asarray(X[train_index]),
                        np.asarray(y[train_index]))
                else:
                    data_re, tar_re = metodaj.fit_sample(
                        np.asarray(X[train_index]),
                        np.asarray(y[train_index]))
                clf_ = clone(clf)
                clf_.fit(data_re, tar_re)
                predict_re.append(clf_.predict(X[test_index]))
                targets_re.append(y[test_index])
            cv_predictions.append(predict_re)
            targets.append(targets_re)
    # elect the best experts by the configured comparison function
    for idx, (prediction, target) in enumerate(zip(cv_predictions, targets)):
        matrixes1 = []
        for pred, tar in zip(prediction, target):
            matrixes1.append(simplefunctions.confusion_matrix(tar, pred))
        fun_cmp = getattr(simplefunctions, self.function_compare)(matrixes1)
        if fun_cmp > self.max_g[0]:
            # new best: demote the previous best to slot 1
            self.clf_id[1] = self.clf_id[0]
            self.clf_id[0] = idx
            self.max_g[1] = self.max_g[0]
            self.max_g[0] = fun_cmp
        elif fun_cmp > self.max_g[1]:
            # new runner-up: demote the previous runner-up to slot 2
            self.clf_id[2] = self.clf_id[1]
            self.clf_id[1] = idx
            self.max_g[2] = self.max_g[1]
            self.max_g[1] = fun_cmp  # BUG FIX: original overwrote max_g[0]
        elif fun_cmp > self.max_g[2]:
            self.clf_id[2] = idx
            self.max_g[2] = fun_cmp
    # refit the selected candidates on the full data (resampled when the id
    # points into the resampled-evaluation index space)
    for clf_id in self.clf_id:
        if clf_id > len(self.estimators_ada) + len(self.estimators_bag):
            # even ids used SMOTE, odd ids used NCR during evaluation
            if clf_id % 2 == 0:
                met = self.methods[0]
            else:
                met = self.methods[1]
            data_re, tar_re = met.fit_sample(X, y)
            # BUG FIX: integer division -- a float index raises TypeError
            clf_ = clone(self.clfs[(clf_id - 7) // 2])
            self.ensemble_.append(clf_.fit(data_re, tar_re))
        else:
            clf_ = clone(self.clfs[clf_id])
            self.ensemble_.append(clf_.fit(X, y))
    meta_features = self._predict_meta_features(X)
    self.meta_clf_.fit(meta_features, y)
    return self