def classifier_list():
    clfs = {}
    ### Forests
    clfs['grf'] = RandomForestClassifier(n_jobs=4, criterion='gini')
    clfs['erf'] = RandomForestClassifier(n_jobs=4, criterion='entropy')
    clfs['etr'] = ExtraTreesClassifier()
    ### Boosting
    # clfs['gbc'] = GradientBoostingClassifier()
    # clfs['ada'] = AdaBoostClassifier()
    # clfs['bag'] = BaggingClassifier()
    ### SVM
    clfs['lsvm'] = LinearSVC()
    # clfs['qsvm'] = SVC(probability=True, kernel='poly', degree=2)  # Slow
    # clfs['psvm'] = SVC(probability=True, kernel='poly', degree=3)  # Slow
    # clfs['ssvm'] = SVC(probability=True, kernel='sigmoid')  # Slow
    # clfs['rsvm'] = SVC(probability=True, kernel='rbf')  # Slow
    ### Naive Bayes
    # clfs['gnb'] = GaussianNB()  # Worst
    clfs['bnb'] = BernoulliNB()  # Good
    clfs['mnb'] = MultinomialNB()  # Best
    ### Decision Tree (CART)
    clfs['gdt'] = DecisionTreeClassifier(criterion='gini')
    clfs['edt'] = DecisionTreeClassifier(criterion='entropy')
    clfs['egt'] = ExtraTreeClassifier(criterion='gini')
    clfs['eet'] = ExtraTreeClassifier(criterion='entropy')
    return clfs
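# Usage sketch (an addition, not from the original source): score every model
# in the dictionary on a held-out split. `X` and `y` are assumed to be a
# preloaded feature matrix and label vector.
from sklearn.model_selection import train_test_split

def score_classifier_list(X, y):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
    scores = {}
    for name, clf in classifier_list().items():
        clf.fit(X_tr, y_tr)
        scores[name] = clf.score(X_te, y_te)  # mean accuracy on the test split
    return scores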
def __init__(self, data, target):
    self.data = data
    if len(target.shape) == 2:
        # Convert 2-dim target array into 1-dim target array
        self.target = target.reshape(target.shape[0])
    else:
        self.target = target
    self.training_data = None
    self.training_target = None
    self.test_data = None
    self.test_target = None
    # Construct 4 Tier-1 (base) classifiers
    self.Tier1_classifier1 = LogisticRegression(solver="lbfgs")
    self.Tier1_classifier2 = MultinomialNB()
    self.Tier1_classifier3 = LinearSVC(penalty="l2")
    self.Tier1_classifier4 = ExtraTreeClassifier()
    # self.Tier1_classifier5 = SGDClassifier(max_iter=1000, tol=1e-3)
    # Construct Tier-2 (meta) classifier
    # self.meta_classifier = LogisticRegression(solver="lbfgs")
    # self.meta_classifier = MultinomialNB()
    # self.meta_classifier = LinearSVC(penalty="l2")
    self.meta_classifier = ExtraTreeClassifier()
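# Hypothetical sketch of how the two tiers could be combined (the class's
# actual training method is not shown here): fit each Tier-1 classifier,
# stack their predicted labels column-wise, and train the Tier-2 meta
# classifier on that matrix. Assumes numpy and populated training attributes.
import numpy as np

def _fit_stack_sketch(self):
    tier1 = [self.Tier1_classifier1, self.Tier1_classifier2,
             self.Tier1_classifier3, self.Tier1_classifier4]
    for clf in tier1:
        clf.fit(self.training_data, self.training_target)
    # one column of meta features per base classifier
    meta_features = np.column_stack(
        [clf.predict(self.training_data) for clf in tier1])
    self.meta_classifier.fit(meta_features, self.training_target)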
def dTree(data, labels, test, impurity="gini", mdepth=None):
    newData = pd.DataFrame()
    newTest = pd.DataFrame()
    le = LabelEncoder()
    # Fit the encoder on train and test values together so both splits share
    # one consistent label mapping (fitting each split separately would assign
    # different codes to the same category).
    for column in data:
        le.fit(pd.concat([data[column], test[column]]))
        newData[column] = le.transform(data[column])
        newTest[column] = le.transform(test[column])
    tree1 = DecisionTreeClassifier(criterion=impurity, max_depth=mdepth, random_state=42)
    tree2 = ExtraTreeClassifier(criterion=impurity, max_depth=mdepth, random_state=42)
    tree3 = RandomForestClassifier(criterion=impurity, max_depth=mdepth, random_state=42)
    tree1.fit(newData, labels)
    tree2.fit(newData, labels)
    tree3.fit(newData, labels)
    # `evaluate`, `voting` and `validation_genres` are module-level helpers
    predict1 = tree1.predict(newTest)
    print("tree1", evaluate(predict1, validation_genres))
    predict2 = tree2.predict(newTest)
    print("tree2", evaluate(predict2, validation_genres))
    predict3 = tree3.predict(newTest)
    print("tree3", evaluate(predict3, validation_genres))
    combined_prediction = voting([predict1, predict2, predict3], [1, 1, 1])
    return combined_prediction
def evaluate_optimal_classifier(features, classes):
    # Obtain the classifier for the current experiment
    et = ExtraTreeClassifier(criterion='entropy', max_leaf_nodes=1200,
                             min_samples_leaf=1, max_depth=7, random_state=5)
    classifier = AdaBoostClassifier(base_estimator=et, n_estimators=500,
                                    algorithm='SAMME', learning_rate=0.1,
                                    random_state=5)
    # Split the data set into training and test set
    train_X, test_X, train_Y, test_Y = train_test_split(features, classes, test_size=0.2)
    # Fit the training data to the model
    classifier.fit(train_X, train_Y)
    # Predict the classes of the test set
    predicted_classes = classifier.predict(test_X)
    # Compute the confusion matrix
    matrix = confusion_matrix(test_Y, predicted_classes)
    print("Confusion matrix:")
    print(matrix)
    # Compute the weighted F1-score
    f1_measure = f1_score(test_Y, predicted_classes, average="weighted")
    print("Weighted F1-Score: {:0.3f}".format(f1_measure))
    # Compute a classification report
    report = classification_report(test_Y, predicted_classes)
    print("Classification report:")
    print(report)
def init_classifiers(seed):
    return {
        'AdaBoostClassifier': AdaBoostClassifier(random_state=seed),
        'BaggingClassifier': BaggingClassifier(random_state=seed),
        'ExtraTreesClassifier': ExtraTreesClassifier(random_state=seed),
        'GradientBoostingClassifier': GradientBoostingClassifier(random_state=seed),
        'RandomForestClassifier': RandomForestClassifier(random_state=seed),
        'XGBClassifier': xgb.XGBClassifier(),
        'LogisticRegression': LogisticRegression(random_state=seed),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(random_state=seed),
        'RidgeClassifier': RidgeClassifier(random_state=seed),
        'RidgeClassifierCV': RidgeClassifierCV(),
        'SGDClassifier': SGDClassifier(random_state=seed),
        # 'KNeighborsClassifier': KNeighborsClassifier(),
        # 'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'MLPClassifier': MLPClassifier(random_state=seed),
        'DecisionTreeClassifier': DecisionTreeClassifier(random_state=seed),
        'ExtraTreeClassifier': ExtraTreeClassifier(random_state=seed)
    }
def main():
    np.random.seed(20)

    def scorer(est, x, y):
        y_hat = est.predict(x)
        # accuracy_score lives in sklearn.metrics; the old
        # sklearn.metrics.classification module has been removed
        return accuracy_score(y, y_hat)

    # x, y = make_classification(n_samples=1000, n_classes=4, n_informative=10)
    x, y = fetch_kddcup99(return_X_y=True)
    x = np.array(x[:, 4:], dtype=np.float32)  # drop the categorical columns
    y = preprocessing.LabelEncoder().fit_transform(y)
    # X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    myclf = Mree(split_method=greedy_classification)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine greedy classification result", score, np.mean(score))
    myclf = Mree(split_method=greedy_classification_p_at_k)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine greedy p@k classification result", score, np.mean(score))
    myclf = Mree(split_method=random_classify_p_at_k)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine random p@k classification result", score, np.mean(score))
    clf = DecisionTreeClassifier(max_depth=10, max_features=20, min_impurity_decrease=0.000001)
    score = cross_val_score(clf, x, y, cv=5, scoring=scorer)
    print("Sklearn greedy classification result", score, np.mean(score))
    clf = ExtraTreeClassifier(max_depth=10, max_features=20, min_impurity_decrease=0.000001)
    score = cross_val_score(clf, x, y, cv=5, scoring=scorer)
    print("Sklearn random classification result", score, np.mean(score))
def serialize_class(self):
    """Train one of the candidate models below, then serialize it to HDF5."""
    # Each assignment below overwrites the previous one; only the last
    # uncommented estimator is actually trained and serialized.
    clf = SVC(C=3.0, kernel='poly', degree=5)
    clf = SVR()
    clf = LinearSVC(loss='hinge', tol=0.001, C=2.0)
    clf = LinearRegression(fit_intercept=True, n_jobs=2)
    clf = GaussianNB()
    clf = SGDClassifier(loss='hinge', learning_rate='optimal', alpha=0.0001)
    clf = KNeighborsClassifier(n_neighbors=6, weights='uniform',
                               algorithm='ball_tree', leaf_size=32)
    # clf = RadiusNeighborsClassifier()
    clf = GradientBoostingClassifier(n_estimators=100)
    clf = ExtraTreeClassifier()
    clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
    clf = DecisionTreeRegressor()
    # clf = ExtraTreeRegressor()
    # clf = GradientBoostingClassifier(n_estimators=10)
    clf = AdaBoostClassifier(n_estimators=2)
    # clf = AdaBoostRegressor()
    # clf = BaggingClassifier()
    # clf = BaggingRegressor()
    # clf = ExtraTreesClassifier(n_estimators=1)
    # clf = ExtraTreesRegressor()
    # clf = RandomForestClassifier()
    classifier, X_test, y_test, X = self.train_model(clf)
    print("Serializing...")
    self.save_model(classifier)
    return X_test, y_test, classifier
def GET_ALLKINDS_MODELS(prediction_type=None):
    if prediction_type == "C":
        return {
            "LR": LogisticRegression(),
            "LDA": LinearDiscriminantAnalysis(),
            "GNB": GaussianNB(),
            "KNC": KNeighborsClassifier(),
            "SVC": SVC(),
            "ETC": ExtraTreeClassifier(),
            "DTC": DecisionTreeClassifier(),
            "ETC_Ensemble": ExtraTreesClassifier(),
            "RFC_Ensemble": RandomForestClassifier(),
            "ABC_Ensemble": AdaBoostClassifier(),
            "GBC_Ensemble": GradientBoostingClassifier()
        }
    elif prediction_type == "R":
        return {
            "LR": LinearRegression(),
            "RIDGE": Ridge(),
            "LASSO": Lasso(),
            "EN": ElasticNet(),
            "KNR": KNeighborsRegressor(),
            "SVR": SVR(),
            "ETR": ExtraTreeRegressor(),
            "DTR": DecisionTreeRegressor(),
            "ETR_Ensemble": ExtraTreesRegressor(),
            "RFR_Ensemble": RandomForestRegressor(),
            "ABR_Ensemble": AdaBoostRegressor(),
            "GBR_Ensemble": GradientBoostingRegressor()
        }
    else:
        raise ValueError("prediction_type must be 'C' (classification) or 'R' (regression)")
def define_models(models=dict()):
    # linear models (each dict key names a model, each value is the estimator)
    models['logistic'] = LogisticRegression()
    # try several regularisation strengths for ridge
    alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for a in alpha:
        models['ridge-' + str(a)] = RidgeClassifier(alpha=a)
    models['sgd'] = SGDClassifier(max_iter=1000, tol=1e-3)
    models['pa'] = PassiveAggressiveClassifier(max_iter=1000, tol=1e-3)
    # non-linear models: try different neighbourhood sizes for KNN
    n_neighbors = range(1, 21)
    for k in n_neighbors:
        models['knn-' + str(k)] = KNeighborsClassifier(n_neighbors=k)
    models['cart'] = DecisionTreeClassifier()
    models['extra'] = ExtraTreeClassifier()
    models['svml'] = SVC(kernel='linear')
    models['svmp'] = SVC(kernel='poly')
    c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for c in c_values:
        models['svmr' + str(c)] = SVC(C=c)
    models['bayes'] = GaussianNB()
    # ensemble models
    n_trees = 100
    models['ada'] = AdaBoostClassifier(n_estimators=n_trees)
    models['bag'] = BaggingClassifier(n_estimators=n_trees)
    models['rf'] = RandomForestClassifier(n_estimators=n_trees)
    models['et'] = ExtraTreesClassifier(n_estimators=n_trees)
    models['gbm'] = GradientBoostingClassifier(n_estimators=n_trees)
    print('Defined %d models' % len(models))
    return models
def apply_extra_trees_classifier(trainData, targetTrain, testData, targetTest):
    """
    Fits an extremely randomized tree to the training data and reports its
    accuracy on the test data.

    Args:
        trainData: feature matrix for the training split
        targetTrain: class labels for the training split
        testData: feature matrix for the test split
        targetTest: class labels for the test split
    """
    # fit a single extra tree (CART-style) model to the data
    etc = ExtraTreeClassifier(class_weight=None, criterion='gini',
                              max_depth=None, max_features='auto',
                              max_leaf_nodes=None, min_samples_leaf=1,
                              min_samples_split=2,
                              min_weight_fraction_leaf=0.0,
                              random_state=None, splitter='random')
    etc.fit(trainData, targetTrain)
    print(etc)
    # make predictions
    expected = targetTest
    predicted = etc.predict(testData)
    # summarize the fit of the model
    print(accuracy_score(expected, predicted))
def get_hyperparameters_model():
    criterion = ['gini', 'entropy']
    splitter = ['best', 'random']
    max_depth = [20, 100]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    max_features = ['auto', 'sqrt', 'log2', None]
    class_weight = ['balanced', None]
    param_dist = {
        'cls__criterion': criterion,
        'cls__splitter': splitter,
        'cls__max_depth': max_depth,
        'cls__min_samples_split': min_samples_split,
        'cls__min_samples_leaf': min_samples_leaf,
        'cls__max_features': max_features,
        'cls__class_weight': class_weight
    }
    clf = ExtraTreeClassifier()
    model = {
        'extra_tree_classifier': {
            'model': clf,
            'param_distributions': param_dist
        }
    }
    return model
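# Usage sketch (an assumption inferred from the 'cls__' prefixes above): the
# prefix means the estimator is expected to sit in a Pipeline step named
# 'cls', so the returned dictionary plugs straight into RandomizedSearchCV.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def search_extra_tree(X, y):
    spec = get_hyperparameters_model()['extra_tree_classifier']
    pipe = Pipeline([('scaler', StandardScaler()), ('cls', spec['model'])])
    search = RandomizedSearchCV(pipe, spec['param_distributions'],
                                n_iter=20, cv=5, random_state=0)
    return search.fit(X, y)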
def Extra_Tree(self, X_train, y_train, X_test, y_test):
    max_depth = [5, 10, 25, 50, 75, 100]
    min_samples_leaf = [1, 2, 4, 8, 10]
    min_samples_split = [2, 4, 6, 8, 10]
    max_features = ["auto", "sqrt", "log2", None]
    criterion = ["gini", "entropy"]
    splitter = ["best", "random"]
    hyperparameter = {
        "max_depth": max_depth,
        "min_samples_leaf": min_samples_leaf,
        "min_samples_split": min_samples_split,
        "max_features": max_features,
        "criterion": criterion,
        "splitter": splitter
    }
    n_folds = 10
    my_cv = TimeSeriesSplit(n_splits=n_folds).split(X_train)
    et = ExtraTreeClassifier(random_state=42)
    rsearch_cv = RandomizedSearchCV(estimator=et,
                                    param_distributions=hyperparameter,
                                    n_iter=50, scoring="f1_macro",
                                    n_jobs=-1, cv=my_cv, random_state=42)
    rsearch_cv.fit(X_train, y_train)
    et_best = rsearch_cv.best_estimator_
    et_best.fit(X_train, y_train)
    y_pred = et_best.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred, normalize=True) * 100
    precision = np.round(metrics.precision_score(y_test, y_pred, average="macro"), 4)
    recall = np.round(metrics.recall_score(y_test, y_pred, average="macro"), 4)
    f1 = np.round(metrics.f1_score(y_test, y_pred, average="macro"), 4)
    return et_best, test_accuracy, precision, recall, f1
def setUpClass(cls):
    np.random.seed(seed=1234)
    cls.sklearn_model = ExtraTreeClassifier()
    cls.classifier = ScikitlearnExtraTreeClassifier(model=cls.sklearn_model)
    cls.classifier.fit(x=x_train, y=y_train)
def extratree(typ, X_train, Y_train, X_test, Y_test, text):
    text.delete(1.0, tk.END)
    text.insert(
        tk.END, "\n\nIMPORTING ExtraTree"
        + "\nProcessing this might take a while...", "bold")
    text.update_idletasks()
    from sklearn.tree import ExtraTreeClassifier
    ETC = ExtraTreeClassifier()
    ETC.fit(X_train, Y_train)
    Y_pred = ETC.predict(X_test)
    # Metric functions take (y_true, y_pred) in that order
    text.insert(
        tk.END, "\n\nExtra Tree Classifier report \n"
        + classification_report(Y_test, Y_pred), "bold")
    text.insert(
        tk.END, "*****roc_auc_score: %0.3f*****\n"
        % roc_auc_score(Y_test, Y_pred), "bold")
    text.insert(
        tk.END, "Extra Tree Classifier confusion matrix \n"
        + str(confusion_matrix(Y_test, Y_pred)), "bold")
    # Compare predictions against the true labels (not against themselves)
    score = accuracy_score(Y_test, Y_pred)
    text.insert(tk.END, "Extra tree score= %s" % score, "bold")
    text.update_idletasks()
    roc_curve_acc(Y_test, Y_pred, 'ETC')
    if typ == "s":
        plt.show()
    elif typ == "a":
        pass
def start(self):
    """
    01. Initialise the data paths and transformation functions.
    """
    self.data_dir = '../data/raw_data'
    self.trans_primitives = ['weekday', 'hour', 'time_since_previous']
    self.agg_primitives = [
        'mean', 'max', 'min', 'std', 'count', 'percent_true',
        'last', 'time_since_last', 'mode'
    ]
    self.ignore_cols = [
        'num_contacts', 'num_referrals', 'num_successful_referrals'
    ]
    self.feature_windows = [10, 30, 60, 90]  # [10, 20, 30]
    self.max_feature_depth = 2
    # list of estimators to use
    self.estimators = [
        ('cbc', CatBoostClassifier()),
        ('lgbmc', LGBMClassifier()),
        ('gbc', GradientBoostingClassifier(validation_fraction=0.15,
                                           n_iter_no_change=50)),
        ('et', ExtraTreeClassifier()),
        ('abc', AdaBoostClassifier()),
        ('rfc', RandomForestClassifier()),
        ('bc', BaggingClassifier()),
        ('etc', ExtraTreesClassifier()),
        ('gnb', GaussianNB()),
        ('mlpc', MLPClassifier()),
        ('gpc', GaussianProcessClassifier()),
        ('dtc', DecisionTreeClassifier()),
        ('qda', QuadraticDiscriminantAnalysis()),
        ('lr', LogisticRegression()),
        ('knn3', KNeighborsClassifier(3)),
        ('knn6', KNeighborsClassifier(6)),
        ('knn12', KNeighborsClassifier(12)),
        ('nc', NearestCentroid()),
        ('rnc', RadiusNeighborsClassifier()),
        ('lp', LabelPropagation()),
        ('pac', PassiveAggressiveClassifier()),
        ('rc', RidgeClassifier()),
        ('sgdc', SGDClassifier()),
        ('svc', SVC()),
        ('ngbc', NGBClassifier(Dist=Bernoulli))
    ]
    self.next(self.load_raw_data)
def __init__(self):
    # algorithm name
    self._name = 'extratree'
    # base path
    self._f_path = os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
    # suppress warning messages
    warnings.filterwarnings('ignore')
    # load the raw data
    data = pd.read_csv(self._f_path + "/classifier/resource/classifier_sample.csv",
                       sep=",", encoding="utf-8")
    # separate the features from the label (ground truth)
    self._x = data.drop("quality", axis=1)
    self._y = data["quality"]
    # split into training and test data
    self._x_train, self._x_test, self._y_train, self._y_test = train_test_split(
        self._x, self._y, test_size=0.2, shuffle=True, random_state=42)
    # declare the model
    self._model = ExtraTreeClassifier()
    # train the model
    self._model.fit(self._x_train, self._y_train)
def clf_scan(xtrain, ytrain, xtest=None, ytest=None, cv=5):
    """
    Function to perform k-fold cross validation on some standard classifiers.
    Note, it may take a long time for some of the classifiers to converge on
    un-scaled data. Use un-scaled data with caution.

    :param xtrain: Matrix of features from the training set
    :param ytrain: Class labels from the training set
    :param cv: # of folds to use during k-fold cross validation of each model
    :param xtest: Matrix of features from the testing set
    :param ytest: Class labels from the testing set
    :return: results: Dictionary mapping classifier names to scores
    """
    clfs = {
        'LogisticRegression': LogisticRegression(),
        'MLPClassifier': MLPClassifier(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'SGD Classifier': SGDClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(), n_estimators=50),
        'GradientBoostClassifier': GradientBoostingClassifier(),
        'SVC(rbf)': SVC(kernel='rbf', probability=True),
        'KNearestNeighbors': KNeighborsClassifier(),
        # single extremely randomized tree (not the ExtraTrees ensemble)
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'RandomForestClassifier': RandomForestClassifier(n_estimators=50)
    }
    results = {}
    print('\n====== > Evaluating cross validation scores')
    for name, clf in clfs.items():
        print('==> Current estimator:\n%s\n' % clf)
        scores = cross_val_score(clf, xtrain, ytrain, cv=cv)
        results[name] = scores
    for name in clfs.keys():
        print("%25s :: Accuracy: %0.3f%% (+/- %0.3f%%)" %
              (name, 100 * results[name].mean(), 100 * results[name].std() * 2))
    if (xtest is not None) and (ytest is not None):
        test_results = {}
        cohen_kappa_results = {}
        print('=========================================================')
        print('Performing model fits on training/testing data.')
        for name, clf in clfs.items():
            print('Processing %30s' % name)
            try:
                clf.fit(xtrain, ytrain)
                test_score = clf.score(xtest, ytest)
                test_results[name] = test_score
                y_pred = clf.predict(xtest)
                kappa = cohen_kappa_score(ytest, y_pred)
                cohen_kappa_results[name] = kappa
            except Exception as e:
                print('Error encountered calculating score on test data for %s. '
                      'It may not have a built-in .score attribute!' % name)
                print('Exception: ', e)
        for name in clfs.keys():
            print("%25s :: Accuracy: %0.3f%%\n"
                  "%25s :: Cohen's Kappa: %0.3f" %
                  (name, 100 * test_results[name], " ", cohen_kappa_results[name]))
    return results
def define_models(models=dict()):
    # linear models
    models["logistic"] = LogisticRegression()
    alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for value in alpha:
        models["ridge-" + str(value)] = RidgeClassifier(alpha=value)
    models["sgd"] = SGDClassifier(max_iter=1000, tol=1e-3)
    models["pa"] = PassiveAggressiveClassifier(max_iter=1000, tol=1e-3)
    # non-linear models
    n_neighbors = range(1, 21)
    for k in n_neighbors:
        models["knn-" + str(k)] = KNeighborsClassifier(n_neighbors=k)
    models["cart"] = DecisionTreeClassifier()
    models["extra"] = ExtraTreeClassifier()
    models["svml"] = SVC(kernel="linear")
    models["svmp"] = SVC(kernel="poly")
    c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for c in c_values:
        models["svmr" + str(c)] = SVC(C=c)
    models["bayes"] = GaussianNB()
    # ensemble models
    n_trees = 100
    models["ada"] = AdaBoostClassifier(n_estimators=n_trees)
    models["bag"] = BaggingClassifier(n_estimators=n_trees)
    models["rf"] = RandomForestClassifier(n_estimators=n_trees)
    models["et"] = ExtraTreesClassifier(n_estimators=n_trees)
    models["gbm"] = GradientBoostingClassifier(n_estimators=n_trees)
    print("Defined %d models" % len(models))
    return models
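# Companion sketch (an addition, not from the original source): spot-check
# every defined model with k-fold cross-validation, skipping any estimator
# that fails to fit on the given data.
from sklearn.model_selection import cross_val_score

def evaluate_models(X, y, models, folds=10, metric="accuracy"):
    results = {}
    for name, model in models.items():
        try:
            scores = cross_val_score(model, X, y, cv=folds, scoring=metric)
            results[name] = scores.mean()
            print(">%s: %.3f" % (name, results[name]))
        except Exception:
            print(">%s: failed" % name)
    return results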
def setUp(self):
    super().setUp()
    self.model = ExtraTreeClassifier()
    iris = load_iris()
    X = iris.data.astype(np.float32)
    y = iris.target.astype(np.int32)
    self.model.fit(X, y)
def get_experiment_5():
    et = ExtraTreeClassifier(criterion='entropy', max_leaf_nodes=1200,
                             min_samples_leaf=1, random_state=5)
    classifier = AdaBoostClassifier(base_estimator=et, n_estimators=500,
                                    algorithm='SAMME', learning_rate=0.1,
                                    random_state=5)
    param_grid = {
        'base_estimator__max_depth': [5, 7]
    }
    return (classifier, param_grid)
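# Usage sketch (assumed wiring): the (classifier, param_grid) pair returned by
# get_experiment_5() is shaped for GridSearchCV, which tunes the depth of the
# AdaBoost base estimator through the 'base_estimator__' parameter prefix.
from sklearn.model_selection import GridSearchCV

def run_experiment_5(features, classes):
    classifier, param_grid = get_experiment_5()
    search = GridSearchCV(classifier, param_grid, cv=5, scoring='f1_weighted')
    search.fit(features, classes)
    return search.best_estimator_, search.best_params_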
def train_different_clf(data):
    # .as_matrix() was removed from pandas; index the columns and use .values
    X = data[features].values
    y = data[label].values
    ### split the data
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    print('NB')
    model = GaussianNB()
    validate_model(model, X, y, features_train, labels_train, features_test, labels_test)
    print('DTC')
    model = DecisionTreeClassifier()
    validate_model(model, X, y, features_train, labels_train, features_test, labels_test)
    print('ETC')
    model = ExtraTreeClassifier()
    validate_model(model, X, y, features_train, labels_train, features_test, labels_test)
    print('K Neighbors')
    model = KNeighborsClassifier()
    validate_model(model, X, y, features_train, labels_train, features_test, labels_test)
def test_grid_search():
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
    grid = {
        'base_estimator': [
            DecisionTreeClassifier(max_depth=3),
            DecisionTreeClassifier(max_depth=4),
            ExtraTreeClassifier(max_depth=4)
        ],
        'learning_rate': [0.01, 0.1, 0.5, 1.],
        'n_estimators': [5, 10, 15, 20, 30, 40, 50, 75, 100, 125],
        'algorithm': ['SAMME', 'SAMME.R']
    }
    grid = OrderedDict(grid)
    trainX, trainY = generate_sample(2000, 10, distance=0.5)
    grid_cv = GridOptimalSearchCV(AdaBoostClassifier(), grid,
                                  n_evaluations=10, refit=True, log_name='test')
    grid_cv.fit(trainX, trainY)
    grid_cv.predict_proba(trainX)
    grid_cv.predict(trainX)
    grid_cv.print_param_stats([0.1, 0.3, 0.5, 0.7])
def setUpClass(cls):
    master_seed(seed=1234)
    super().setUpClass()
    cls.sklearn_model = ExtraTreeClassifier()
    cls.classifier = ScikitlearnExtraTreeClassifier(model=cls.sklearn_model)
    cls.classifier.fit(x=cls.x_train_iris, y=cls.y_train_iris)
def variables_relevantes_arbol(X, Y, alpha=None):
    if len(X) == 0:
        logger.info("No variable information was provided")
        return [], []
    features = list(X.columns)
    if alpha is None:
        alpha = 1.0 / len(features)
        logger.info(
            'Computed the minimum acceptance threshold for importance: {0}'
            .format(alpha))
    importance = []
    try:
        model = ExtraTreeClassifier()
        model.fit(X, Y)
        importance = model.feature_importances_
        relevant_features = []
        for i in range(len(features)):
            if importance[i] > alpha:
                relevant_features.append(features[i])
    except Exception as e:
        logger.info(
            'Error with the tree method; no relevant variables were determined: {0}'
            .format(e))
        relevant_features = []
    return importance, relevant_features
def __init__(self, n_estimators=10, criterion="gini", max_depth=None,
             min_samples_split=2, min_samples_leaf=1, max_features="auto",
             max_leaf_nodes=None, bootstrap=False, oob_score=False,
             n_jobs=1, random_state=None, verbose=0, warm_start=False):
    super(ExtraTreesClassifier, self).__init__(
        base_estimator=ExtraTreeClassifier(),
        n_estimators=n_estimators,
        estimator_params=("criterion", "max_depth", "min_samples_split",
                          "min_samples_leaf", "max_features",
                          "max_leaf_nodes", "random_state"),
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start)
    self.criterion = criterion
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.min_samples_leaf = min_samples_leaf
    self.max_features = max_features
    self.max_leaf_nodes = max_leaf_nodes
def do_Extra_Tree(data, target):
    reviews_classifier_Extra_Tree = Pipeline([('tfidf', TfidfVectorizer()),
                                              ('clf', ExtraTreeClassifier())])
    parameters_Extra_Tree = {'tfidf__ngram_range': [(1, 1), (1, 2)]}
    gs_Extra_Tree = GridSearchCV(reviews_classifier_Extra_Tree,
                                 parameters_Extra_Tree, n_jobs=-1, verbose=1)
    gs_Extra_Tree = gs_Extra_Tree.fit(data, target)
    # grid_scores_ was removed in scikit-learn 0.20; read cv_results_ instead
    cv_results = gs_Extra_Tree.cv_results_
    for parameters, mean_score in zip(cv_results['params'],
                                      cv_results['mean_test_score']):
        print(parameters)
        print(mean_score)
        print('=======================')
    result_file = open('result.txt', 'a')
    print("Best score: %f" % gs_Extra_Tree.best_score_, file=result_file)
    print("Best parameters: %r" % gs_Extra_Tree.best_params_, file=result_file)
    localtime = time.asctime(time.localtime(time.time()))
    print(localtime, file=result_file)
    result_file.close()
    return (gs_Extra_Tree.best_score_, gs_Extra_Tree.best_params_)
def ExtraTreeClassifier(n_jobs, class_weight):
    # Note: this factory shadows the sklearn class name; the local import
    # below rebinds ExtraTreeClassifier to the real estimator inside the
    # function body. `n_jobs` is accepted for interface symmetry but unused,
    # since a single tree is not parallelised; `random_state` is assumed to
    # be a module-level constant.
    from sklearn.tree import ExtraTreeClassifier
    clf = ExtraTreeClassifier(criterion='gini',
                              splitter='random',
                              max_depth=None,
                              min_samples_split=2,
                              min_samples_leaf=1,
                              min_weight_fraction_leaf=0.0,
                              max_features='auto',
                              random_state=random_state,
                              max_leaf_nodes=None,
                              min_impurity_decrease=0.0,
                              class_weight=class_weight)
    return clf
def do_Adaboost(data, target):
    reviews_classifier_AdaBoost = Pipeline([('tfidf', TfidfVectorizer()),
                                            ('clf', AdaBoostClassifier())])
    parameters_AdaBoost = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'clf__base_estimator': (ExtraTreeClassifier(), ),
        'clf__algorithm': ('SAMME.R', )
    }
    gs_AdaBoost = GridSearchCV(reviews_classifier_AdaBoost, parameters_AdaBoost,
                               n_jobs=-1, verbose=1)
    gs_AdaBoost = gs_AdaBoost.fit(data, target)
    # grid_scores_ was removed in scikit-learn 0.20; read cv_results_ instead
    cv_results = gs_AdaBoost.cv_results_
    for parameters, mean_score in zip(cv_results['params'],
                                      cv_results['mean_test_score']):
        print(parameters)
        print(mean_score)
        print('=======================')
    result_file = open('result.txt', 'a')
    print("Best score: %f" % gs_AdaBoost.best_score_, file=result_file)
    print("Best parameters: %r" % gs_AdaBoost.best_params_, file=result_file)
    localtime = time.asctime(time.localtime(time.time()))
    print(localtime, file=result_file)
    result_file.close()
    return (gs_AdaBoost.best_score_, gs_AdaBoost.best_params_)
def variables_relevantes_arbol(X, Y, alpha=None):
    if len(X) == 0:
        logger.info("No information was passed")
        return [], []
    features = list(X.columns)
    if alpha is None:
        alpha = 1.0 / len(features)
        logger.info(
            'Acceptance threshold for variable importance calculated: {0}'
            .format(alpha))
    importance = []
    try:
        model = ExtraTreeClassifier()
        model.fit(X, Y)
        importance = model.feature_importances_
        relevant_features = []
        for i in range(len(features)):
            if importance[i] > alpha:
                relevant_features.append(features[i])
    except Exception as e:
        logger.info(
            'Error with the tree-based model; no relevant variables were found: {0}'
            .format(e))
        relevant_features = []
    return importance, relevant_features
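# Usage sketch (an addition): reduce a DataFrame to the columns that passed
# the importance screen, falling back to all columns if none qualified.
def keep_relevant_columns(X, Y, alpha=None):
    importance, relevant = variables_relevantes_arbol(X, Y, alpha)
    return X[relevant] if relevant else X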
def wrapper(tr_data, tr_ans, ts_data, ts_ans):
    classifiers = [
        SVC(),
        RandomForestClassifier(),
        LogisticRegression(),
        Perceptron(),
        ExtraTreeClassifier(),
        KNeighborsClassifier(),
        DecisionTreeClassifier()
    ]
    split_tr_data = split_features(tr_data)
    split_ts_data = split_features(ts_data)
    # Track the best (score, classifier, feature subset) seen so far
    result_score = 0
    result_clf = None
    result_tr_data = np.zeros(tr_data.shape)
    result_ts_data = np.zeros(ts_data.shape)
    for tr, ts in zip(split_tr_data, split_ts_data):
        for s_tr, s_ts in zip(subset(tr), subset(ts)):
            for clf in classifiers:
                clf.fit(s_tr, tr_ans)
                pred_y = clf.predict(s_ts)
                temp_score = accuracy_score(ts_ans, pred_y)
                if result_score < temp_score:
                    result_score = temp_score
                    result_clf = clf
                    result_tr_data = s_tr
                    result_ts_data = s_ts
    print(result_score, result_clf)
    return result_tr_data, result_ts_data