import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import (GradientBoostingClassifier,
                              RandomForestClassifier, VotingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Split the target off the training frame
y_train = X_train['interest_level']
X_train = X_train.drop('interest_level', axis=1)

# Base models for the soft-voting ensemble
gbt = GradientBoostingClassifier(learning_rate=0.005, n_estimators=args.n,
                                 max_depth=4, random_state=2018)
rf = RandomForestClassifier(n_estimators=1000, criterion='gini',
                            n_jobs=-1, random_state=2018)
lor = LogisticRegression(solver='newton-cg', multi_class='multinomial',
                         max_iter=1000)

# Soft voting averages predict_proba outputs, here weighted 3:2:1
clf = VotingClassifier([('gbt', gbt), ('rf', rf), ('lor', lor)],
                       voting='soft', weights=[3, 2, 1], n_jobs=-1)

if args.s:
    clf.fit(X_train, y_train)
    joblib.dump(clf, 'checkpoint/voting.pkl')
    X_test = pd.read_csv("data/test_python.csv", encoding='utf-8')
    pred = clf.predict_proba(X_test)
    # Reorder the probability columns to match the submission header
    np.savetxt('submission/submission.csv',
               np.c_[X_test['listing_id'], pred[:, [2, 1, 0]]],
               delimiter=',', header='listing_id,high,medium,low',
               fmt='%d,%.16f,%.16f,%.16f', comments='')
else:
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=2018)
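# Illustrative sketch (not part of the original script): with voting='soft',
# VotingClassifier takes a weighted average of the base models' predict_proba
# outputs and returns the argmax, equivalent to:
import numpy as np

def soft_vote(fitted_models, weights, X):
    probas = np.array([m.predict_proba(X) for m in fitted_models])  # (n_models, n_samples, n_classes)
    avg = np.average(probas, axis=0, weights=weights)               # weighted mean over models
    return avg.argmax(axis=1)                                       # winning class index per sample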
build_audit(
    BaggingClassifier(LogisticRegression(), random_state=13, n_estimators=3,
                      max_features=0.5), "LogisticRegressionEnsembleAudit")
build_audit(GaussianNB(), "NaiveBayesAudit")
build_audit(RandomForestClassifier(random_state=13, min_samples_leaf=5),
            "RandomForestAudit")
build_audit(RidgeClassifierCV(), "RidgeAudit", with_proba=False)
build_audit(
    BaggingClassifier(RidgeClassifier(random_state=13), random_state=13,
                      n_estimators=3, max_features=0.5), "RidgeEnsembleAudit")
build_audit(
    VotingClassifier([("dt", DecisionTreeClassifier(random_state=13)),
                      ("nb", GaussianNB()),
                      ("lr", LogisticRegression())],
                     voting="soft", weights=[3, 1, 2]), "VotingEnsembleAudit")
build_audit(XGBClassifier(objective="binary:logistic"), "XGBAudit")

versicolor_df = load_csv("Versicolor.csv")
print(versicolor_df.dtypes)

versicolor_columns = versicolor_df.columns.tolist()
# Robust-scale every feature column; pass the label column through unchanged
versicolor_mapper = DataFrameMapper([
    (versicolor_columns[:-1], [ContinuousDomain(), RobustScaler()]),
    (versicolor_columns[-1], None)
])
versicolor = versicolor_mapper.fit_transform(versicolor_df)
from numpy import unique
from sklearn import svm
from sklearn.ensemble import (GradientBoostingClassifier,
                              RandomForestClassifier, VotingClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid


class VotingEnsemble(BaseClassifier):  # BaseClassifier is project-local

    def __init__(self, feature_length, num_classes, x=10):
        super().__init__(feature_length, num_classes)
        self.model = VotingClassifier(estimators=[
            ('gba', GradientBoostingClassifier(n_estimators=100,
                                               learning_rate=1.0,
                                               max_depth=1, random_state=0)),
            ('knn', KNeighborsClassifier(metric='manhattan',
                                         weights='distance', n_neighbors=3)),
            ('Nc', NearestCentroid(metric='manhattan')),
            ('nvb', GaussianNB()),
            ('rf', RandomForestClassifier(n_estimators=10, criterion='entropy')),
            ('svmlin', svm.SVC(kernel='linear')),
            ('svmpol', svm.SVC(kernel='poly')),
            ('svmrbf', svm.SVC(kernel='rbf'))
        ], voting='hard')
        self.num_classes = num_classes

    def train(self, features, labels):
        """
        Using a set of features and labels, trains the classifier and returns
        the training accuracy.
        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to train to predict
        :return: Prediction accuracy, as a float between 0 and 1
        """
        labels = self.labels_to_categorical(labels)
        self.model.fit(features, labels)
        accuracy = self.model.score(features, labels)
        return accuracy

    # make sure you save the model using the same library as used in the
    # machine-learning price-predictor

    def predict(self, features, labels):
        """
        Using a set of features and labels, predicts the labels from the
        features and returns the accuracy of predicted vs. actual labels.
        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to test prediction accuracy on
        :return: Prediction accuracy, as a float between 0 and 1
        """
        label_train = self.labels_to_categorical(labels)
        accuracy = self.model.score(features, label_train)
        return accuracy

    def get_prediction(self, features):
        return self.model.predict(features)

    def reset(self):
        """
        Resets the trained weights / parameters to initial state
        """
        pass

    def labels_to_categorical(self, labels):
        # Map arbitrary label values to consecutive integer IDs
        _, IDs = unique(labels, return_inverse=True)
        return IDs
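# Hedged usage sketch (illustrative data; assumes BaseClassifier's constructor
# simply records its arguments): train on a feature matrix, then score and
# predict. The demo reuses the training data for brevity.
import numpy as np

X_demo = np.random.rand(120, 8)
y_demo = np.random.choice(['low', 'mid', 'high'], size=120)

ens = VotingEnsemble(feature_length=8, num_classes=3)
print('train accuracy:', ens.train(X_demo, y_demo))
print('eval accuracy:', ens.predict(X_demo, y_demo))
print('raw predictions:', ens.get_prediction(X_demo[:5]))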
# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
# model2 = DecisionTreeClassifier()
# estimators.append(('cart', model2))
# model3 = SVC()
# estimators.append(('svm', model3))
# model4 = tree.DecisionTreeClassifier()
# estimators.append(('cart2', model4))   # estimator names must be unique
# model5 = RandomForestClassifier(n_jobs=40)
# estimators.append(('rf', model5))
# model6 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(1, 1), random_state=2)
# estimators.append(('mlp', model6))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, training, trainingTag, cv=kfold)
print(results.mean())
print(results)

# ----- logistic regression model -----
lg = LogisticRegression()
lg = lg.fit(training, trainingTag)
predictedValues0 = lg.predict(test)

# ----- decision tree model -----
# DecisionTree = tree.DecisionTreeClassifier()
# DecisionTree.fit(training, trainingTag)
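# Hedged side-by-side sketch (not in the original script): scoring each base
# model against the ensemble on the same CV split makes the voting gain, or
# lack of it, explicit. `training`, `trainingTag`, and `kfold` are assumed to
# be defined as above.
for name, model in estimators + [('ensemble', ensemble)]:
    scores = model_selection.cross_val_score(model, training, trainingTag, cv=kfold)
    print('%s: %.3f (+/- %.3f)' % (name, scores.mean(), scores.std() * 2))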
    # tail of a fixture that injects ~10% random missingness; cast the 0/1
    # draw to bool so it masks elements instead of fancy-indexing rows
    missing = np.random.binomial(1, .1, size=X.shape).astype(bool)
    X[missing] = np.nan
    X = DataFrame(X, columns=['x%d' % i for i in range(n)])
    return (dict(X=X, y=y), dict(X=X), dict(X=X))


def create_boston_housing():
    X, y = load_boston(return_X_y=True)
    X = DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])
    return (dict(X=X, y=y), dict(X=X), dict(X=X))


test_cases = [
    (VotingClassifier([('logistic', LogisticRegression()),
                       ('earth', Pipeline([('earth', Earth()),
                                           ('logistic', LogisticRegression())]))],
                      voting='hard', weights=[1.01, 1.01]),
     ['predict'], create_weird_classification_problem_1()),
    (GradientBoostingClassifier(max_depth=10, n_estimators=10),
     ['predict_proba', 'predict'], create_weird_classification_problem_1()),
    (LogisticRegression(), ['predict_proba', 'predict'],
     create_weird_classification_problem_1()),
    (IsotonicRegression(out_of_bounds='clip'), ['predict'],
     create_isotonic_regression_problem_1()),
    (Earth(), ['predict', 'transform'], create_regression_problem_1()),
    (Earth(allow_missing=True), ['predict', 'transform'],
     create_regression_problem_with_missingness_1()),
    (ElasticNet(), ['predict'], create_regression_problem_1()),
    (ElasticNetCV(), ['predict'], create_regression_problem_1()),
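# Note on an environment assumption: load_boston was removed in scikit-learn
# 1.2. On current versions an equivalent fixture can be built on the
# California housing data instead, e.g.:
from pandas import DataFrame
from sklearn.datasets import fetch_california_housing

def create_california_housing():
    X, y = fetch_california_housing(return_X_y=True)
    X = DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])
    return (dict(X=X, y=y), dict(X=X), dict(X=X))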
# Assumed reconstruction (the opening of this call was truncated): wrapping
# the Perceptron in CalibratedClassifierCV gives it the predict_proba that
# the dynamic-selection techniques require
model_perceptron = CalibratedClassifierCV(
    Perceptron(max_iter=100, random_state=rng)).fit(X_train, y_train)
model_svc = SVC(probability=True, gamma='auto').fit(X_train, y_train)
model_bayes = GaussianNB().fit(X_train, y_train)
model_tree = DecisionTreeClassifier(random_state=rng).fit(X_train, y_train)
model_knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

pool_classifiers = [
    model_perceptron, model_svc, model_bayes, model_tree, model_knn
]
voting_classifiers = [("perceptron", model_perceptron), ("svc", model_svc),
                      ("bayes", model_bayes), ("tree", model_tree),
                      ("knn", model_knn)]
model_voting = VotingClassifier(estimators=voting_classifiers).fit(
    X_train, y_train)

# Initializing the DS techniques
knorau = KNORAU(pool_classifiers)
kne = KNORAE(pool_classifiers)
desp = DESP(pool_classifiers)

# DCS techniques
ola = OLA(pool_classifiers)
mcb = MCB(pool_classifiers)

# Fitting the techniques
knorau.fit(X_dsel, y_dsel)
kne.fit(X_dsel, y_dsel)
desp.fit(X_dsel, y_dsel)
ola.fit(X_dsel, y_dsel)
mcb.fit(X_dsel, y_dsel)
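# A plausible continuation (assumed; X_test / y_test from the same split):
# score the dynamic-selection techniques against the static voting baseline.
for name, technique in [('KNORA-U', knorau), ('KNORA-E', kne), ('DES-P', desp),
                        ('OLA', ola), ('MCB', mcb), ('Voting', model_voting)]:
    print('%s accuracy: %.3f' % (name, technique.score(X_test, y_test)))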
split3 = splits_cols[sz * 2:sz * 3]
split4 = splits_cols[sz * 3:sz * 4]
split5 = splits_cols[sz * 4:]

# create a pipeline meta-classifier (sklearn) and use a LinearSVC at the end
# as the classifier
pipe1 = make_pipeline(ColumnSelector(cols=split1), LinearSVC())
pipe2 = make_pipeline(ColumnSelector(cols=split2), LinearSVC())
pipe3 = make_pipeline(ColumnSelector(cols=split3), LinearSVC())
pipe4 = make_pipeline(ColumnSelector(cols=split4), LinearSVC())
pipe5 = make_pipeline(ColumnSelector(cols=split5), LinearSVC())

# create the ensemble with the VotingClassifier
cls = VotingClassifier([
    ('l1', pipe1),
    ('l2', pipe2),
    ('l3', pipe3),
    ('l4', pipe4),
    ('l5', pipe5),
], n_jobs=4)
cls.fit(cars_train_X, cars_train_y)

# uncomment the 3 lines below if needed to see the accuracy and std-dev of
# the training set
# scores = cross_val_score(cls, cars_train_X, cars_train_y, cv=5, verbose=True)
# print(scores)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# this reaches about 30% acc

# create the predictions and dump to a file for plotting the heatmap
y_pred = cls.predict(cars_test_X)
with open('5subset_linearsvm_voting.sav', 'wb') as f:
    pickle.dump(y_pred, f)  # assumed completion (requires `import pickle`); the original snippet ends at the `with` line
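# Hedged usage sketch: reading the dumped predictions back, e.g. for the
# heatmap step mentioned above; `cars_test_y` is assumed to be the matching
# test labels. The confusion matrix gives the per-class view.
import pickle
from sklearn.metrics import confusion_matrix

with open('5subset_linearsvm_voting.sav', 'rb') as f:
    y_pred_loaded = pickle.load(f)
print(confusion_matrix(cars_test_y, y_pred_loaded))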
def _collect_probas(self, x):
    # Cache the stacked per-estimator probabilities and reuse them while the
    # input is unchanged; np.array_equal replaces the original list-based
    # check, which raises an ambiguous-truth-value error on 2-D inputs
    if (self._proba_cache is None) or (self._x_cache is None) or \
            (not np.array_equal(self._x_cache, x)):
        self._proba_cache = VotingClassifier._collect_probas(self, x)
        self._x_cache = x
    return self._proba_cache
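# Hedged usage sketch (class and estimator names are illustrative): the
# override above assumes self._proba_cache and self._x_cache exist, so a
# subclass can default them at class level. With the cache in place, calling
# predict_proba and then predict on the same input evaluates each base
# estimator only once, since both paths go through _collect_probas when
# voting='soft'.
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

class CachedVotingClassifier(VotingClassifier):
    _proba_cache = None  # class-level defaults so the first check passes
    _x_cache = None

    def _collect_probas(self, x):
        if (self._proba_cache is None) or (self._x_cache is None) or \
                (not np.array_equal(self._x_cache, x)):
            self._proba_cache = VotingClassifier._collect_probas(self, x)
            self._x_cache = x
        return self._proba_cache

clf = CachedVotingClassifier(
    estimators=[('lr', LogisticRegression()), ('nb', GaussianNB())],
    voting='soft')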