def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators does not",
                         clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
def query_by_bagging(X, y, current_model, batch_size, rng,
                     base_model=SVC(C=1, kernel='linear'), n_bags=5,
                     method="KL", D=None):
    """
    :param base_model: model that will be **fitted every iteration**
    :param n_bags: number of bags; one model is trained per bag
    :param method: 'entropy' or 'KL'
    :return: indices of the queried examples and their normalized fitness scores
    """
    assert method == 'entropy' or method == 'KL'
    eps = 0.0000001
    if method == 'KL':
        assert hasattr(base_model, 'predict_proba'), \
            "Model with probability prediction needs to be passed to this strategy!"
    clfs = BaggingClassifier(base_model, n_estimators=n_bags, random_state=rng)
    clfs.fit(X[y.known], y[y.known])
    pc = clfs.predict_proba(X[np.invert(y.known)])
    # Settles, page 17
    if method == 'entropy':
        pc += eps
        fitness = np.sum(pc * np.log(pc), axis=1)
        ids = np.argsort(fitness)[:batch_size]
    elif method == 'KL':
        p = np.array([clf.predict_proba(X[np.invert(y.known)])
                      for clf in clfs.estimators_])
        fitness = np.mean(np.sum(p * np.log(p / pc), axis=2), axis=0)
        ids = np.argsort(fitness)[-batch_size:]
    return y.unknown_ids[ids], fitness / np.max(fitness)
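# A minimal usage sketch for query_by_bagging above. The label container used
# here (MaskedLabels) is hypothetical: the real project presumably provides its
# own object exposing a boolean `known` mask and matching `unknown_ids`, and the
# module defining query_by_bagging is assumed to import numpy as np and
# BaggingClassifier itself.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression


class MaskedLabels:
    """Tiny stand-in for a partially labelled target vector (hypothetical)."""

    def __init__(self, values, known):
        self.values = np.asarray(values)
        self.known = np.asarray(known, dtype=bool)
        self.unknown_ids = np.where(~self.known)[0]

    def __getitem__(self, idx):
        # lets query_by_bagging do y[y.known]
        return self.values[idx]


X_demo, labels = make_classification(n_samples=60, random_state=0)
known = np.zeros(60, dtype=bool)
known[:20] = True  # pretend only the first 20 labels are known
y_demo = MaskedLabels(labels, known)

picked, scores = query_by_bagging(X_demo, y_demo, current_model=None,
                                  batch_size=5, rng=np.random.RandomState(0),
                                  base_model=LogisticRegression(),
                                  n_bags=5, method='KL')
print(picked)  # indices (into the full dataset) of the 5 most disagreed-upon points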
class BaggingSK(PoolGenerator):
    '''
    This class should not be used, use brew.generation.bagging.Bagging instead.
    '''

    def __init__(self, base_classifier=None, n_classifiers=100,
                 combination_rule='majority_vote'):
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                            n_estimators=n_classifiers,
                                            max_samples=1.0,
                                            max_features=1.0)
        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.sk_bagging.fit(X, y)
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)
        # self.classes_ = set(y)

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
def baggedDecisionTree(X_train, y_train, X_test, y_test, nEstimators):

    print("\n### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###")
    print("baggedDecisionTree()\n")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myBaggedDecisionTree = BaggingClassifier(
        base_estimator = DecisionTreeClassifier(),
        n_estimators   = nEstimators,
        # max_samples  = X_train.shape[0],
        bootstrap      = True,
        oob_score      = True,
        n_jobs         = -1  # use all available cores
        )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myBaggedDecisionTree.fit(X_train, y_train)
    y_pred = myBaggedDecisionTree.predict(X_test)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    print("nEstimators:      " + str(nEstimators))
    print("out-of-bag score: " + str(myBaggedDecisionTree.oob_score_))
    print("accuracy score:   " + str(accuracy_score(y_test, y_pred)))
    print("out-of-bag decision function:")
    print(str(myBaggedDecisionTree.oob_decision_function_))

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return(None)
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert_true(isinstance(estimator[0].steps[-1][1].random_state, int))
def bagging(X_train, X_test, y_train, y_test, n_est):
    n_est = 51
    estimators = range(1, n_est)
    decision_clf = DecisionTreeClassifier()
    scores1, scores2 = [], []  # test / train accuracy for each ensemble size
    for est in estimators:
        bagging_clf = BaggingClassifier(decision_clf, n_estimators=est,
                                        max_samples=0.67, max_features=0.67,
                                        bootstrap=True, random_state=9)
        bagging_clf.fit(X_train, y_train)
        # test line
        y_pred_bagging1 = bagging_clf.predict(X_test)
        score_bc_dt1 = accuracy_score(y_test, y_pred_bagging1)
        scores1.append(score_bc_dt1)
        # train line
        y_pred_bagging2 = bagging_clf.predict(X_train)
        score_bc_dt2 = accuracy_score(y_train, y_pred_bagging2)
        scores2.append(score_bc_dt2)

    plt.figure(figsize=(10, 6))
    plt.title('Bagging Info')
    plt.xlabel('Estimators')
    plt.ylabel('Scores')
    plt.plot(estimators, scores1, 'g', label='test line', linewidth=3)
    plt.plot(estimators, scores2, 'c', label='train line', linewidth=3)
    plt.legend()
    plt.show()
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BaggingClassifier(base_estimator=base_estimator,
                                n_estimators=100,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert_less(abs(test_score - clf.oob_score_), 0.1)

        # Test with few estimators
        assert_warns(UserWarning,
                     BaggingClassifier(base_estimator=base_estimator,
                                       n_estimators=1,
                                       bootstrap=True,
                                       oob_score=True,
                                       random_state=rng).fit,
                     X_train, y_train)
def test_bagging_sample_weight_unsupported_but_passed():
    estimator = BaggingClassifier(DummyZeroEstimator())
    rng = check_random_state(0)

    estimator.fit(iris.data, iris.target).predict(iris.data)
    assert_raises(ValueError, estimator.fit, iris.data, iris.target,
                  sample_weight=rng.randint(10, size=(iris.data.shape[0])))
def test_warm_start_smaller_n_estimators():
    # Test if warm start'ed second fit with smaller n_estimators raises error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
                                max_features=0.5, random_state=1,
                                bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert_equal(len(estimators_samples), len(estimators))
    assert_equal(len(estimators_samples[0]), len(X) // 2)
    assert_equal(estimators_samples[0].dtype.kind, 'i')

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.coef_

    assert_array_almost_equal(orig_coefs, new_coefs)
class ADABoost(Base):

    def train(self, data=None, plugin=None):
        """ With dataframe train mllib """
        super(ADABoost, self).train(data, plugin)

        # cl = svm.SVC(gamma=0.001, C=100, kernel='linear', probability=True)
        X = self.X_train.iloc[:, :-1]
        Y = self.X_train.iloc[:, -1]

        self.scaler = StandardScaler().fit(X)
        X = self.scaler.transform(X)

        cl = SGDClassifier(loss='hinge')
        p = Pipeline([("Scaler", self.scaler), ("svm", cl)])

        self.clf = BaggingClassifier(p, n_estimators=50)
        # self.clf = AdaBoostClassifier(p, n_estimators=10)
        # self.clf = AdaBoostClassifier(SGDClassifier(loss='hinge'), algorithm='SAMME', n_estimators=10)
        self.clf.fit(X, Y)

    def predict(self, file, plugin=None):
        super(ADABoost, self).predict(file, plugin)

        data = file.vector
        X = data[plugin]
        X = self.scaler.transform(X)
        guess = self.clf.predict(X)
        return self.getTag(guess)
def test_estimators_samples_deterministic():
    # This test is a regression test to check that with a random step
    # (e.g. SparseRandomProjection) and a given random state, the results
    # generated at fit time can be identically reproduced at a later time using
    # data saved in object attributes. Check issue #9524 for full discussion.

    iris = load_iris()
    X, y = iris.data, iris.target

    base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
                                  LogisticRegression())
    clf = BaggingClassifier(base_estimator=base_pipeline,
                            max_samples=0.5,
                            random_state=0)
    clf.fit(X, y)
    pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()

    estimator = clf.estimators_[0]
    estimator_sample = clf.estimators_samples_[0]
    estimator_feature = clf.estimators_features_[0]

    X_train = (X[estimator_sample])[:, estimator_feature]
    y_train = y[estimator_sample]

    estimator.fit(X_train, y_train)
    assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
def train_classifiers(data):
    train_vars = [
        'X', 'Y', 'Darkness', 'Moon', 'Hour', 'DayOfWeekInt', 'Day', 'Month',
        'Year', 'PdDistrictInt', 'TemperatureC', 'Precipitationmm',
        'InPdDistrict', 'Conditions', 'AddressCode',
    ]
    weather_mapping = {
        'Light Drizzle': 1,
        'Drizzle': 2,
        'Light Rain': 3,
        'Rain': 4,
        'Heavy Rain': 5,
        'Thunderstorm': 6,
    }
    data.Precipitationmm = data.Precipitationmm.fillna(-1)
    data.Conditions = data.Conditions.map(weather_mapping).fillna(0)

    train, test = split(data)
    X_train = train[train_vars]
    y_train = train.CategoryInt
    X_test = test[train_vars]
    y_test = test.CategoryInt

    bdt_real_2 = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=8),
        n_estimators=10,
        learning_rate=1
    )

    #bdt_real = DecisionTreeClassifier(max_depth=None, min_samples_split=1,
    #                                  random_state=6065)
    bdt_real = BaggingClassifier(base_estimator=bdt_real_2,
                                 random_state=6065,
                                 n_estimators=100)
    #bdt_real = RandomForestClassifier(random_state=6065,
    #                                  n_estimators=200)
    #bdt_real = ExtraTreesClassifier(random_state=6065,
    #                                min_samples_split=5,
    #                                n_estimators=200)

    bdt_real.fit(X_train, y_train)
    y_predict = pandas.Series(bdt_real.predict(X_test))

    print len(y_predict[y_predict == y_test])
    print len(y_predict)

    return bdt_real
def create_estimators(self, X_train, y_train, X_test):
    for model in self.models:
        param_grid = self.create_parameter_grid(model)
        for parameters in param_grid:
            clf = BaggingClassifier(base_estimator=model.set_params(**parameters),
                                    n_estimators=self.estimators,
                                    max_samples=0.95, n_jobs=3)
            clf.fit(X_train, y_train)
            prediction = clf.predict_proba(X_test)[:, 1]
            self.predictions.append(prediction)
def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                max_features=0.5, oob_score=True,
                                random_state=1)
    assert_equal(bagging.fit(X, y).oob_score_, bagging.fit(X, y).oob_score_)
def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when valid integer max_samples supplied by user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(),
                                max_samples=max_samples,
                                max_features=0.5, random_state=1)
    bagging.fit(X, y)
    assert_equal(bagging._max_samples, max_samples)
def classification(self, x_train, y_train):
    ml = BaggingClassifier(DecisionTreeClassifier())
    ml.fit(x_train, y_train)
    # print y_train[0]
    # print x_train[0]
    y_pred = ml.predict(x_train)
    print 'y_train ', y_train
    print 'y_pred ', y_pred.tolist()
def test_bagging_small_max_features():
    # Check that Bagging estimator can accept low fractional max_features
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 0])

    bagging = BaggingClassifier(LogisticRegression(),
                                max_features=0.3, random_state=1)
    bagging.fit(X, y)
def test_sparse_classification():
    # Check classification for various parameter settings on sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set"""

        def fit(self, X, y):
            super(CustomSVC, self).fit(X, y)
            self.data_type_ = type(X)
            return self

    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    parameter_sets = [
        {"max_samples": 0.5,
         "max_features": 2,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_samples": 1.0,
         "max_features": 4,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_features": 2,
         "bootstrap": False,
         "bootstrap_features": True},
        {"max_samples": 0.5,
         "bootstrap": True,
         "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:
            # Trained on sparse format
            sparse_classifier = BaggingClassifier(
                base_estimator=CustomSVC(),
                random_state=1,
                **params
            ).fit(X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingClassifier(
                base_estimator=CustomSVC(),
                random_state=1,
                **params
            ).fit(X_train, y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
def train_and_test(X_train, X_test, y_train, y_test):
    forest = BaggingClassifier(n_estimators=500, random_state=1234)
    forest = forest.fit(X_train, y_train)
    proba = forest.predict_proba(X_test)
    proba = proba[:, 1]
    y_test = np.array(y_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, proba, pos_label=1)
    loss = metrics.auc(fpr, tpr)
    print loss
    return loss
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        classifier
    )
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
class BaggingClassifier(BaseEstimator):

    def __init__(self, base_estimator=None, bag_kwargs=None):
        klass = dynamic_load(base_estimator['class'])
        svc = klass(**base_estimator['params'])
        self.__clf = SK_BaggingClassifier(base_estimator=svc, **bag_kwargs)

    def fit(self, X, y):
        return self.__clf.fit(X, y)

    def predict_proba(self, X):
        return self.__clf.predict_proba(X)
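# A hypothetical configuration sketch for the wrapper above. It assumes that
# dynamic_load() resolves a dotted import path such as 'sklearn.svm.SVC' to a
# class; the 'class' and 'params' keys mirror the constructor shown above, and
# bag_kwargs is forwarded verbatim to sklearn's BaggingClassifier.
base_estimator_cfg = {
    'class': 'sklearn.svm.SVC',
    'params': {'C': 1.0, 'kernel': 'linear', 'probability': True},
}
bag_kwargs = {'n_estimators': 10, 'max_samples': 0.8}

wrapper = BaggingClassifier(base_estimator=base_estimator_cfg,
                            bag_kwargs=bag_kwargs)
# wrapper.fit(X, y); wrapper.predict_proba(X) delegate to the wrapped ensemble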
def predict_with_best_model(estimator, xtrain, ytrain, xtest):
    from sklearn.ensemble import BaggingClassifier
    model = BaggingClassifier(base_estimator=estimator, n_estimators=10,
                              max_samples=0.9, max_features=0.9, n_jobs=1,
                              bootstrap=False, bootstrap_features=False,
                              oob_score=False)
    model = model.fit(xtrain, ytrain)
    y = model.predict_proba(xtest)
    # print("Bagging score with oob estimates: ")
    # print model.oob_score_
    print ("Model used: ")
    print model.base_estimator_
    return y
class BaggingLearner(AbstractLearner):

    def __init__(self):
        self.learner = BaggingClassifier(KNeighborsClassifier())

    def _train(self, x_train, y_train):
        self.learner = self.learner.fit(x_train, y_train)

    def _predict(self, x):
        return self.learner.predict(x)

    def _predict_proba(self, x):
        return self.learner.predict_proba(x)
def phenotype_imputation(data, config):
    '''
    Function to impute the labels on II based on the classifier learned on I.

    Parameters
    ----------
    data : an object of class Dataset that contains: genotypes, covariates,
           labels and information about random folds
    config : an object of class ConfigState. It contains the user-entered
             parameters in a YAML format. See the config_file parameter in
             the main script for more details.
    '''
    # Parameters for this task
    num_folds = data.num_folds
    task_name = "phenotype_imputation"
    n_estimators = config.get_entry(task_name, "n_estimators")
    romans_trn = config.get_entry(task_name, "romans_used_for_learning")
    romans_tst = config.get_entry(task_name, "romans_used_for_imputing")

    # Iterate through the folds:
    i = 0
    size_of_two = find_vec_entries_that_contain(data.folds[:, 0],
                                                romans_tst).shape[0]
    soft_labels = np.zeros((size_of_two, num_folds))
    X_scaled = preprocessing.scale(data.clin_covariate.transpose()).transpose()
    fpr = dict()
    tpr = dict()
    thres = dict()
    roc_auc = np.zeros(num_folds)
    for fold in data.folds.transpose():
        logging.info("Fold=%d" % (i + 1))
        sel_trn = find_vec_entries_that_contain(fold, [romans_trn])
        sel_tst = find_vec_entries_that_contain(fold, [romans_tst])

        model = BaggingClassifier(
            base_estimator=linear_model.LogisticRegression(),
            n_estimators=n_estimators, max_samples=0.632,
            # for small set I: n_estimators=n_estimators, max_samples=0.8,
            max_features=5, bootstrap=True, bootstrap_features=True,
            oob_score=False,
            # for small set I: bootstrap=False, bootstrap_features=True, oob_score=False,
            n_jobs=1, random_state=None, verbose=0)

        model.fit(X_scaled[:, sel_trn].transpose(),
                  data.labels[:, sel_trn].transpose())
        soft_labels[:, i] = model.predict_proba(
            X_scaled[:, sel_tst].transpose())[:, 1]
        fpr[i], tpr[i], thres[i] = metrics.roc_curve(data.labels[0, sel_tst],
                                                     soft_labels[:, i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])
        i += 1

    # Save the output of this task
    config.save_variable(task_name, "%f", soft_labels=soft_labels,
                         roc_auc=roc_auc)
class BaggingDecisionTrees(object):

    def __init__(self, n_estimators):
        self.classifier = BaggingClassifier(n_estimators=n_estimators)

    def fit(self, xs, ys):
        xs = xs.values
        ys = ys['y']
        self.classifier.fit(xs, ys)

    def predict(self, xs):
        xs = xs.values
        ys = self.classifier.predict(xs)
        return ys
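# A minimal usage sketch for BaggingDecisionTrees above. The column name 'y'
# is the one the wrapper's fit() expects; the iris data used here is only a
# stand-in, and sklearn's BaggingClassifier is assumed to be imported in the
# wrapper's module.
import pandas as pd
from sklearn.datasets import load_iris

iris_data = load_iris()
xs = pd.DataFrame(iris_data.data)          # feature DataFrame (values are used)
ys = pd.DataFrame({'y': iris_data.target})  # label DataFrame with a 'y' column

model = BaggingDecisionTrees(n_estimators=25)
model.fit(xs, ys)
print(model.predict(xs)[:5])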
def test_probability():
    # Predict probabilities.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                     random_state=rng).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BaggingClassifier(base_estimator=LogisticRegression(),
                                     random_state=rng,
                                     max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
def main(): # The competition datafiles are in the directory /input # Read output csv format in case the file does not exists submit = pd.read_csv('sample_submission.csv') # Training cols print ("Loading training csv.") #train_cols = ['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market', 'hotel_cluster'] train_cols = ['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country', 'hotel_cluster'] train = pd.DataFrame(columns=train_cols) train_chunk = pd.read_csv('input/train.csv', chunksize=100000) print ("Training csv loaded.") # Read each chunk to train for chunk in train_chunk: #train = pd.concat( [ train, chunk ] ) train = pd.concat( [ train, chunk[chunk['is_booking']==1][train_cols] ] ) print ("Chunk done") # Load each column #x_train = train[['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market']].values x_train = train[['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country']].values y_train = train['hotel_cluster'].values # Run RandomForest on training data print ("Training RandomForest.") rf = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=4) bclf = BaggingClassifier(rf, n_estimators=2, n_jobs=4) bclf.fit(x_train, y_train) print ("Training done.") print ("Loading testing csv.") test_chunk = pd.read_csv('input/test.csv', chunksize=100000) print ("Begin testing each chunk.") predict = np.array([]) # Read each chunk to test for i, chunk in enumerate(test_chunk): #test_X = chunk[['site_name', 'posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id', 'hotel_continent', 'hotel_country', 'hotel_market']].values test_X = chunk[['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country']].values test_X = np.nan_to_num(test_X) if i > 0: predict = np.concatenate( [predict, bclf.predict_proba(test_X)]) else: predict = bclf.predict_proba(test_X) print ("Chunk id: " + str(i)) submit['hotel_cluster'] = np.apply_along_axis(get5Best, 1, predict) submit.head() submit.to_csv('submission_random_forest.csv', index=False)
def run_bagging(training_set, train_set_labels, clsf,
                validation_set=None, validation_set_labels=None, facc=False):
    from sklearn.ensemble import BaggingClassifier
    bgc = BaggingClassifier(base_estimator=clsf, n_estimators=11,
                            max_samples=1.0, max_features=1.0,
                            bootstrap=True, bootstrap_features=False,
                            oob_score=False, warm_start=False, n_jobs=1,
                            random_state=None, verbose=0)
    # standard_train_inputs = standard_data(training_set)
    # standard_valid_inputs = standard_data(validation_set)
    fbgc = bgc.fit(training_set, train_set_labels.ravel())
    if facc:
        acc = fbgc.score(validation_set, validation_set_labels.ravel())
        print(acc)
        return acc
    else:
        return fbgc
def bagging_with_base_estimator(base_estimator, x_train, x_test, y_train,
                                y_test, rands=None):
    """
    Predict the lemons using a Bagging Classifier and a random seed both for
    the number of features, as well as for the size of the sample to train
    the data on

    ARGS:

    - x_train: :class:`pandas.DataFrame` of the x_training data
    - y_train: :class:`pandas.Series` of the y_training data
    - x_test: :class:`pandas.DataFrame` of the x_testing data
    - y_test: :class:`pandas.Series` of the y_testing data
    - rands: a :class:`tuple` of the (rs, rf) to seed the sample and features
      of the BaggingClassifier.  If `None`, then rands are generated and
      provided in the return `Series`

    RETURNS:

    :class:`pandas.Series` of the f1-scores and random seeds
    """
    # create a dictionary for the return values
    ret_d = {'train-f1': [], 'test-f1': [], 'rs': [], 'rf': []}

    # use the randoms provided if there are any, otherwise generate them
    if not rands:
        rs = numpy.random.rand()
        rf = numpy.random.rand()
        while rf < 0.1:
            rf = numpy.random.rand()
    else:
        rs, rf = rands[0], rands[1]

    # place them into the dictionary
    ret_d['rs'], ret_d['rf'] = rs, rf

    # create and run the bagging classifier
    bc = BaggingClassifier(base_estimator=base_estimator, n_estimators=300,
                           max_samples=rs, max_features=rf, n_jobs=1)
    bc.fit(x_train, y_train)
    y_hat_train = bc.predict(x_train)
    ret_d['train-f1'] = f1_score(y_train, y_hat_train)
    y_hat_test = bc.predict(x_test)
    ret_d['test-f1'] = f1_score(y_test, y_hat_test)
    return pandas.Series(ret_d)
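# A minimal usage sketch for bagging_with_base_estimator above, with a toy
# dataset standing in for the lemons data. It assumes the function's module
# already imports numpy, pandas, BaggingClassifier and f1_score; passing
# rands=(0.8, 0.5) fixes max_samples and max_features instead of drawing them.
import pandas
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_toy, y_toy = make_classification(n_samples=300, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(
    pandas.DataFrame(X_toy), pandas.Series(y_toy), random_state=0)

scores = bagging_with_base_estimator(DecisionTreeClassifier(), x_train, x_test,
                                     y_train, y_test, rands=(0.8, 0.5))
print(scores[['train-f1', 'test-f1']])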
# validate_idx = list(range(num_sample))[int(num_sample*0.8):]
x_test = data_test[fold]
x_validate = data_train[fold][validate_idx]
y_validate = label_train[validate_idx]
w_validate = weight[validate_idx]
x_train = data_train[fold][train_idx]
y_train = label_train[train_idx]
w_train = weight[train_idx]

# train
n_estimators = 50
clf = BaggingClassifier(SVC(**params),
                        max_samples=1.0 / n_estimators,
                        n_estimators=n_estimators, n_jobs=4)
# clf = SVC(**params)
clf.fit(x_train, y_train, sample_weight=w_train)

# save
with open('saved_models/SVC_{}/fold{}.pth'.format(timestr, fold), 'wb') as f:
    pickle.dump(clf, f)

# evaluate
# on train set
jigsawevaluator_train = utils.JigsawEvaluator(y_train, y_identity[train_idx])
train_pred = clf.predict_proba(x_train)
def test_run_and_upload(self): # This unit test is ment to test the following functions, using a varity of flows: # - openml.runs.run_task() # - openml.runs.OpenMLRun.publish() # - openml.runs.initialize_model() # - [implicitly] openml.setups.initialize_model() # - openml.runs.initialize_model_from_trace() task_id = 119 # diabates dataset num_test_instances = 253 # 33% holdout task num_folds = 1 # because of holdout num_iterations = 5 # for base search classifiers clfs = [] random_state_fixtures = [] lr = LogisticRegression() clfs.append(lr) random_state_fixtures.append('62501') pipeline1 = Pipeline(steps=[('scaler', StandardScaler( with_mean=False)), ('dummy', DummyClassifier(strategy='prior'))]) clfs.append(pipeline1) random_state_fixtures.append('62501') pipeline2 = Pipeline( steps=[('Imputer', Imputer( strategy='median')), ('VarianceThreshold', VarianceThreshold()), ('Estimator', RandomizedSearchCV(DecisionTreeClassifier(), { 'min_samples_split': [2**x for x in range(1, 7 + 1)], 'min_samples_leaf': [2**x for x in range(0, 6 + 1)] }, cv=3, n_iter=10))]) clfs.append(pipeline2) random_state_fixtures.append('62501') gridsearch = GridSearchCV( BaggingClassifier(base_estimator=SVC()), { "base_estimator__C": [0.01, 0.1, 10], "base_estimator__gamma": [0.01, 0.1, 10] }) clfs.append(gridsearch) random_state_fixtures.append('62501') randomsearch = RandomizedSearchCV( RandomForestClassifier(n_estimators=5), { "max_depth": [3, None], "max_features": [1, 2, 3, 4], "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "bootstrap": [True, False], "criterion": ["gini", "entropy"] }, cv=StratifiedKFold(n_splits=2, shuffle=True), n_iter=num_iterations) clfs.append(randomsearch) # The random states for the RandomizedSearchCV is set after the # random state of the RandomForestClassifier is set, therefore, # it has a different value than the other examples before random_state_fixtures.append('12172') for clf, rsv in zip(clfs, random_state_fixtures): run = self._perform_run(task_id, num_test_instances, clf, random_state_value=rsv) # obtain accuracy scores using get_metric_score: accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score) # compare with the scores in user defined measures accuracy_scores_provided = [] for rep in run.fold_evaluations['predictive_accuracy'].keys(): for fold in run.fold_evaluations['predictive_accuracy'][ rep].keys(): accuracy_scores_provided.append( run.fold_evaluations['predictive_accuracy'][rep][fold]) self.assertEquals(sum(accuracy_scores_provided), sum(accuracy_scores)) if isinstance(clf, BaseSearchCV): if isinstance(clf, GridSearchCV): grid_iterations = 1 for param in clf.param_grid: grid_iterations *= len(clf.param_grid[param]) self.assertEqual(len(run.trace_content), grid_iterations * num_folds) else: self.assertEqual(len(run.trace_content), num_iterations * num_folds) check_res = self._check_serialized_optimized_run(run.run_id) self.assertTrue(check_res) # todo: check if runtime is present self._check_fold_evaluations(run.fold_evaluations, 1, num_folds) pass
import pickle as pickle

from eeg_sandbox import *
import matplotlib.pyplot as plt

# BaggedSVM parameters chosen from gridsearch (deprecated)
C = 16
gamma = 0.0027472527472527475

# import X and y
X_save_path = './X_filtered_non_standardized.pkl'
X, y = load_X_and_y(X_save_path)
X, y = shuffle_X_and_y(X, y)
standardize_X(X)

# give a list of starting features, perhaps found using some other feature
# selection method. note this is optional - to use forward_selection alone
# simply omit the starting_features parameter when calling forward_selection.
starting_features = [30, 43, 49, 64, 108, 134, 159, 167, 200, 281, 299, 330]

# create model
model = BaggingClassifier(SVC(C=C, gamma=gamma), n_estimators=500,
                          max_features=1.0)
#model = SVC(C=C, gamma=gamma)

# run algorithm and display selected features
picked_features, errors = forward_selection(
    X, y, model, 10, max_error=True, k=4,
    starting_features=starting_features)
print(picked_features)
def third_generation(X, y, size=200, seed=None): mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],\ [0, 0.2, 0.5, 0.9], [0.1, 0.3, 0.6])) mlp_clf = [ MLPClassifier(hidden_layer_sizes=(h, ), momentum=m, learning_rate_init=a) for (h, m, a) in mlp_parameters ] mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters] neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 40)] weighting_methods = ['uniform', 'distance'] knn_clf = [ KNeighborsClassifier(n_neighbors=nn, weights=w) for (nn, w) in itertools.product(neigbhors_number, weighting_methods) ] knn_name = [ 'knn_{0}_{1}'.format(*param) for param in itertools.product( neigbhors_number, ['uniform', 'distance']) ] C = np.logspace(-3, 7, num=11) degree = [2, 3, 4] gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2] svm_clf_poly = [ SVC(C=c, kernel='poly', degree=d) for (c, d) in itertools.product(C, degree) ] svm_clf_poly_name = [ 'svm_poly_{0}_{1}'.format(*param) for param in itertools.product(C, degree) ] svm_clf_rbf = [ SVC(C=c, kernel='rbf', gamma=g) for (c, g) in itertools.product(C, gamma) ] svm_clf_rbf_name = [ 'svm_rbf_{0}_{1}'.format(*param) for param in itertools.product(C, gamma) ] dt_params = list(itertools.product(['gini', 'entropy'], \ [1, 2, 3, 4, 5, None], \ [None, 'sqrt', 'log2'], \ ['best', 'random'])) dt_clf = [ DecisionTreeClassifier(criterion=c, max_depth=d, max_features=f, splitter=s) for (c, d, f, s) in dt_params ] dt_name = ['dt_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params] et_clf = [ ExtraTreeClassifier(criterion=c, max_depth=d, max_features=f, splitter=s) for (c, d, f, s) in dt_params ] et_name = ['et_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params] ada_params = list(itertools.product([2**i for i in range(1, 14)], \ [1, 2, 3])) ada_dt_clf = [ AdaBoostClassifier(n_estimators=n, base_estimator=DecisionTreeClassifier(max_depth=m)) for (n, m) in ada_params ] ada_et_clf = [ AdaBoostClassifier(n_estimators=n, base_estimator=ExtraTreeClassifier(max_depth=m)) for (n, m) in ada_params ] ada_dt_name = ['ada_dt_{0}_{1}'.format(*param) for param in ada_params] ada_et_name = ['ada_et_{0}_{1}'.format(*param) for param in ada_params] nb_bag_est = 50 nb_bag_stumps = 200 bag_dt = BaggingClassifier(n_estimators=nb_bag_est, base_estimator=DecisionTreeClassifier()) bag_et = BaggingClassifier(n_estimators=nb_bag_est, base_estimator=ExtraTreeClassifier()) bag_stumps = BaggingClassifier( n_estimators=nb_bag_stumps, base_estimator=DecisionTreeClassifier(max_depth=1)) bag_dt.fit(X, y) bag_et.fit(X, y) bag_stumps.fit(X, y) dt_bag_clf = bag_dt.estimators_ et_bag_clf = bag_et.estimators_ stump_bag_clf = bag_stumps.estimators_ dt_bag_name = ['dt_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)] et_bag_name = ['et_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)] stump_bag_name = [ 'stump_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_stumps) ] bag_dt_clf = [bag_dt] bag_et_clf = [bag_dt] bag_stump_clf = [bag_stumps] bag_dt_name = ['bag_dt_{0}'.format(str(nb_bag_est))] bag_et_name = ['bag_et_{0}'.format(str(nb_bag_est))] bag_stump_name = ['bag_stump_{0}'.format(str(200))] nb_rf = 15 rf = RandomForestClassifier(n_estimators=nb_rf) rf.fit(X, y) dt_rf_clf = rf.estimators_ dt_rf_name = ['dt_rf_{0}'.format(nb_est) for nb_est in range(nb_rf)] log_parameters = list(itertools.product(['l1', 'l2'],\ np.logspace(-5, 9, num=15), [True, False])) log_clf = [ LogisticRegression(penalty=l, C=c, fit_intercept=f) for (l, c, f) in log_parameters ] log_name = ['log_{0}_{1}_{2}'.format(*param) for 
param in log_parameters] sgd_parameters = list( itertools.product([ 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive' ], ['elasticnet'], [True, False], np.arange(0, 1.1, 0.1))) sgd_clf = [ SGDClassifier(loss=l, penalty=p, fit_intercept=f, l1_ratio=l1) for (l, p, f, l1) in sgd_parameters ] sgd_name = [ 'sgd_{0}_{1}_{2}_{3}'.format(*param) for param in sgd_parameters ] pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_clf + et_clf + ada_dt_clf + ada_et_clf + \ dt_bag_clf + et_bag_clf + stump_bag_clf + bag_dt_clf + bag_et_clf + bag_stump_clf + dt_rf_clf + \ log_clf + sgd_clf pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + dt_name + et_name + ada_dt_name + \ ada_et_name + dt_bag_name + et_bag_name + stump_bag_name + bag_dt_name + bag_et_name + \ bag_stump_name + dt_rf_name + log_name + sgd_name for model in pool: if not check_model_is_fitted(model, X[0, :].reshape((1, -1))): model.fit(X, y) np.random.seed(seed) order = np.random.permutation(range(len(pool))) estimators = [pool[i] for i in order[:size]] return estimators, pool_name
print("Accuracy score for training samples", accuracy_score(y_train, dt.predict(X_train))) final_accuracy_scores_randomForest_gini.append([ dt, confusion_matrix(y_test, dt.predict(X_test)), accuracy_score(y_test, dt.predict(X_test)), confusion_matrix(y_train, dt.predict(X_train)), accuracy_score(y_train, dt.predict(X_train)) ]) from sklearn.model_selection import cross_val_score print("K-Fold results for machine learning model : {} ".format(dt)) print(cross_val_score(dt, X_train, y_train, cv=10)) predicted_randomForest_gini = dt.predict(X_test) predicted_randomForest_gini final_accuracy_scores_Bagging = [] dt = BaggingClassifier() dt.fit(X_train, y_train) dt.predict(X_train) dt.predict(X_test) print("") print( "---------------------------------------------------------------------------------------------------------" ) print("For the machine learning model : {}".format(i)) print("Confusion matrix for test samples") print(confusion_matrix(y_test, dt.predict(X_test))) print("Accuracy score for test samples", accuracy_score(y_test, dt.predict(X_test))) print("Confusion matrix for training samples") print(confusion_matrix(y_train, dt.predict(X_train))) print("Accuracy score for training samples",
# Computing prediction error
cart_error = np.mean((ypred - y_test)**2)
cart_verror = np.asarray([int(ypred[i] != y_test[i]) for i in range(0, ts)])
cart_error = np.sum(cart_verror)

print("🌲 ----------Decision Tree Classfication----------")
print(cart_error, "misclassified data out of", ts, "(", cart_error / ts, "%)\n")
# print ("")

'''--------------------
CART (Decision Tree) + Bagging
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
--------------------'''
bagb = BaggingClassifier(dtc, n_estimators=30, bootstrap_features=True,
                         bootstrap=bootstrap)
# adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=20, learning_rate=1.5, algorithm="SAMME")
bagb.fit(x_training, y_training)

# Predicting
bagb_pred = bagb.predict(x_test)

# Finding mispredicted samples
bagb_verror = np.asarray(
    [int(bagb_pred[i] != y_test[i]) for i in range(0, ts)])
bagb_error = np.sum(bagb_verror)
bagb_ccidx = np.where(bagb_verror == 0)
bagb_mcidx = np.where(bagb_verror == 1)

print("🌲 ----------Decision Tree Classfication + Bagging----------")
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                     test_size=0.2,
                                                     random_state=1,
                                                     stratify=y)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

tree = DecisionTreeClassifier(criterion='entropy',
                              random_state=1,
                              max_depth=None)

bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=500,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)

# Run Decision Tree to see the accuracy of the classifier
from sklearn.metrics import accuracy_score

tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

# Run Bagging classifier
def get_data(file_path):
    with open(file_path) as f:
        df = a2p.load(f)
    df = df.interpolate()
    input_features = df.drop(["defects@{false,true}"], axis=1)
    output_class = np.where(df["defects@{false,true}"] == 'true', 1, 0)
    return np.array(input_features), np.array(output_class)


X, y = get_data('../cm1.arff')

es = BaggingClassifier(base_estimator=Perceptron(max_iter=1000,
                                                 class_weight='balanced'),
                       n_estimators=100, max_samples=1.0, max_features=1.0,
                       bootstrap=True, bootstrap_features=False, n_jobs=4)

new_result = lambda: {'accuracy': [], 'roc_auc': [], 'gmean': [], 'f1': []}
results = new_result()

k_neigh = 7
threshold = 0.4

#20 repetitions
for rep in range(1, 6):
    skf = StratifiedKFold(n_splits=4)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
def train(): # if os.path.exists('dataset/per_feature_matrix'): # per_feature_matrix = pickle.load(open('dataset/per_feature_matrix', 'rb')) # else: start = time.time() print "extracting feature matrix..." if 1: per_feature_matrix = {} for each in os.listdir('dataset/per_feature'): path = os.path.join('dataset/per_feature/', each) per_feature_matrix = dict(pickle.load(open(path, 'rb')), **per_feature_matrix) per_feature_matrix = per_feature_matrix.values() pickle.dump(per_feature_matrix, open('dataset/per_feature_matrix', 'wb')) # if os.path.exists('dataset/api_feature_matrix'): # api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb')) # else: if 1: api_feature_matrix = {} for each in os.listdir('dataset/api_feature'): path = os.path.join('dataset/api_feature/', each) api_feature_matrix = dict(pickle.load(open(path, 'rb')), **api_feature_matrix) api_feature_matrix = api_feature_matrix.values() pickle.dump(api_feature_matrix, open('dataset/api_feature_matrix', 'wb')) # if os.path.exists('dataset/ngram_feature_matrix'): # ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb')) # else: if 1: ngram_feature_matrix = {} for each in os.listdir('dataset/ngram_feature'): path = os.path.join('dataset/ngram_feature/', each) ngram_feature_matrix = dict(pickle.load(open(path, 'rb')), **ngram_feature_matrix) ngram_feature_matrix = ngram_feature_matrix.values() pickle.dump(ngram_feature_matrix, open('dataset/ngram_feature_matrix', 'wb')) classification = pickle.load(open('dataset/classification', 'rb')) if per_feature_matrix is not None and api_feature_matrix is not None and ngram_feature_matrix is not None: feature_matrix = _concatenate(per_feature_matrix, api_feature_matrix, ngram_feature_matrix) elif per_feature_matrix is not None: feature_matrix = per_feature_matrix elif api_feature_matrix is not None: feature_matrix = api_feature_matrix elif ngram_feature_matrix is not None: feature_matrix = ngram_feature_matrix else: return print "extracting feature matrix done." 
print "处理前样本总数:%d" % len(feature_matrix) #print len(feature_matrix) #print len(classification) features = 400 fsmodel = SelectKBest(chi2, k=features) raw_feature_matrix = feature_matrix feature_matrix = fsmodel.fit_transform(feature_matrix, classification) pickle.dump(fsmodel, open('dataset/fsmodel', 'wb')) features = 300 svc = SVC(kernel="linear", C=1) fsmodel2 = RFE(estimator=svc, n_features_to_select=features, step=1) ######################### DEBUG ############################ #classification = classification[7:] ################################################################## feature_matrix = fsmodel2.fit_transform(feature_matrix, classification) pickle.dump(fsmodel2, open('dataset/fsmodel2', 'wb')) ######################### DEBUG ############################ b_s = 5 #改这里也要改dl.py里面的默认值 length = len(feature_matrix) feature_matrix = feature_matrix[length % b_s:] raw_feature_matrix = raw_feature_matrix[length % b_s:] classification = classification[length % b_s:] print "处理后样本总数:%d" % len(feature_matrix) ################################################################## ######################### DEBUG ############################ fs_vec = [] for i in range(len(raw_feature_matrix[0])): fs_vec.append(i) #构造值等于编号的特殊向量 fs_vec = fsmodel.transform(fs_vec) #print fs_vec fs_vec = fsmodel2.transform(fs_vec) #print fs_vec feature_matrix_dl = [x for x in range(len(raw_feature_matrix))] for i in range(len(feature_matrix_dl)): feature_matrix_dl[i] = [ x for x in range(len(raw_feature_matrix[0]) - features) ] temp = 0 for i in range(len(raw_feature_matrix[0])): if i not in fs_vec: print "第%d列特征没有选用" % i for j in range(len(feature_matrix_dl)): feature_matrix_dl[j][temp] = raw_feature_matrix[j][i] temp = temp + 1 #print "行数%d" % len(feature_matrix_dl) #print "列数%d" % len(feature_matrix_dl[0]) #print feature_matrix_dl ################################################################## #hiddeny, da = test_dA(feature_matrix_dl, len(feature_matrix_dl[0])) # hiddeny2, test = test_dA(feature_matrix,len(feature_matrix[0]), batch_size=6, da_object = da) hiddeny, da = test_rbm(feature_matrix_dl, len(feature_matrix_dl[0])) #print len(feature_matrix) print "浅度特征数:%d" % len(feature_matrix[0]) #print len(hiddeny) print "深度特征数:%d" % len(hiddeny[0]) # print (hiddeny == hiddeny2).all() #固化深度训练器 pickle.dump(da, open('dataset/rbmmodel', 'wb')) # 深度特征融合 feature_matrix = numpy.concatenate((feature_matrix, hiddeny), axis=1) Z = [] count = 0 for i in feature_matrix: Z.append([]) for j in i: Z[count].append(j) count += 1 feature_matrix = Z # print feature_matrix Z = [] for i in classification: Z.append(int(i)) classification = Z if 1: per_feature_matrix2 = {} for each in os.listdir('test/per_feature'): path = os.path.join('test/per_feature/', each) per_feature_matrix2 = dict(pickle.load(open(path, 'rb')), **per_feature_matrix2) per_feature_matrix2 = per_feature_matrix2.values() pickle.dump(per_feature_matrix2, open('test/per_feature_matrix', 'wb')) # if os.path.exists('dataset/api_feature_matrix'): # api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb')) # else: if 1: api_feature_matrix2 = {} for each in os.listdir('test/api_feature'): path = os.path.join('test/api_feature/', each) api_feature_matrix2 = dict(pickle.load(open(path, 'rb')), **api_feature_matrix2) api_feature_matrix2 = api_feature_matrix2.values() pickle.dump(api_feature_matrix2, open('test/api_feature_matrix', 'wb')) # if os.path.exists('dataset/ngram_feature_matrix'): # ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 
'rb')) # else: if 1: ngram_feature_matrix2 = {} for each in os.listdir('test/ngram_feature'): path = os.path.join('test/ngram_feature/', each) ngram_feature_matrix2 = dict(pickle.load(open(path, 'rb')), **ngram_feature_matrix2) ngram_feature_matrix2 = ngram_feature_matrix2.values() pickle.dump(ngram_feature_matrix2, open('test/ngram_feature_matrix', 'wb')) classification2 = pickle.load(open('test/classification', 'rb')) if per_feature_matrix2 is not None and api_feature_matrix2 is not None and ngram_feature_matrix2 is not None: feature_matrix2 = _concatenate(per_feature_matrix2, api_feature_matrix2, ngram_feature_matrix2) elif per_feature_matrix2 is not None: feature_matrix2 = per_feature_matrix2 elif api_feature_matrix2 is not None: feature_matrix2 = api_feature_matrix2 elif ngram_feature_matrix2 is not None: feature_matrix2 = ngram_feature_matrix2 else: return print "extracting feature matrix done." print "处理前样本总数:%d" % len(feature_matrix2) #print len(feature_matrix) #print len(classification) features = 400 fsmodel2 = SelectKBest(chi2, k=features) raw_feature_matrix2 = feature_matrix2 feature_matrix2 = fsmodel.fit_transform(feature_matrix2, classification2) features2 = 300 svc = SVC(kernel="linear", C=1) fsmodel2 = RFE(estimator=svc, n_features_to_select=features2, step=1) feature_matrix2 = fsmodel2.fit_transform(feature_matrix2, classification2) ######################### DEBUG ############################ b_s = 5 #改这里也要改dl.py里面的默认值 length = len(feature_matrix2) feature_matrix2 = feature_matrix2[length % b_s:] raw_feature_matrix2 = raw_feature_matrix2[length % b_s:] classification2 = classification2[length % b_s:] print "处理后样本总数:%d" % len(feature_matrix2) ################################################################## ######################### DEBUG ############################ fs_vec2 = [] for i in range(len(raw_feature_matrix2[0])): fs_vec2.append(i) #构造值等于编号的特殊向量 fs_vec2 = fsmodel.transform(fs_vec2) #print fs_vec fs_vec2 = fsmodel2.transform(fs_vec2) #print fs_vec feature_matrix_dl2 = [x for x in range(len(raw_feature_matrix2))] for i in range(len(feature_matrix_dl2)): feature_matrix_dl2[i] = [ x for x in range(len(raw_feature_matrix2[0]) - features2) ] temp = 0 for i in range(len(raw_feature_matrix2[0])): if i not in fs_vec2: print "第%d列特征没有选用" % i for j in range(len(feature_matrix_dl2)): feature_matrix_dl2[j][temp] = raw_feature_matrix2[j][i] temp = temp + 1 hiddeny2, da = test_rbm(feature_matrix_dl2, len(feature_matrix_dl2[0])) #print len(feature_matrix) print "浅度特征数:%d" % len(feature_matrix2[0]) #print len(hiddeny) print "深度特征数:%d" % len(hiddeny2[0]) # print (hiddeny == hiddeny2).all() # 深度特征融合 feature_matrix2 = numpy.concatenate((feature_matrix2, hiddeny2), axis=1) Z = [] count = 0 for i in feature_matrix2: Z.append([]) for j in i: Z[count].append(j) count += 1 feature_matrix2 = Z # print feature_matrix Z = [] for i in classification2: Z.append(int(i)) classification2 = Z ''' kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True) print "\nlearning with RF..." rf = RandomForestClassifier(n_estimators=300, min_samples_split=10) rf.fit(feature_matrix2, classification2) print "Cross Validating..." scores = cross_validation.cross_val_score(rf, feature_matrix2, classification2, cv=kf) print scores print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)) print "learning with RF done.\n" pickle.dump(rf, open('dataset/model', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with GBDT..." 
gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0, max_depth=100, min_samples_split=10, random_state=0) gbdt.fit(feature_matrix2, classification2) print "Cross Validating..." scores = cross_validation.cross_val_score(gbdt, feature_matrix2, classification2, cv=kf) print scores print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)) print "learning with GBDT done.\n" pickle.dump(gbdt, open('dataset/model2', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with AdaBoost..." ada = AdaBoostClassifier(n_estimators=300) ada.fit(feature_matrix2, classification2) print "Cross Validating..." scores = cross_validation.cross_val_score(ada, feature_matrix2, classification2, cv=kf) print scores print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)) print "learning with AdaBoost done.\n" pickle.dump(ada, open('dataset/model3', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with LogisticRegression..." lr = LogisticRegression() lr.fit(feature_matrix2, classification2) print "Cross Validating..." scores = cross_validation.cross_val_score(lr, feature_matrix2, classification2, cv=kf) print scores print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)) print "learning with LogisticRegression done.\n" pickle.dump(lr, open('dataset/model4', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True) print "\nlearning with RF..." rf = RandomForestClassifier(n_estimators=300, min_samples_split=10) rf.fit(feature_matrix2, classification2) print "Cross Validating..." predicted = cross_validation.cross_val_predict(rf, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with RF done.\n" pickle.dump(rf, open('dataset/model', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with GBDT..." gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0, max_depth=100, min_samples_split=10, random_state=0) gbdt.fit(feature_matrix2, classification2) print "Cross Validating..." predicted = cross_validation.cross_val_predict(gbdt, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with GBDT done.\n" pickle.dump(gbdt, open('dataset/model2', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with AdaBoost..." ada = AdaBoostClassifier(n_estimators=300) ada.fit(feature_matrix2, classification2) print "Cross Validating..." 
predicted = cross_validation.cross_val_predict(ada, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with AdaBoost done.\n" pickle.dump(ada, open('dataset/model3', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with LogisticRegression..." lr = LogisticRegression() lr.fit(feature_matrix2, classification2) print "Cross Validating..." predicted = cross_validation.cross_val_predict(lr, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with LogisticRegression done.\n" pickle.dump(lr, open('dataset/model4', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) ''' ''' kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True) print "\nlearning with SVC..." slffork=SVC(kernel='rbf',probability = True) slffork.fit(feature_matrix2, classification2) print "Cross Validating..." predicted = cross_validation.cross_val_predict(slffork, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with SVC done.\n" pickle.dump(slffork, open('dataset/model2', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) ''' ''' print "learning with BaggingClassifier..." kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True) baggingfork = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5,max_features=0.5) baggingfork.fit(feature_matrix2, classification2) print "Cross Validating..." 
predicted = cross_validation.cross_val_predict(baggingfork, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with BaggingClassifier done.\n" pickle.dump(baggingfork, open('dataset/model2', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) ''' '''kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)''' rf = RandomForestClassifier(n_estimators=300, min_samples_split=10) gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0, max_depth=100, min_samples_split=10, random_state=0) ada = AdaBoostClassifier(n_estimators=300) #slf1=SVC(kernel='rbf',probability = True) bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5) print "learning with Voting Classifier..." vc = VotingClassifier(estimators=[('rf', rf), ('ada', ada), ('bagging', bagging), ('gbdt', gbdt)], voting='soft', weights=[1.5, 1.5, 1.3, 1.5]) vc.fit(feature_matrix, classification) ''' print "Cross Validating..." predicted = cross_validation.cross_val_predict(vc, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) ''' print "learning with Ensemble Classifier done.\n" pickle.dump(vc, open('dataset/model_final', 'wb')) # 固化训练结果 print 'time :%f' % (time.time() - start)
classReport(y_train, y_train_pred4, y_pred4, y_test)

#Cross fit metrics
scores4 = cross_val_score(clf4, features, target, cv=cv)
mean_score(scores4)
print_score(scores4)

######################################################################################
#using bagging
print('\nResult of Bagging Classifier')

#classifier and fit
#clf5 = BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=0.5)
clf5 = Pipeline([('scaler', StandardScaler()),
                 ('', BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                        max_samples=0.5, max_features=1.0,
                                        n_estimators=20))])
clf5 = clf5.fit(X_train, y_train)

#prediction
y_train_pred5 = clf5.predict(X_train)
y_pred5 = clf5.predict(X_test)

#metrics creation
classReport(y_train, y_train_pred5, y_pred5, y_test)

#Cross fit metrics
scores5 = cross_val_score(clf5, features, target, cv=cv)
mean_score(scores5)
print_score(scores5)
acc11 = accuracy_score(y_test, final_pred1, normalize=True)
acc12 = accuracy_score(y_test, final_pred2)
acc13 = accuracy_score(y_test, final_pred3)
acc14 = accuracy_score(y_test, final_pred6)
acc15 = accuracy_score(y_test, final_pred7)
acc16 = accuracy_score(y_test, final_pred8)
acc17 = accuracy_score(y_test, final_pred9)
#Pre6=metrics.precision_score(y_test,pred6,average=None)

model = AdaBoostClassifier(random_state=1)
model.fit(x_train, y_train)
ac = model.score(x_test, y_test)

model12 = BaggingClassifier(DecisionTreeClassifier(random_state=1))
model12.fit(x_train, y_train)
ac12 = model12.score(x_test, y_test)

model13 = xgb.XGBClassifier(random_state=1, learning_rate=0.01)
#model13.fit(x_train,y_train)
#acc13=model13.score(x_test,y_test)

print("\n\n")
print("Random Forest :", end="")
print(acc1)
print("Kneighbors :", end="")
print(acc2)
print("SVM :", end="")
print(acc3)
print("Linear Regression :", end="")
def bagging(train_x, train_y, test_x, test_y, msno_df):
    print("Bagging")
    clf = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(),
                            max_samples=0.9, n_estimators=30, bootstrap=False)
    checkResult(clf, "Bagging", train_x, train_y, test_x, test_y, msno_df)
clf = AdaBoostClassifier()
clf.fit(x_train, y_train)
print("AdaBoost classifier")
print(clf.score(x_test, y_test))
print("\n")

#Bagging Classifier
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1,
                                                    random_state=42)
clf = BaggingClassifier()
clf.fit(x_train, y_train)
print("Bagging classifier")
print(clf.score(x_test, y_test))
print("\n")

#ExtraTrees Classifier
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1,
                                                    random_state=42)
clf = ExtraTreesClassifier()
clf.fit(x_train, y_train)
print('The cross validated score is', cross.mean())

# ## Bagging
#
# Bagging is a general ensemble method. It works by training similar classifiers on bootstrapped
# subsets of the dataset and then averaging all of their predictions. Because of this averaging,
# the variance of the ensemble is reduced. Unlike a Voting Classifier, Bagging uses copies of the
# same type of classifier.
#
# #### Bagged KNN
#
# Bagging works best with high-variance models; bagged decision trees (essentially what a Random
# Forest does) are the classic example. We can also use KNN with a small value of **n_neighbours**,
# since a small n_neighbours makes the model more sensitive to its training sample, i.e. higher
# variance.

# In[ ]:

from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=3), random_state=0, n_estimators=700)
model.fit(train_X, train_Y)
prediction = model.predict(test_X)
print('The accuracy for bagged KNN is:', metrics.accuracy_score(prediction, test_Y))
result = cross_val_score(model, X, Y, cv=10, scoring='accuracy')
print('The cross validated score for bagged KNN is:', result.mean())

# #### Bagged DecisionTree

# In[ ]:

model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=0, n_estimators=100)
model.fit(train_X, train_Y)
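# A minimal follow-up sketch (an assumption, not part of the original cell) of how the bagged
# decision tree above would typically be scored, reusing the same train_X/test_X/X/Y variables and
# the metrics / cross_val_score imports already used for the bagged KNN example:

# In[ ]:

prediction = model.predict(test_X)
print('The accuracy for bagged Decision Tree is:', metrics.accuracy_score(prediction, test_Y))
result = cross_val_score(model, X, Y, cv=10, scoring='accuracy')
print('The cross validated score for bagged Decision Tree is:', result.mean())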
        GradientBoostingClassifier(),
        dict(learning_rate=[0.001, 0.01, 0.1],
             n_estimators=[10, 100, 1000],
             subsample=[0.5, 0.7, 1.0],
             max_depth=[3, 7, 9])
    ],
    'RidgeClassifier': [
        RidgeClassifier(),
        dict(alpha=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
    ],
    'LogisticRegression': [
        LogisticRegression(),
        dict(solver=['newton-cg', 'lbfgs', 'liblinear'],
             penalty=['l2'],
             C=[100, 10, 1.0, 0.1, 0.01])
    ],
    'BaggingClassifier': [
        BaggingClassifier(),
        dict(n_estimators=[10, 100, 1000])
    ],
    'ANN': [
        MLPClassifier(),
        dict(activation=['identity', 'logistic', 'tanh', 'relu'],
             hidden_layer_sizes=[(i, j) for i in range(1, 101, 10) for j in range(1, 101, 10)])
    ]
}

which_algorithm_to_run = [
    'random forest', 'SVM', 'GB', 'RidgeClassifier', 'LogisticRegression',
    'BaggingClassifier', 'ANN'
]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

knn_class = KNeighborsClassifier().fit(X_train, y_train)
knn_pred = knn_class.predict(X_test)
print('=' * 64)
print('Accuracy with only KNeighborsClassifier:', accuracy_score(knn_pred, y_test))

#bag_class = BaggingClassifier(base_estimator=KNeighborsClassifier(),
#                              n_estimators=50).fit(X_train, y_train)
#bag_pred = bag_class.predict(X_test)
#print(accuracy_score(bag_pred, y_test))
#print('='*64)

classifier = {
    'KNeighbors': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(),
    'LinearSCV': LinearSVC(),
    'SVC': SVC(),
    'SGDC': SGDClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomTreeForest': RandomForestClassifier(random_state=0)
}

for name, estimator in classifier.items():
    bag_class = BaggingClassifier(base_estimator=estimator, n_estimators=30).fit(X_train, y_train)
    bag_pred = bag_class.predict(X_test)
    print(f'Accuracy Bagging with {name}:', accuracy_score(bag_pred, y_test))
svmendzeit = timeit.default_timer()
print("SVM time: ", svmendzeit - svmzeit)
print("Linear :", "Score 1: ", score_linear1, "Score 2: ", score_linear2,
      "Poly : ", "Score 1: ", score_poly1, "Score 2: ", score_poly2,
      "Gauss : ", "Score 1: ", score_gauss1, "Score 2 : ", score_gauss2)
print(" ")
#"""

# In[7]
"""Bagging """
#"""
startBag = timeit.default_timer()

model_linear = BaggingClassifier(base_estimator=svm_linear, n_estimators=50, random_state=1)
score_linear_bag1 = model_linear.fit(X1_train, y1_train).score(X1_test, y1_test)
score_linear_bag2 = model_linear.fit(X2_train, y2_train).score(X2_test, y2_test)
bag1 = timeit.default_timer()
print("linear bag score time : ", bag1 - startBag)

model_poly = BaggingClassifier(base_estimator=svm_poly, n_estimators=50, random_state=1)
score_poly_bag1 = model_poly.fit(X1_train, y1_train).score(X1_test, y1_test)
score_poly_bag2 = model_poly.fit(X2_train, y2_train).score(X2_test, y2_test)
bag2 = timeit.default_timer()
print("poly bag score time", bag2 - bag1)
print(testData.head())

# Initialize the DSBox Encoder
hp = EncHyperparameter(text2int=True, n_limit=12, categorical_features='95in10')
enc = Encoder(hyperparams=hp)
enc.set_training_data(inputs=trainData)
enc.fit()
print(type(enc.get_params()))
print(enc.get_params())

imputer = Imputer()
model = BaggingClassifier()

print(trainData.columns)
encodedTrainData = enc.produce(inputs=trainData).value
processedTrainData = imputer.fit_transform(encodedTrainData)
trainedModel = model.fit(processedTrainData, np.asarray(trainTargets[target_name]))
print(encodedTrainData.columns)  # encoded result

predictedTargets = trainedModel.predict(
    imputer.fit_transform(enc.produce(inputs=testData).value))

# Append the d3mindex column to the predicted targets
predictedTargets = pd.DataFrame({
def test_base():
    """Check BaseEnsemble methods."""
    ensemble = BaggingClassifier(base_estimator=Perceptron(), n_estimators=3)

    iris = load_iris()
    ensemble.fit(iris.data, iris.target)
    ensemble.estimators_ = []  # empty the list and create estimators manually

    ensemble._make_estimator()
    ensemble._make_estimator()
    ensemble._make_estimator()
    ensemble._make_estimator(append=False)

    assert_equal(3, len(ensemble))
    assert_equal(3, len(ensemble.estimators_))

    assert_true(isinstance(ensemble[0], Perceptron))
import kfold_template

import pandas
# from sklearn import tree
from sklearn.ensemble import BaggingClassifier

dataset = pandas.read_csv("dataset.csv")

target = dataset.iloc[:, 30].values
data = dataset.iloc[:, 0:30].values
# print(target)
# print(data)

machine = BaggingClassifier(n_estimators=21)

r2_scores, accuracy_scores, confusion_matrices = kfold_template.run_kfold(
    3, data, target, machine, 1, 1)

print(r2_scores)
print(accuracy_scores)
for i in confusion_matrices:
    print(i)
"--size", default=1000, help="config the data set size", action="store", type="int", dest="size") (options, args) = parser.parse_args() if options.size != DATASET_SIZE: dataset_size = options.size else: dataset_size = DATASET_SIZE bag_clf = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5) ada_clf = AdaBoostClassifier(n_estimators=5) rdf_clf = RandomForestClassifier(n_estimators=5, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, min_impurity_split=1e-7, bootstrap=True, oob_score=False,
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier
from vecstack import stacking
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from drop_highlycorelated import clf, xtrain, ytrain, xtest, ytest, X_important_train, X_important_test

models = [
    svm.SVC(kernel='linear', C=1),
    RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=1000, max_depth=3),
    BaggingClassifier(svm.SVC(kernel='linear', C=1))
]

S_train, S_test = stacking(models,                        # list of models
                           X_important_train, ytrain,
                           X_important_test,              # data
                           mode='oof_pred_bag',           # mode: oof for train set, predict test set in each fold and find mean
                           regression=True,               # regression task (if you need classification - set to False)
                           save_dir=None,                 # do not save result and log (to save in current dir - set to '.')
                           metric=mean_absolute_error,    # metric: callable
                           n_folds=4,                     # number of folds
                           shuffle=True,                  # shuffle the data
                           random_state=0,                # ensure reproducibility
                           verbose=2)
def second_generation(X, y, seed=None):
    features = []

    ### 25 x 2 bagged trees
    bag_gini = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(criterion='gini'),
        n_estimators=25,
        random_state=seed)
    bag_gini.fit(X, y)
    bag_gini_names = ['bag_gini_' + str(i) for i in range(25)]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(bag_gini_names))])

    bag_entropy = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(criterion='entropy'),
        n_estimators=25,
        random_state=3 * seed**2)
    bag_entropy.fit(X, y)
    bag_entropy_names = ['bag_entropy_' + str(i) for i in range(25)]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(bag_entropy_names))])

    ### 25 x 2 random subspaces
    rs_gini = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(criterion='gini'),
        n_estimators=25,
        max_features=int(np.sqrt(X.shape[1])),
        bootstrap=False,
        random_state=seed)
    rs_gini.fit(X, y)
    rs_gini_names = ['rs_gini_' + str(i) for i in range(25)]
    features.extend(rs_gini.estimators_features_)

    rs_entropy = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(criterion='entropy'),
        n_estimators=25,
        max_features=int(np.sqrt(X.shape[1])),
        bootstrap=False,
        random_state=3 * seed**2)
    rs_entropy.fit(X, y)
    rs_entropy_names = ['rs_entropy_' + str(i) for i in range(25)]
    features.extend(rs_entropy.estimators_features_)

    ### 14 Ada
    nb_stumps = [2, 4, 8, 16, 32, 64, 128]
    ada_st_gini = [
        AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            criterion='gini', max_depth=1),
                           n_estimators=st,
                           random_state=seed) for st in nb_stumps
    ]
    ada_st_gini_names = ['ada_st_gini_' + str(i) for i in nb_stumps]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(ada_st_gini_names))])
    for clf in ada_st_gini:
        clf.fit(X, y)

    ada_st_entropy = [
        AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            criterion='entropy', max_depth=1),
                           n_estimators=st,
                           random_state=3 * seed**2) for st in nb_stumps
    ]
    ada_st_entropy_names = ['ada_st_entropy_' + str(i) for i in nb_stumps]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(ada_st_entropy_names))])
    for clf in ada_st_entropy:
        clf.fit(X, y)

    ### 8 Ada DT
    nb_dt = [2, 4, 8, 16]
    ada_dt_gini = [
        AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            criterion='gini', max_depth=3),
                           n_estimators=dt,
                           random_state=seed) for dt in nb_dt
    ]
    ada_dt_gini_names = ['ada_dt_gini_' + str(i) for i in nb_dt]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(ada_dt_gini_names))])
    for clf in ada_dt_gini:
        clf.fit(X, y)

    ada_dt_entropy = [
        AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            criterion='entropy', max_depth=3),
                           n_estimators=dt,  # note: was n_estimators=st, which reused the last stump count
                           random_state=3 * seed**2) for dt in nb_dt
    ]
    ada_dt_entropy_names = ['ada_dt_entropy_' + str(i) for i in nb_dt]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(ada_dt_entropy_names))])
    for clf in ada_dt_entropy:
        clf.fit(X, y)

    ### 24 ANN
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],
                                            [0, 0.2, 0.5, 0.9]))
    mlp_clf = [
        MLPClassifier(hidden_layer_sizes=(h, ), momentum=m)
        for (h, m) in mlp_parameters
    ]
    for clf in mlp_clf:
        clf.fit(X, y)
    mlp_name = ['mlp_{0}_{1}'.format(*param) for param in mlp_parameters]
    features.extend([np.arange(X.shape[1]) for _ in range(len(mlp_name))])

    ### 54 SVM
    C = np.logspace(-3, 2, num=6)
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_linear = [SVC(C=c, kernel='poly', degree=1) for c in C]
    for clf in svm_linear:
        clf.fit(X, y)
    svm_linear_names = ['svm_linear_' + str(c) for c in C]
    features.extend(
        [np.arange(X.shape[1]) for _ in range(len(svm_linear_names))])

    svm_rbf = [SVC(C=c, gamma=g) for c, g in itertools.product(C, gamma)]
    for clf in svm_rbf:
        clf.fit(X, y)
    svm_rbf_names = [
        'svm_rbf_{0}_{1}'.format(*param)
        for param in itertools.product(C, gamma)
    ]
    features.extend([np.arange(X.shape[1]) for _ in range(len(svm_rbf_names))])

    pool = bag_gini.estimators_ + bag_entropy.estimators_ + rs_gini.estimators_ + rs_entropy.estimators_ + \
        ada_st_gini + ada_st_entropy + ada_dt_gini + ada_dt_entropy + mlp_clf + svm_linear + svm_rbf
    pool_name = bag_gini_names + bag_entropy_names + rs_gini_names + rs_entropy_names + ada_st_gini_names + \
        ada_st_entropy_names + ada_dt_gini_names + ada_dt_entropy_names + mlp_name + svm_linear_names + \
        svm_rbf_names

    return pool, pool_name, features
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.30, random_state=355)

# In[46]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# In[49]:

bag_decision = BaggingClassifier(DecisionTreeClassifier())

# In[50]:

bag_decision.fit(X_train, y_train)

# In[51]:

bag_decision.score(X_test, y_test)

# In[52]:
column_names = ["Sample Code Number","Clump Thickness","Uniformity of Cell Size"\ ,"Uniformity of Cell Shape","Marginal Adhesion","Single Epithelial Cell Size"\ ,"Bare Nuclei","Bland Chromatin","Normal Nuclei","Mitoses","Class Label"] cancer_data.columns = column_names del cancer_data["Sample Code Number"] cancer_data = cancer_data.iloc[np.random.permutation(len(cancer_data))] class_labels = cancer_data["Class Label"] del cancer_data["Class Label"] train_data,test_data,train_labels,test_labels = cross_validation.train_test_split(cancer_data,class_labels,test_size=0.3) #Initializing classifiers rf = RandomForestClassifier(n_estimators=101) ada = AdaBoostClassifier(n_estimators=101) bagging = BaggingClassifier(n_estimators=101) grad_boost = GradientBoostingClassifier(n_estimators=101) mnb = MultinomialNB() gnb = GaussianNB() bnb = BernoulliNB() brm = BernoulliRBM() percept = Perceptron() svm = SVC() knn = KNeighborsClassifier(n_neighbors=5) radnn = RadiusNeighborsClassifier(radius=10.3) classifiers = [rf,ada,bagging,grad_boost,mnb,gnb,bnb,percept,svm,knn,radnn] classifier_names = ["Random Forests","Adaboost","Bagging","Gradient Boost","Multinomial NB"\ ,"Gaussian NB","Bernoulli NB","Perceptron","SVM (RBF)","KNN (K=5)","RadiusNN(r=10.3)"] for classifier,classifier_name in zip(classifiers,classifier_names):
In sklearn, you can evaluate the OOB accuracy of an ensemble classifier by setting the parameter
oob_score to True during instantiation. After training the classifier, the OOB accuracy can be
obtained by accessing the .oob_score_ attribute of the corresponding instance.

In your environment, we have made available the class DecisionTreeClassifier from sklearn.tree.

Instructions
- Import BaggingClassifier from sklearn.ensemble.
- Instantiate a DecisionTreeClassifier with min_samples_leaf set to 8.
- Instantiate a BaggingClassifier consisting of 50 trees and set oob_score to True."""

# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier

# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=8, random_state=1)

# Instantiate bc
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, oob_score=True, random_state=1)

"""DEVELOPER"""
"""BasitAminBhatti"""
"""Github"""
"""https://github.com/basitaminbhatti"""
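# A minimal sketch of how the OOB estimate is typically compared against held-out test accuracy
# once bc is fitted. It assumes X_train, X_test, y_train, y_test come from an earlier
# train_test_split, as in the exercise environment; it is not part of the original solution.
from sklearn.metrics import accuracy_score

bc.fit(X_train, y_train)
oob_accuracy = bc.oob_score_  # accuracy estimated on the out-of-bag samples
test_accuracy = accuracy_score(y_test, bc.predict(X_test))
print('OOB accuracy: {:.3f}'.format(oob_accuracy))
print('Test accuracy: {:.3f}'.format(test_accuracy))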
from sklearn.model_selection import train_test_split, GridSearchCV

#accuracy = []
#for i in range(200):
#    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90)
#    clf = tree.DecisionTreeClassifier()
#    clf.fit(X_train, y_train)
#    Z = clf.predict(X_test)
#    accuracy.append(clf.score(X_test,y_test))
#print(np.mean(accuracy))
#print(np.std(accuracy))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90)
clf = BaggingClassifier(tree.DecisionTreeClassifier(), max_samples=0.5, max_features=0.5,
                        n_estimators=200)
clf.fit(X_train, y_train)
Z = clf.predict(X_test)
print(clf.score(X_test, y_test))

#plotAccuracyBagging(X, y, nbLoop=201)

#arbre = BaggingClassifier(tree.DecisionTreeClassifier(), n_estimators=200)
#parameters = {'max_samples' : np.random.uniform(0,1,10), 'max_features' : np.random.uniform(0,1,10)}
#clf = GridSearchCV(arbre, parameters, cv=5)
#clf.fit(X_train, y_train)
#print(clf.best_params_)

clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train, y_train)
x = dataset.values[:, 0:10]
y = dataset.values[:, 10]
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, train_size=0.7, test_size=0.3, random_state=0, stratify=y)

models = [('Logistic Regression', LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)),
          ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
          ('Support Vector Machine', SVC(random_state=0, gamma='scale', probability=True)),
          ('Naive-Bayes Classifier', GaussianNB()),
          ('K-Nearest Neighborhood', KNeighborsClassifier(n_neighbors=11)),
          ('Basic Decision Tree', DecisionTreeClassifier(random_state=0)),
          ('Bagged Tree', BaggingClassifier(random_state=0)),
          ('Boosted Tree', GradientBoostingClassifier(random_state=0)),
          ('Random Forest', RandomForestClassifier(random_state=0, n_estimators=100))]

accuracy_list = []
cv_score_list = []
roc_auc_list = []
i = 0

print("Model Name                      Accuracy    CV Score    ROC-AUC Score")
print("----------                      --------    --------    -------------")
class _BaggingClassifierImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
    ):
        estimator_impl = base_estimator

        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "bootstrap_features": bootstrap_features,
            "oob_score": oob_score,
            "warm_start": warm_start,
            "n_jobs": n_jobs,
            "random_state": random_state,
            "verbose": verbose,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y, sample_weight=None):
        if isinstance(X, pd.DataFrame):
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._hyperparams["base_estimator"] = (
                feature_transformer >> self._hyperparams["base_estimator"]
            )
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.fit(X, y, sample_weight)
        return self

    def predict(self, X, **predict_params):
        return self._wrapped_model.predict(X, **predict_params)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def predict_log_proba(self, X):
        return self._wrapped_model.predict_log_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
predict_test = estimator.predict(test)

#*** checking results
print('the in-bag prediction accuracy rate is:',
      (y_data == predict_data).sum() / float(y_data.shape[0]))

#*** output the result
print('save the result in test_result.csv file.......')
output = pd.read_csv('sample_submission.csv')
output.loc[:, 'Label'] = predict_test
output.to_csv('test_result.csv', index=False)
total.loc[:, 'knn'] = predict_test

#*************** bagging model:
#different types of pipeline, with pca or not
bagging_pip = Pipeline([
    ('scaler', ScalingByRange()),
    ('pca', PCA()),
    ('bagging', BaggingClassifier(SVC(C=9, gamma=0.04, kernel='rbf')))
])  #max_depth=10,

# bagging_cv = True
bagging_cv = False
# bagging_cv = None

if bagging_cv is None:
    print('we skip the bagging model training at this time.......')
elif bagging_cv:
    #************ select partial of train to improve speed
    # train = train.iloc[0:5000,:]
    # y_train = y_train.iloc[0:5000]

    #************ test the parameter setting manually here