def trees(x_train, x_test, y_train, y_test):
    res = []

    m = tree.DecisionTreeRegressor()
    m.fit(x_train, y_train)
    predictions = m.predict(x_test)
    acc = mean_squared_error(y_test, predictions)
    modelPack['DecisionTreeRegressor'] = m  # modelPack: module-level dict collecting the fitted models
    res.append((acc, "DecisionTreeRegressor"))

    m = tree.ExtraTreeRegressor()
    m.fit(x_train, y_train)
    predictions = m.predict(x_test)
    acc = mean_squared_error(y_test, predictions)
    modelPack['ExtraTreeRegressor'] = m
    res.append((acc, "ExtraTreeRegressor"))

    print(res)
    return res
def get_xtr():
    """An extremely randomized tree regressor.

    * criterion: {"mse", "friedman_mse", "mae"}, default="mse"
        The function to measure the quality of a split.
    * splitter: {"random", "best"}, default="random"
        The strategy used to choose the split at each node.
    * max_depth: int, default=None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.
    * min_samples_split: int or float, default=2
        The minimum number of samples required to split an internal node.
    * min_samples_leaf: int or float, default=1
        The minimum number of samples required to be at a leaf node.
    * min_weight_fraction_leaf: float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all the
        input samples) required to be at a leaf node. Samples have equal
        weight when sample_weight is not provided.
    * max_features: int, float, {"auto", "sqrt", "log2"} or None, default="auto"
        The number of features to consider when looking for the best split.
    """
    return tree.ExtraTreeRegressor(
        criterion="mse",
        splitter="random",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=rnd_state,
    )
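# A minimal usage sketch for get_xtr() -- not part of the original snippet. It uses
# synthetic data and defines `rnd_state` locally (the original assumes it exists at
# module level), and it assumes an older scikit-learn where criterion="mse" is still
# accepted by ExtraTreeRegressor.
from sklearn import tree
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

rnd_state = 42
X_demo, y_demo = make_regression(n_samples=200, n_features=5, noise=1.0, random_state=rnd_state)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=rnd_state)
xtr = get_xtr()
xtr.fit(X_tr, y_tr)
print("demo MSE:", mean_squared_error(y_te, xtr.predict(X_te)))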
def test_sk_ExtraTreeRegressor():
    print("Testing sklearn, ExtraTreeRegressor...")
    mod = tree.ExtraTreeRegressor()
    X, y = iris_data
    mod.fit(X, y)
    docs = {'name': "ExtraTreeRegressor test"}
    fv = X[0, :]
    upload(mod, fv, docs)
def test_regression_toy():
    """Check regression on a toy dataset."""
    # Decision trees
    clf = tree.DecisionTreeRegressor()
    clf.fit(X, y)
    assert_almost_equal(clf.predict(T), true_result)

    clf = tree.DecisionTreeRegressor(max_features=1, random_state=1)
    clf.fit(X, y)
    assert_almost_equal(clf.predict(T), true_result)

    # Extra-trees
    clf = tree.ExtraTreeRegressor()
    clf.fit(X, y)
    assert_almost_equal(clf.predict(T), true_result)

    clf = tree.ExtraTreeRegressor(max_features=1, random_state=1)
    clf.fit(X, y)
    assert_almost_equal(clf.predict(T), true_result)
def shotgun_models(x, y):
    kernel = gaussian_process.kernels.DotProduct() + gaussian_process.kernels.WhiteKernel()
    models = [
        gaussian_process.GaussianProcessRegressor(kernel=kernel, random_state=1337).fit(x, y),
        linear_model.LinearRegression(n_jobs=2).fit(x, y),
        tree.DecisionTreeClassifier().fit(x, y),
        tree.DecisionTreeRegressor().fit(x, y),
        tree.ExtraTreeRegressor().fit(x, y),
        naive_bayes.GaussianNB().fit(x, y),
        neural_network.MLPRegressor(hidden_layer_sizes=(10,),
                                    activation='relu',
                                    solver='adam',
                                    alpha=0.001,
                                    batch_size='auto',
                                    learning_rate='constant',
                                    learning_rate_init=0.01,
                                    power_t=0.5,
                                    max_iter=1000,
                                    shuffle=True,
                                    random_state=9,
                                    tol=0.0001,
                                    verbose=False,
                                    warm_start=False,
                                    momentum=0.9,
                                    nesterovs_momentum=True,
                                    early_stopping=False,
                                    validation_fraction=0.1,
                                    beta_1=0.9,
                                    beta_2=0.999,
                                    epsilon=1e-08).fit(x, y),
        linear_model.Lasso(alpha=0.1,
                           copy_X=True,
                           fit_intercept=True,
                           max_iter=1000,
                           normalize=False,
                           positive=False,
                           precompute=False,
                           random_state=None,
                           selection='cyclic',
                           tol=0.0001,
                           warm_start=False).fit(x, y),
        linear_model.ElasticNet().fit(x, y),
        linear_model.SGDRegressor().fit(x, y),
        linear_model.Ridge().fit(x, y),
        linear_model.PassiveAggressiveRegressor().fit(x, y)
    ]
    return models
def default_models_(self):
    return {
        'Tree': {'clf': tree.DecisionTreeRegressor(),
                 'param': {'max_depth': [3, 5, 7, 10, 20]}},
        'GBDT': {'clf': ensemble.GradientBoostingRegressor(random_state=1),
                 'param': {'n_estimators': [50, 100, 150, 200],
                           'learning_rate': [0.1],
                           'max_depth': [4, 6, 8],
                           'alpha': [0.7, 0.8, 0.9],
                           'max_leaf_nodes': [10, 20],
                           'min_samples_split': [2, 4, 7]}},
        'Lin': {'clf': linear_model.LinearRegression(),
                'param': {'fit_intercept': [True, False],
                          'normalize': [True, False]}},
        'Ridge': {'clf': linear_model.Ridge(), 'param': {}},
        'Lasso': {'clf': linear_model.Lasso(), 'param': {}},
        'ElasN': {'clf': linear_model.ElasticNet(), 'param': {}},
        'Lars': {'clf': linear_model.Lars(), 'param': {}},
        'Bayers': {'clf': linear_model.BayesianRidge(), 'param': {}},
        'Poly2': {'clf': Pipeline([('poly', PolynomialFeatures(degree=2)),
                                   ('std_scaler', StandardScaler()),
                                   ('line_reg', linear_model.LinearRegression())]),
                  'param': {}},
        'SGD': {'clf': linear_model.SGDRegressor(), 'param': {}},
        'SVM': {'clf': svm.SVR(kernel='rbf', C=1.0, epsilon=1),
                'param': {'C': [1, 10, 100, 1000, 10000]}},
        'Knn': {'clf': neighbors.KNeighborsRegressor(), 'param': {}},
        'RF': {'clf': ensemble.RandomForestRegressor(random_state=1),
               'param': {'n_estimators': [10, 30, 50, 100, 150]}},
        'ADA': {'clf': ensemble.AdaBoostRegressor(n_estimators=100), 'param': {}},
        'BAG': {'clf': BaggingRegressor(bootstrap=True),
                'param': {'n_estimators': [50, 100, 200]}},
        'ET': {'clf': tree.ExtraTreeRegressor(), 'param': {}},
    }
def get_list_of_basic_models():
    print(f"\nCreate list of basic models we will pass through...")
    #! BE CAREFUL! May take more time than expected or freeze the process
    # @ Returns NaN in our case
    basic_models = [
        DummyRegressor(),
        # ^ ----------------------------------------- Classical linear regressors
        # linear_model.LinearRegression(),
        # linear_model.Ridge(alpha=0.5, random_state=rnd_state),
        # linear_model.SGDRegressor(random_state=rnd_state),
        # ^ ---------------------------------- Regressors with variable selection
        # linear_model.Lasso(alpha=0.1, random_state=rnd_state),
        # linear_model.ElasticNet(random_state=rnd_state),
        # @ linear_model.LassoLars(alpha=0.1, random_state=rnd_state),
        # ^ ------------------------------------------------- Bayesian regressors
        # @ linear_model.BayesianRidge(),
        # @ linear_model.ARDRegression(),
        # ^ ------------------------------------------- Outlier-robust regressors
        # @ linear_model.HuberRegressor(),
        # linear_model.RANSACRegressor(random_state=rnd_state),
        # ^ ----------------------- Generalized linear models (GLM) for regression
        # linear_model.TweedieRegressor(power=0, alpha=0.5, link="auto"),
        # linear_model.PoissonRegressor(),
        # linear_model.GammaRegressor(),
        # ^ ------------------------------------------------------- Miscellaneous
        # linear_model.PassiveAggressiveRegressor(random_state=rnd_state),
        # @ KernelRidge(),
        ## --------------------------------------------- Support Vector Machines
        # svm.LinearSVR(random_state=rnd_state),
        #! svm.NuSVR(),  #! CAN FREEZE
        #! svm.SVR(),  #! CAN FREEZE
        # ^ ------------------------------------------------------ Decision Trees
        tree.DecisionTreeRegressor(random_state=rnd_state),
        tree.ExtraTreeRegressor(random_state=rnd_state),
        # ^ ---------------------------------------------------- Ensemble Methods
        # @ ensemble.HistGradientBoostingRegressor(random_state=rnd_state),
        # ensemble.AdaBoostRegressor(n_estimators=50, random_state=rnd_state),
        # ensemble.BaggingRegressor(n_estimators=50, random_state=rnd_state),
        # ensemble.ExtraTreesRegressor(n_estimators=100, random_state=rnd_state),  #! CAN BE LOOONG
        # ensemble.RandomForestRegressor(n_estimators=100, random_state=rnd_state),  #! CAN BE LOOONG
        # ensemble.GradientBoostingRegressor(n_estimators=100, random_state=rnd_state),
        # xgb.XGBRegressor(n_estimators=1000, random_state=rnd_state),
        # ^ --------------------------------------------------- Nearest Neighbors
        # @ neighbors.KNeighborsRegressor(),
        # ^ ----------------------------------------------- Neural network models
        # neural_network.MLPRegressor(hidden_layer_sizes=100, random_state=rnd_state),
    ]
    return basic_models
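# A minimal evaluation loop over the list above -- a sketch only, not part of the
# original snippet. It uses synthetic data from make_regression and defines
# `rnd_state` locally; the original module is assumed to provide X, y and its own
# imports for DummyRegressor and tree.
from sklearn import tree
from sklearn.dummy import DummyRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score

rnd_state = 1
X_demo, y_demo = make_regression(n_samples=300, n_features=8, noise=5.0, random_state=rnd_state)
for model in get_list_of_basic_models():
    scores = cross_val_score(model, X_demo, y_demo, cv=5, scoring="neg_mean_squared_error")
    print(f"{model.__class__.__name__}: MSE={-scores.mean():.3f}")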
def dec_tree_reg(df, test):
    dt = tree.ExtraTreeRegressor()
    # set target, train and test. train and test must have same number of features
    target = df['count']
    train = df[['time', 'holiday', 'season', 'temp', 'atemp', 'windspeed', 'weather', 'humidity']]
    test = test[['time', 'holiday', 'season', 'temp', 'atemp', 'windspeed', 'weather', 'humidity']]
    dt.fit(train, target)
    predicted_probs = dt.predict(test)
    predicted_probs = pd.Series(predicted_probs)
    predicted_probs = predicted_probs.map(lambda x: int(x))
    keep = pd.read_csv('data/test.csv')
    keep = keep['datetime']
    # save to file
    submit = pd.concat([keep, predicted_probs], axis=1)
    # print(forest.feature_importances_)
    submit.columns = ['datetime', 'count']
    submit.to_csv('data/submissiondtree.csv', index=False)
def train_lotto(num_var):
    lotto_csv = pd.read_csv(csv_filename,
                            names=["year", "month", "day", "midday_evening",
                                   "num_1", "num_2", "num_3"])
    lotto_csv = lotto_csv.dropna()

    X = lotto_csv.drop(["num_1", "num_2", "num_3"], axis=1)
    y = lotto_csv[f"num_{num_var}"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

    # tree_model = tree.DecisionTreeRegressor()
    tree_model = tree.ExtraTreeRegressor()
    tree_model.fit(X_train, y_train)

    with open(model_filename, "wb") as model_file:
        joblib.dump(tree_model, model_file)
    print(f"Done Training num_{num_var}")

    with open(model_filename, "rb") as model_file:
        tree_model = joblib.load(model_filename)
    result = tree_model.score(X_test, y_test)
    print(result)
def dec_tree_reg(df, test):
    dt = tree.ExtraTreeRegressor()
    # set target, train and test. train and test must have same number of features
    target = df['count']
    train = df[['time', 'holiday', 'season', 'temp', 'atemp', 'windspeed', 'weather', 'humidity']]
    test = test[['time', 'holiday', 'season', 'temp', 'atemp', 'windspeed', 'weather', 'humidity']]
    dt.fit(train, target)
    predicted_probs = dt.predict(test)
    predicted_probs = pd.Series(predicted_probs)
    predicted_probs = predicted_probs.map(lambda x: int(x))
    keep = pd.read_csv('data/test.csv')
    keep = keep['datetime']
    # save to file
    submit = pd.concat([keep, predicted_probs], axis=1)
    # print(forest.feature_importances_)
    submit.columns = ['datetime', 'count']
    submit.to_csv('data/submissiondtree.csv', index=False)

    plt.figure()
    # pl.scatter(tr, y, c="k", label="data")
    plt.plot(train['time'], target, c="g", label="max_depth=2", linewidth=2)
    plt.plot(test['time'], predicted_probs, c="r", label="max_depth=5", linewidth=2)
    plt.xlabel("data")
    plt.ylabel("target")
    plt.title("Decision Tree Regression")
    plt.legend()
    plt.show()
def trees(x_train, x_test, y_train, y_test):
    res = []
    print("hello reg trees")

    m = tree.DecisionTreeRegressor()
    m.fit(x_train, y_train)
    print("fitting")
    predictions = m.predict(x_test)
    acc = mean_squared_error(y_test, predictions)
    res.append((acc, "DecisionTreeRegressor"))

    m = tree.ExtraTreeRegressor()
    m.fit(x_train, y_train)
    predictions = m.predict(x_test)
    acc = mean_squared_error(y_test, predictions)
    res.append((acc, "ExtraTreeRegressor"))

    print(res)
    return res
    regression(linear_model.Lars()),
    regression(linear_model.LarsCV()),
    regression(linear_model.Lasso(random_state=RANDOM_SEED)),
    regression(linear_model.LassoCV(random_state=RANDOM_SEED)),
    regression(linear_model.LassoLars()),
    regression(linear_model.LassoLarsCV()),
    regression(linear_model.LassoLarsIC()),
    regression(linear_model.LinearRegression()),
    regression(linear_model.OrthogonalMatchingPursuit()),
    regression(linear_model.OrthogonalMatchingPursuitCV()),
    regression(
        linear_model.PassiveAggressiveRegressor(random_state=RANDOM_SEED)),
    regression(linear_model.PoissonRegressor()),
    regression(
        linear_model.RANSACRegressor(
            base_estimator=tree.ExtraTreeRegressor(**TREE_PARAMS),
            random_state=RANDOM_SEED)),
    regression(linear_model.Ridge(random_state=RANDOM_SEED)),
    regression(linear_model.RidgeCV()),
    regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)),
    regression(linear_model.TheilSenRegressor(random_state=RANDOM_SEED)),
    regression(linear_model.TweedieRegressor(power=0.0)),
    regression(linear_model.TweedieRegressor(power=1.0)),
    regression(linear_model.TweedieRegressor(power=1.5)),
    regression(linear_model.TweedieRegressor(power=2.0)),
    regression(linear_model.TweedieRegressor(power=3.0)),

    # Statsmodels Linear Regression
    classification_binary(
        utils.StatsmodelsSklearnLikeWrapper(
            sm.GLM,
class ScikitLearnModelConverterTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      (tree.DecisionTreeRegressor(random_state=42),),
      (tree.ExtraTreeRegressor(random_state=42),),
      (ensemble.RandomForestRegressor(random_state=42),),
      (ensemble.ExtraTreesRegressor(random_state=42),),
      (ensemble.GradientBoostingRegressor(random_state=42),),
      (ensemble.GradientBoostingRegressor(random_state=42, init="zero"),),
      (ensemble.GradientBoostingRegressor(
          random_state=42,
          init=tree.DecisionTreeRegressor(random_state=42),
      ),),
  )
  def test_convert_reproduces_regression_model(self, sklearn_tree):
    features, labels = datasets.make_regression(
        n_samples=100,
        n_features=10,
        random_state=42,
    )
    sklearn_tree.fit(features, labels)
    tf_features = tf.constant(features, dtype=tf.float32)

    with self.subTest(msg="inference_is_reproduced_before_save"):
      tf_tree = scikit_learn_model_converter.convert(sklearn_tree)
      tf_labels = tf_tree(tf_features).numpy().ravel()
      sklearn_labels = sklearn_tree.predict(features).astype(np.float32)
      self.assertAllClose(sklearn_labels, tf_labels, rtol=1e-5)

    with self.subTest(msg="inference_is_reproduced_after_save"):
      path = pathlib.Path(self.get_temp_dir())
      tf_tree = scikit_learn_model_converter.convert(
          sklearn_tree,
          intermediate_write_path=path / "intermediate_path",
      )
      tf.saved_model.save(obj=tf_tree, export_dir=path)
      loaded_tf_tree = tf.saved_model.load(path)
      self.assertAllEqual(tf_tree(tf_features), loaded_tf_tree(tf_features))

  @parameterized.parameters(
      (tree.DecisionTreeClassifier(random_state=42),),
      (tree.ExtraTreeClassifier(random_state=42),),
      (ensemble.RandomForestClassifier(random_state=42),),
      (ensemble.ExtraTreesClassifier(random_state=42),),
  )
  def test_convert_reproduces_classification_model(self, sklearn_tree):
    features, labels = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_classes=4,
        n_clusters_per_class=1,
        random_state=42,
    )
    sklearn_tree.fit(features, labels)
    tf_features = tf.constant(features, dtype=tf.float32)

    with self.subTest(msg="inference_is_reproduced_before_save"):
      tf_tree = scikit_learn_model_converter.convert(sklearn_tree)
      tf_labels = tf_tree(tf_features).numpy()
      sklearn_labels = sklearn_tree.predict_proba(features).astype(np.float32)
      self.assertAllClose(sklearn_labels, tf_labels, rtol=1e-5)

    with self.subTest(msg="inference_is_reproduced_after_save"):
      path = pathlib.Path(self.get_temp_dir())
      tf_tree = scikit_learn_model_converter.convert(
          sklearn_tree,
          intermediate_write_path=path / "intermediate_path",
      )
      tf.saved_model.save(obj=tf_tree, export_dir=path)
      loaded_tf_tree = tf.saved_model.load(path)
      self.assertAllEqual(tf_tree(tf_features), loaded_tf_tree(tf_features))

  def test_convert_raises_when_unrecognised_model_provided(self):
    features, labels = datasets.make_regression(
        n_samples=100,
        n_features=10,
        random_state=42,
    )
    sklearn_model = linear_model.LinearRegression().fit(features, labels)
    with self.assertRaises(NotImplementedError):
      scikit_learn_model_converter.convert(sklearn_model)

  def test_convert_raises_when_sklearn_model_is_not_fit(self):
    with self.assertRaises(
        ValueError,
        msg="Scikit-learn model must be fit to data before converting to TF.",
    ):
      _ = scikit_learn_model_converter.convert(tree.DecisionTreeRegressor())

  def test_convert_raises_when_regression_target_is_multivariate(self):
    features, labels = datasets.make_regression(
        n_samples=100,
        n_features=10,
        # This produces a two-dimensional target variable.
        n_targets=2,
        random_state=42,
    )
    sklearn_tree = tree.DecisionTreeRegressor().fit(features, labels)
    with self.assertRaisesRegex(
        ValueError,
        "Only scalar regression and single-label classification are supported.",
    ):
      _ = scikit_learn_model_converter.convert(sklearn_tree)

  def test_convert_raises_when_classification_target_is_multilabel(self):
    features, labels = datasets.make_multilabel_classification(
        n_samples=100,
        n_features=10,
        # This assigns two class labels per example.
        n_labels=2,
        random_state=42,
    )
    sklearn_tree = tree.DecisionTreeClassifier().fit(features, labels)
    with self.assertRaisesRegex(
        ValueError,
        "Only scalar regression and single-label classification are supported.",
    ):
      _ = scikit_learn_model_converter.convert(sklearn_tree)

  def test_convert_uses_intermediate_model_path_if_provided(self):
    features, labels = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_classes=4,
        n_clusters_per_class=1,
        random_state=42,
    )
    sklearn_tree = tree.DecisionTreeClassifier().fit(features, labels)
    write_path = self.create_tempdir()
    _ = scikit_learn_model_converter.convert(
        sklearn_tree,
        intermediate_write_path=write_path,
    )
    # We should be able to load the intermediate TFDF model from the given path.
    tfdf_tree = tf.keras.models.load_model(write_path)
    self.assertIsInstance(tfdf_tree, tf.keras.Model)

  def test_convert_sklearn_tree_to_tfdf_pytree_raises_if_weight_provided_for_classification_tree(
      self):
    features, labels = datasets.make_classification(random_state=42)
    sklearn_tree = tree.DecisionTreeClassifier(random_state=42).fit(features, labels)
    with self.assertRaisesRegex(
        ValueError,
        "weight should not be passed for classification trees.",
    ):
      _ = scikit_learn_model_converter.convert_sklearn_tree_to_tfdf_pytree(
          sklearn_tree,
          weight=0.5,
      )

  def test_convert_raises_when_gbt_initial_estimator_is_not_tree_or_constant(self):
    features, labels = datasets.make_regression(
        n_samples=100,
        n_features=10,
        random_state=42,
    )
    init_estimator = linear_model.LinearRegression()
    sklearn_model = ensemble.GradientBoostingRegressor(init=init_estimator)
    sklearn_model.fit(features, labels)
    with self.assertRaises(ValueError):
      _ = scikit_learn_model_converter.convert(sklearn_model)
#%% Import models' libraries
from sklearn import tree  # Canonical decision tree & extremely randomized tree
from sklearn import ensemble  # RF, Gradient Boosting, AdaBoost
from skopt.space import Real, Categorical, Integer
import xgboost as xgb

#%% Toggles to go through
random_state = 42

#%% base_estimator = tree.ExtraTreeRegressor
base_xt_reg = tree.ExtraTreeRegressor(
    criterion="mse",  # {"mse", "friedman_mse", "mae"}, default="mse"
    splitter="random",  # {"random", "best"}, default="random"
    max_depth=None,  # int, default=None
    min_samples_split=2,  # int or float, default=2
    min_samples_leaf=1,  # int or float, default=1
    min_weight_fraction_leaf=0.0,  # float, default=0.0
    max_features=None,  # int, float or {"auto", "sqrt", "log2"}, default=None
    random_state=random_state,
)

#%% base_estimator = tree.DecisionTreeRegressor
base_dt_reg = tree.DecisionTreeRegressor(
    criterion="mse",  # {"mse", "friedman_mse", "mae"}, default="mse"
    splitter="best",  # {"random", "best"} default="best"
    max_depth=None,  # int, default=None
    min_samples_split=2,  # int or float, default=2
    min_samples_leaf=1,  # int or float, default=1
    min_weight_fraction_leaf=0.0,  # float, default=0.0
    max_features=None,  # int, float or {"auto", "sqrt", "log2"}, default=None
    random_state=random_state,
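#%% A sketch (not part of the original source) of how base_xt_reg and the skopt
#   space classes imported above might be combined: wrap the extra tree in a
#   bagging ensemble and tune it with skopt's BayesSearchCV on synthetic data.
#   Assumes an older scikit-learn where BaggingRegressor takes `base_estimator`
#   and where criterion="mse" is still accepted.
from skopt import BayesSearchCV
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=10, random_state=random_state)
bag_search = BayesSearchCV(
    ensemble.BaggingRegressor(base_estimator=base_xt_reg, random_state=random_state),
    {
        "n_estimators": Integer(10, 100),
        "max_samples": Real(0.5, 1.0),
        "bootstrap": Categorical([True, False]),
    },
    n_iter=10,
    cv=3,
    random_state=random_state,
)
bag_search.fit(X_demo, y_demo)
print(bag_search.best_params_)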
def run(perc):
    base_string = 'D:\\MriData\\Data'
    excel_path = 'D:\\oasis_cross-sectional.csv'
    test = 13
    dataprovider = CrossSectionalData.CrossSectionalDataProvider(base_string, excel_path)
    a = dataprovider.get_data_with_CDR()
    step = 5
    step_1 = 25
    step_2 = 25
    training_stop = int(len(a) * perc)
    allfeatures = []
    ally = []
    cut = 55
    randomint = random.Random(7)

    for xx in [randomint.randint(0, len(a) - 1) for r in range(training_stop)]:
        x = a[xx]
        cdr = dataprovider.get_CDR(x)
        ll = dataprovider.retrieve_full_data(x)
        if cdr is None or cdr > 1:
            continue
        feat = AlzheimerFeatures.surrounding_points_discrete_with_pos(
            ll, step, step_1, [dataprovider.get_gender(x)])
        allfeatures += feat
        ally = np.append(ally, np.repeat(cdr, len(feat)))

    AlzheimerFeatures.shuffle_in_unison_scary(allfeatures, ally)
    regressor = sk.ExtraTreeRegressor(random_state=0)
    regressor.fit(allfeatures, ally)

    allfeatures1 = []
    ally1 = []
    indices = []
    for xx in [randomint.randint(0, len(a) - 1) for r in range(training_stop) if not r == test]:
        x = a[xx]
        indices.append(xx)
        cdr = dataprovider.get_CDR(x)
        ll = dataprovider.retrieve_full_data(x)
        if cdr is None or cdr > 1:
            continue
        feat = AlzheimerFeatures.surrounding_points_discrete_with_pos(
            ll, step, step_2, [dataprovider.get_gender(x)])
        allfeatures1.append(regressor.predict(feat)[0:cut])
        ally1.append(cdr)

    rbf_svc = neighbors.KNeighborsClassifier(n_neighbors=7)
    rbf_svc.fit(allfeatures1, ally1)

    errorb = 0
    error = 0
    index = 0
    for xx in range(len(a)):
        x = a[xx]
        cdr = dataprovider.get_CDR(x)
        if cdr is None or cdr > 1 or xx in indices:
            continue
        ll = dataprovider.retrieve_full_data(x)
        feat = AlzheimerFeatures.surrounding_points_discrete_with_pos(
            ll, step, step_2, [dataprovider.get_gender(x)])
        predictq = regressor.predict(feat)[:cut]
        suma = rbf_svc.predict(predictq)
        if not (suma > 0 and cdr > 0) or suma == cdr:
            errorb += 1
        error += np.abs(suma - cdr)
        index += 1

    ter = 1 - (error / index)
    terb = 1 - (errorb / index)
    print(str(ter) + " , " + str(terb))
    return ter, terb
print(f"X_fit: {X_fit.shape}, {type(X_fit)}\ \nX_train: {X_train.shape}, {type(X_train)}\ \nX_val: {X_val.shape}, {type(X_val)}\n") print(f"y_fit: {y_fit.shape}, {type(y_fit)}\ \ny_train: {y_train.shape}, {type(y_train)}\ \ny_val: {y_val.shape}, {type(y_val)}\n") #%% Define model parameters for starting tuning model_params = { "base_estimator": tree.ExtraTreeRegressor( criterion="mse", # {"mse", "friedman_mse", ""mae"} default="mse" splitter="random", # {"random", "best"} default="random" max_depth=None, # default=None min_samples_split=2, # default=2 min_samples_leaf=1, # default=1 random_state=random_state, ), "n_estimators": args.n_estimators, "max_samples": args.max_samples, "max_features": args.max_features, "bootstrap": args.bootstrap, "bootstrap_features": args.bootstrap_features, "oob_score": False,
model = linear_model.RidgeCV(alphas=alphas)
model = linear_model.LassoLarsCV()
model = linear_model.LassoLars()
model = linear_model.ElasticNetCV(l1_ratio=0.8, alphas=alphas)
model = linear_model.BayesianRidge()
model = linear_model.Perceptron()

from sklearn import svm
model = svm.SVR(kernel='linear')
model = svm.SVR(kernel='poly')
model = svm.SVR(kernel='rbf')

from sklearn import tree
model = tree.DecisionTreeRegressor()
model = tree.ExtraTreeRegressor()

from sklearn import ensemble
model = ensemble.RandomForestRegressor(n_estimators=100,
                                       max_depth=None,
                                       min_samples_split=1,
                                       random_state=0)
model = ensemble.ExtraTreesRegressor(n_estimators=20,
                                     max_depth=None,
                                     min_samples_split=1,
                                     random_state=0)
model = ensemble.AdaBoostRegressor(n_estimators=100)
model = ensemble.GradientBoostingRegressor(n_estimators=100,
logger = logging.getLogger("sedesol.pipeline")

###############################################################################
# Constants, specifying possible models and metrics
###############################################################################
MODELS_MAPPING = {
    "elnet": lm.ElasticNet(),
    "sgd_class": lm.SGDClassifier(),
    "sgd_reg": lm.SGDRegressor(),
    "ridge": lm.Ridge(),
    "gp": GaussianProcess(),
    "tree_reg": tree.DecisionTreeRegressor(),
    "tree_class": tree.DecisionTreeClassifier(),
    "extra_class": ensemble.ExtraTreesClassifier(),
    "extra_reg": tree.ExtraTreeRegressor(),
    "nn_class": KNeighborsClassifier(),
    "rf_reg": ensemble.RandomForestRegressor(),
    "rf_class": ensemble.RandomForestClassifier(),
    "svc": svm.SVC(),
    "linear_svc": svm.LinearSVC(),
    "logistic_reg": lm.LogisticRegression(),
    "multitask_lasso": lm.MultiTaskLasso(),
    "linear_reg": lm.LinearRegression()
}

MULTITASK_MODELS = ["multitask_lasso"]

###############################################################################
# Helper functions
###############################################################################
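# A small illustrative helper (not part of the original pipeline) showing how a
# model might be pulled out of MODELS_MAPPING by name; the `params` argument and
# the use of sklearn.base.clone are assumptions, not sedesol's actual API.
from sklearn.base import clone


def get_model(name, params=None):
    """Return a fresh, unfitted copy of a registered model, optionally reconfigured."""
    model = clone(MODELS_MAPPING[name])
    if params:
        model.set_params(**params)
    return model

# Example: get_model("extra_reg", {"max_depth": 5}) -> ExtraTreeRegressor(max_depth=5)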
    x = a[xx]
    cdr = dataprovider.get_CDR(x)
    print(cdr)
    ll = dataprovider.retrieve_full_data(x)
    # AlzheimerFeatures.view_histogram(ll)
    # CrossSectionalData.show_slices([ll[:, :, 50]])
    if cdr is None or cdr > 1:
        continue
    feat = AlzheimerFeatures.surrounding_points_discrete_with_pos(
        ll, step, step_1, [dataprovider.get_gender(x)])
    allfeatures += feat
    ally = np.append(ally, np.repeat(cdr, len(feat)))

AlzheimerFeatures.shuffle_in_unison_scary(allfeatures, ally)
regressor = sk.ExtraTreeRegressor(random_state=0)
regressor.fit(allfeatures, ally)

net = GaussianNB()
net.fit(np.array(allfeatures), np.array(ally))


def f(x):
    if x == 0.5:
        return 0
    if x == 0:
        return -1
    return 1


ttt = AlzheimerFeatures.target_brain_regions_2d_z(
    dataprovider.retrieve_full_data(test), step, [dataprovider.get_gender(test)],
    lambda x: f(net.predict(x)), 50)
def model_comparison():
    data, target = load_train()
    pipeline = create_pipeline()
    data = pipeline.fit_transform(data)

    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostRegressor(),
        ensemble.BaggingRegressor(),
        ensemble.ExtraTreesRegressor(),
        ensemble.GradientBoostingRegressor(),
        ensemble.RandomForestRegressor(),

        # Gaussian Processes
        gaussian_process.GaussianProcessRegressor(),

        # GLM
        linear_model.PassiveAggressiveRegressor(),
        linear_model.Ridge(),
        linear_model.Lasso(),
        linear_model.ElasticNet(),
        linear_model.SGDRegressor(),

        # Nearest Neighbor
        neighbors.KNeighborsRegressor(),

        # SVM
        svm.SVR(),
        svm.NuSVR(),
        svm.LinearSVR(),

        # Trees
        tree.DecisionTreeRegressor(),
        tree.ExtraTreeRegressor(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBRegressor(),
        lgb.LGBMRegressor()
    ]

    # split dataset in cross-validation with this splitter class:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
    # note: this is an alternative to train_test_split
    # run model 10x with 60/30 split, intentionally leaving out 10%
    cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.3, train_size=.6, random_state=0)

    # create table to compare MLA metrics
    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean']
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    # index through MLA and save performance to table
    row_index = 0
    for alg in MLA:
        # set name and parameters
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

        # score model with cross validation:
        # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
        rmse_scorer = make_scorer(rmse)
        cv_results = model_selection.cross_validate(alg, data, target, cv=cv_split, scoring=rmse_scorer)

        MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
        MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
        MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
        # if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean
        # should statistically capture 99.7% of the subsets
        MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std() * 3  # let's know the worst that can happen!

        row_index += 1

    # print and sort table: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
    MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], inplace=True)
    MLA_compare.to_csv('mla_comparison.csv', index=True)
    print(MLA_compare)
from sklearn.model_selection import cross_val_score

#%% Decision Tree
model = tree.DecisionTreeRegressor()
scores = cross_val_score(model, X, Y, cv=10)
print("*R2:")
print(scores.mean())
print("*Standard deviation:")
print(scores.std())

#%% Decision Tree - MSE criterion
model = tree.DecisionTreeRegressor(criterion="mse")
scores = cross_val_score(model, X, Y, cv=10)
print("\nmse - R2:")
print(scores.mean())
print("mse - Standard deviation:")
print(scores.std())

#%% Extra Tree
model = tree.ExtraTreeRegressor(criterion="mse")
scores = cross_val_score(model, X, Y, cv=10)
print("\nExtra/mse - R2:")
print(scores.mean())
print("Extra/mse - Standard deviation:")
print(scores.std())
from sklearn import tree

clf = tree.ExtraTreeRegressor()
# __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
#            "ExtraTreeClassifier", "ExtraTreeRegressor", "export_graphviz"]

# [height, weight, shoe_size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37], [171, 75, 42],
     [181, 85, 43]]

# Y = ['male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
#      'female', 'male', 'male']
Y = [1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]

clf = clf.fit(X, Y)

prediction = clf.predict([[190, 70, 42]])
print(prediction[0])
#models.append( {"name": "1.9.1. GaussianNB", \
#                "model": naive_bayes.GaussianNB()} )  # doesn't work for this dataset?
#models.append( {"name": "1.9.2. MultinomialNB", \
#                "model": naive_bayes.MultinomialNB()} )  # doesn't work for this dataset?
#models.append( {"name": "1.9.3. BernoulliNB", \
#                "model": naive_bayes.BernoulliNB()} )

## 1.10. Decision Trees
models.append( {"name": "1.10. DecisionTreeRegressor", \
                "model": tree.DecisionTreeRegressor(random_state=0)} )
models.append( {"name": "1.10. ExtraTreeRegressor", \
                "model": tree.ExtraTreeRegressor(random_state=0)} )

## 1.11. Ensemble methods
# averaging methods
models.append( {"name": "1.11.1. Bagging meta-estimator", \
                "model": ensemble.BaggingRegressor(neighbors.KNeighborsRegressor())} )
models.append( {"name": "1.11.2.1. Random Forests", \
                "model": ensemble.RandomForestRegressor()} )
models.append( {"name": "1.11.2.2. Extremely Randomized Trees", \
                "model": ensemble.ExtraTreesRegressor()} )
models.append( {"name": "1.11.3. AdaBoost", \
                "model": ensemble.AdaBoostRegressor()} )
models.append( {"name": "1.11.4. Gradient Tree Boosting", \
                "model": ensemble.GradientBoostingRegressor()} )

## 1.12. Multiclass and multilabel algorithms
    classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
    classification(linear_model.RidgeClassifierCV()),
    classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)),
    classification_binary(
        linear_model.LogisticRegression(random_state=RANDOM_SEED)),
    classification_binary(
        linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
    classification_binary(
        linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
    classification_binary(linear_model.RidgeClassifierCV()),
    classification_binary(
        linear_model.SGDClassifier(random_state=RANDOM_SEED)),

    # Decision trees
    regression(tree.DecisionTreeRegressor(**TREE_PARAMS)),
    regression(tree.ExtraTreeRegressor(**TREE_PARAMS)),
    classification(tree.DecisionTreeClassifier(**TREE_PARAMS)),
    classification(tree.ExtraTreeClassifier(**TREE_PARAMS)),
    classification_binary(tree.DecisionTreeClassifier(**TREE_PARAMS)),
    classification_binary(tree.ExtraTreeClassifier(**TREE_PARAMS)),

    # Random forest
    regression(ensemble.RandomForestRegressor(**FOREST_PARAMS)),
    regression(ensemble.ExtraTreesRegressor(**FOREST_PARAMS)),
    classification(ensemble.RandomForestClassifier(**FOREST_PARAMS)),
    classification(ensemble.ExtraTreesClassifier(**FOREST_PARAMS)),
    classification_binary(ensemble.RandomForestClassifier(**FOREST_PARAMS)),
    classification_binary(ensemble.ExtraTreesClassifier(**FOREST_PARAMS)),
],
classifier = tree.DecisionTreeClassifier()
classifier.fit(X=X_train, y=y_train)
predicted = classifier.predict(X_test)
DecisionTreeClassifier_accuracy.append(accuracy_score(y_test, predicted))

classifier = tree.DecisionTreeRegressor()
classifier.fit(X=X_train, y=y_train)
predicted = classifier.predict(X_test)
DecisionTreeRegressor_accuracy.append(accuracy_score(y_test, predicted))

classifier = tree.ExtraTreeClassifier()
classifier.fit(X=X_train, y=y_train)
predicted = classifier.predict(X_test)
ExtraTreeClassifier_accuracy.append(accuracy_score(y_test, predicted))

classifier = tree.ExtraTreeRegressor()
classifier.fit(X=X_train, y=y_train)
predicted = classifier.predict(X_test)
ExtraTreeRegressor_accuracy.append(accuracy_score(y_test, predicted))
'''

percentages = np.arange(0.05, 0.95, 0.05)
BernoulliNB_accuracy = []
#CategoricalNB_accuracy = []
ComplementNB_accuracy = []
GaussianNB_accuracy = []
MultinomialNB_accuracy = []
DecisionTreeClassifier_accuracy = []
DecisionTreeRegressor_accuracy = []
ExtraTreeClassifier_accuracy = []
ExtraTreeRegressor_accuracy = []
print("KNN:%f" % mse) model = ensemble.RandomForestRegressor(n_estimators=20, random_state=1) predict_y = model.fit(train_X, train_gpa_y).predict(test_X) mse = mean_squared_error(test_gpa_y, predict_y) print("随机森林:%f" % mse) model = ensemble.GradientBoostingRegressor(n_estimators=100, random_state=1) predict_y = model.fit(train_X, train_gpa_y).predict(test_X) mse = mean_squared_error(test_gpa_y, predict_y) print("GBRT:%f" % mse) model = ensemble.BaggingRegressor(random_state=1) predict_y = model.fit(train_X, train_gpa_y).predict(test_X) mse = mean_squared_error(test_gpa_y, predict_y) print("Bagging:%f" % mse) model = tree.ExtraTreeRegressor(random_state=1) predict_y = model.fit(train_X, train_gpa_y).predict(test_X) mse = mean_squared_error(test_gpa_y, predict_y) print("ExtraTree:%f" % mse) model = ensemble.AdaBoostRegressor(n_estimators=50, random_state=random_state) predict_y = model.fit(train_X, train_gpa_y).predict(test_X) mse = mean_squared_error(test_gpa_y, predict_y) print("Adaboost:%f" % mse) model = svm.SVR(C=10) predict_y = model.fit(train_X, train_gpa_y).predict(test_X) mse = mean_squared_error(test_gpa_y, predict_y) print("SVC:%f" % mse)
    # 'pls': cross_decomposition.PLSRegression(),  # raises an error
    'gradient boosting': ensemble.GradientBoostingRegressor(),
    # 'gaussian': gaussian_process.GaussianProcessRegressor(),  # raises an error
    # 'isotonic': isotonic.IsotonicRegression(),  # raises an error
    'kernelridge': kernel_ridge.KernelRidge(),
    'ARD': linear_model.ARDRegression(),
    'bayesianridge': linear_model.BayesianRidge(),
    # 'elasticnet': linear_model.ElasticNet(),  # raises an error
    'HuberRegressor': linear_model.HuberRegressor(),
    'LinearRegression': linear_model.LinearRegression(),
    # 'logistic': linear_model.LogisticRegression(),  # raises an error
    # 'linear_model.RidgeClassifier': linear_model.RidgeClassifier(),  # raises an error
    'k-neighbor': neighbors.KNeighborsRegressor(),
    'SVR': svm.LinearSVR(),
    'NUSVR': svm.NuSVR(),
    'extra tree': tree.ExtraTreeRegressor(),
    'decesion tree': tree.DecisionTreeRegressor(),
    # 'random losgistic': linear_model.RandomizedLogisticRegression(),  # raises an error
    # 'dummy': dummy.DummyRegressor()  # raises an error
}

# Regression analysis
cv = StratifiedKFold(n_splits=5)
i = 0
X = train_data
y = probs
z = labels[:, 5]

from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier()

for name, rgs in Regressors.items():
    regressor = rgs
    ('bag', BaggingRegressor()),
    ('etr', ExtraTreesRegressor()),
    ('gbr', GradientBoostingRegressor()),
    ('xgbr', xgb.XGBRegressor(max_depth=3)),  # xgb.XGBRegressor()),
    # ('rfr', RandomForestRegressor(n_estimators=50)),

    # Nearest Neighbor
    ('knr', neighbors.KNeighborsRegressor(n_neighbors=3)),

    # SVM
    ('svr', svm.SVR(kernel='rbf', gamma=0.1)),
    ('lsvr', svm.LinearSVR()),

    # Trees
    ('dtr', tree.DecisionTreeRegressor()),
    ('etr2', tree.ExtraTreeRegressor()),
]

ESTS_PARAM_GRID = {
    'lasso': [{
        'alpha': [0.0005],
        'random_state': [1]
    }],  # first model is used for meta model in StackingAveragedModels
    'xgbr': [{
        'colsample_bytree': [0.4603],
        'gamma': [0.0468],
        'learning_rate': [0.05],
        'max_depth': [3],
        'min_child_weight': [1.7817],
        'n_estimators': [2200],
        'reg_alpha': [0.4640],
        'reg_lambda': [0.8571],
from sklearn import neighbors
from sklearn import ensemble
import xgboost as xgb  # Xgboost Regressor

model_DecisionTreeRegressor = tree.DecisionTreeRegressor()  # Decision Tree Regressor
model_SVR = svm.SVR(gamma='auto')  # SVM Regressor
model_KNeighborsRegressor = neighbors.KNeighborsRegressor()  # K Neighbors Regressor
model_RandomForestRegressor = ensemble.RandomForestRegressor(n_estimators=20)  # Random Forest Regressor
model_AdaBoostRegressor = ensemble.AdaBoostRegressor(n_estimators=50)  # Adaboost Regressor
model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(n_estimators=100)  # Gradient Boosting Random Forest Regressor
model_BaggingRegressor = ensemble.BaggingRegressor()  # Bagging Regressor
model_ExtraTreeRegressor = tree.ExtraTreeRegressor()  # ExtraTree Regressor


def linear_model(X_train, y_train):
    regr = LinearRegression()
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_train)
    y_test = y_train
    print("linear score on training set: ", mean_absolute_error(y_test, y_pred))
    # plt.figure(figsize=(14, 4))
    # plt.scatter(X_train, y_train, color='g')
    # plt.plot(X_train, y_pred, color='r')
    # plt.xlabel('time(0-24)')
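# A generic sketch (not from the original source) of how the model_* catalogue
# above might be scored in one place. X_train / y_train are assumed to exist in
# the calling scope and mean_absolute_error is assumed to be imported, as it
# already is for the linear_model() function above.
def score_catalogue(X_train, y_train):
    catalogue = {
        "DecisionTree": model_DecisionTreeRegressor,
        "SVR": model_SVR,
        "KNeighbors": model_KNeighborsRegressor,
        "RandomForest": model_RandomForestRegressor,
        "AdaBoost": model_AdaBoostRegressor,
        "GradientBoosting": model_GradientBoostingRegressor,
        "Bagging": model_BaggingRegressor,
        "ExtraTree": model_ExtraTreeRegressor,
    }
    for name, model in catalogue.items():
        model.fit(X_train, y_train)
        print(name, "MAE on training set:", mean_absolute_error(y_train, model.predict(X_train)))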
def generate_prediction(cls, race):
    """Generate a prediction for the specified race"""
    prediction = {
        'race_id': race['_id'],
        'earliest_date': cls.get_earliest_date(),
        'prediction_version': cls.PREDICTION_VERSION,
        'seed_version': Seed.SEED_VERSION,
        'results': None,
        'score': None,
        'train_seeds': None,
        'test_seeds': None,
        'estimator': None
    }

    predictor = None
    generate_predictor = False
    segment = tuple(race['entry_conditions']) + tuple([race['track_condition']])

    with cls.predictor_cache_lock:
        if segment in cls.predictor_cache:
            predictor = cls.predictor_cache[segment]
        else:
            cls.predictor_cache[segment] = None
            generate_predictor = True

    if generate_predictor:
        similar_races = pyracing.Race.find({
            'entry_conditions': race['entry_conditions'],
            'track_condition': race['track_condition'],
            'start_time': {'$lt': race.meet['date']}
        })
        if len(similar_races) >= (1 / cls.TEST_SIZE):
            try:
                train_races, test_races = cross_validation.train_test_split(
                    similar_races, test_size=cls.TEST_SIZE)

                train_X = []
                train_y = []
                for train_race in train_races:
                    for seed in train_race.seeds:
                        if seed['result'] is not None:
                            train_X.append(seed.normalized_data)
                            train_y.append(seed['result'])

                test_X = []
                test_y = []
                for test_race in test_races:
                    for seed in test_race.seeds:
                        if seed['result'] is not None:
                            test_X.append(seed.normalized_data)
                            test_y.append(seed['result'])

                predictor = {
                    'classifier': None,
                    'score': None,
                    'train_seeds': len(train_y),
                    'test_seeds': len(test_y),
                    'estimator': None
                }

                dual = len(train_X) < len(train_X[0])
                kernel = 'linear'
                loss = 'epsilon_insensitive'
                if not dual:
                    loss = 'squared_epsilon_insensitive'

                for estimator in (
                        linear_model.BayesianRidge(),
                        linear_model.ElasticNet(),
                        linear_model.LinearRegression(),
                        linear_model.LogisticRegression(),
                        linear_model.OrthogonalMatchingPursuit(),
                        linear_model.PassiveAggressiveRegressor(),
                        linear_model.Perceptron(),
                        linear_model.Ridge(),
                        linear_model.SGDRegressor(),
                        svm.SVR(kernel=kernel),
                        svm.LinearSVR(dual=dual, loss=loss),
                        svm.NuSVR(kernel=kernel),
                        tree.DecisionTreeRegressor(),
                        tree.ExtraTreeRegressor()):
                    logging.debug('Trying {estimator} for {segment}'.format(
                        estimator=estimator.__class__.__name__, segment=segment))
                    try:
                        classifier = pipeline.Pipeline([
                            ('feature_selection',
                             feature_selection.SelectFromModel(estimator, 'mean')),
                            ('regression', estimator)
                        ])
                        classifier.fit(train_X, train_y)
                        score = classifier.score(test_X, test_y)
                        if predictor['classifier'] is None or predictor['score'] is None or score > predictor['score']:
                            logging.debug('Using {estimator} ({score}) for {segment}'.format(
                                estimator=estimator.__class__.__name__,
                                score=score,
                                segment=segment))
                            predictor['classifier'] = classifier
                            predictor['score'] = score
                            predictor['estimator'] = estimator.__class__.__name__
                    except BaseException as e:
                        logging.debug(
                            'Caught exception while trying {estimator} for {segment}: {exception}'.format(
                                estimator=estimator.__class__.__name__,
                                segment=segment,
                                exception=e))
                        continue

                cls.predictor_cache[segment] = predictor
            except:
                del cls.predictor_cache[segment]
                raise
        else:
            del cls.predictor_cache[segment]
    else:
        while predictor is None:
            try:
                predictor = cls.predictor_cache[segment]
                time.sleep(10)
            except KeyError:
                break

    if predictor is not None:
        reverse = False
        if 'score' in predictor and predictor['score'] is not None:
            reverse = predictor['score'] < 0
            prediction['score'] = abs(predictor['score'])
        if 'classifier' in predictor and predictor['classifier'] is not None:
            raw_results = {}
            for seed in race.seeds:
                raw_result = predictor['classifier'].predict(
                    numpy.array(seed.normalized_data).reshape(1, -1))[0]
                if raw_result is not None:
                    if raw_result not in raw_results:
                        raw_results[raw_result] = []
                    raw_results[raw_result].append(seed.runner['number'])
            for key in sorted(raw_results.keys(), reverse=reverse):
                if prediction['results'] is None:
                    prediction['results'] = []
                prediction['results'].append(
                    sorted([number for number in raw_results[key]]))
        if 'train_seeds' in predictor:
            prediction['train_seeds'] = predictor['train_seeds']
        if 'test_seeds' in predictor:
            prediction['test_seeds'] = predictor['test_seeds']
        if 'estimator' in predictor:
            prediction['estimator'] = predictor['estimator']

    return prediction