def test_multioutput(self):
    """Check that a ModelFrame-built MultiOutputRegressor matches scikit-learn.

    The data set follows the scikit-learn multi-output random forest example:
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py
    """
    from sklearn.multioutput import MultiOutputRegressor
    from sklearn.ensemble import RandomForestRegressor

    # Create a random dataset
    rng = np.random.RandomState(1)
    X = np.sort(200 * rng.rand(600, 1) - 100, axis=0)
    y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
    y += (0.5 - rng.rand(*y.shape))

    df = pdml.ModelFrame(X, target=y)
    max_depth = 30

    # Estimator built through the ModelFrame accessors (the code under test).
    rf1 = df.ensemble.RandomForestRegressor(max_depth=max_depth,
                                            random_state=self.random_state)
    reg1 = df.multioutput.MultiOutputRegressor(rf1)

    # Reference estimator built directly from scikit-learn.
    rf2 = RandomForestRegressor(max_depth=max_depth,
                                random_state=self.random_state)
    reg2 = MultiOutputRegressor(rf2)

    df.fit(reg1)
    reg2.fit(X, y)

    # Predict with the accessor-built estimator; predicting with ``reg2``
    # on both sides would compare the reference model with itself and leave
    # the fitted ``reg1`` unused.
    result = df.predict(reg1)
    expected = pd.DataFrame(reg2.predict(X))
    tm.assert_frame_equal(result, expected)
def test_multi_target_sample_weights_api():
    """Exercise how ``MultiOutputRegressor.fit`` handles a weights argument."""
    features = [[1, 2, 3], [4, 5, 6]]
    targets = [[3.141, 2.718], [2.718, 3.141]]
    weights = [0.8, 0.6]

    # Wrapping Lasso is expected to be rejected with a ValueError whose
    # message mentions the missing sample-weight support.
    lasso_wrapper = MultiOutputRegressor(Lasso())
    assert_raises_regex(ValueError, "does not support sample weights",
                        lasso_wrapper.fit, features, targets, weights)

    # no exception should be raised if the base estimator supports weights
    gbr_wrapper = MultiOutputRegressor(
        GradientBoostingRegressor(random_state=0))
    gbr_wrapper.fit(features, targets, weights)
def test_acquisition_per_second_gradient(acq_func):
    """Run the gradient check of ``acq_func`` at two random query points."""
    rng = np.random.RandomState(0)
    X = rng.randn(20, 10)
    first_column = X[:, 0]
    # Make the second component large, so that mean_grad and std_grad
    # do not become zero.
    y = np.vstack((first_column, np.abs(first_column) ** 3)).T

    # Both query points are drawn before the first estimator is built.
    query_points = [rng.randn(10) for _ in range(2)]
    for X_new in query_points:
        gpr = cook_estimator("GP", Space(((-5.0, 5.0),)), random_state=0)
        mor = MultiOutputRegressor(gpr)
        mor.fit(X, y)
        check_gradient_correctness(X_new, mor, acq_func, 1.5)
def test_multi_target_sparse_regression():
    """Dense and sparse inputs must give the same multi-target predictions."""
    X, y = datasets.make_regression(n_targets=3)
    X_train, y_train = X[:50], y[:50]
    X_test = X[50:]

    sparse_formats = [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix,
                      sp.dok_matrix, sp.lil_matrix]
    for to_sparse in sparse_formats:
        # A fresh pair of identically configured models for every format.
        dense_model = MultiOutputRegressor(Lasso(random_state=0))
        sparse_model = MultiOutputRegressor(Lasso(random_state=0))

        dense_model.fit(X_train, y_train)
        sparse_model.fit(to_sparse(X_train), y_train)

        dense_pred = dense_model.predict(X_test)
        sparse_pred = sparse_model.predict(to_sparse(X_test))
        assert_almost_equal(dense_pred, sparse_pred)
def test_multi_target_sample_weight_partial_fit():
    """Different sample weights must change what ``partial_fit`` learns."""
    features = [[1, 2, 3], [4, 5, 6]]
    targets = [[3.141, 2.718], [2.718, 3.141]]

    # weighted regressor
    uneven_weights = [2., 1.]
    rgr_uneven = MultiOutputRegressor(SGDRegressor(random_state=0))
    rgr_uneven.partial_fit(features, targets, uneven_weights)

    # weighted with different weights
    even_weights = [2., 2.]
    rgr_even = MultiOutputRegressor(SGDRegressor(random_state=0))
    rgr_even.partial_fit(features, targets, even_weights)

    # Only the first output of the first sample is compared.
    even_first = rgr_even.predict(features)[0][0]
    uneven_first = rgr_uneven.predict(features)[0][0]
    assert_not_equal(even_first, uneven_first)
def test_multi_target_sample_weights():
    """A sample weight of 2 must be equivalent to repeating that sample."""
    # weighted regressor
    weighted_X = [[1, 2, 3], [4, 5, 6]]
    weighted_y = [[3.141, 2.718], [2.718, 3.141]]
    weights = [2., 1.]
    weighted_model = MultiOutputRegressor(
        GradientBoostingRegressor(random_state=0))
    weighted_model.fit(weighted_X, weighted_y, weights)

    # unweighted, but with repeated samples
    repeated_X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    repeated_y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
    repeated_model = MultiOutputRegressor(
        GradientBoostingRegressor(random_state=0))
    repeated_model.fit(repeated_X, repeated_y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(repeated_model.predict(X_test),
                        weighted_model.predict(X_test))
def run_one_configuration(
    full_train_covariate_matrix,
    complete_target,
    new_valid_covariate_data_frames,
    new_valid_target_data_frame,
    std_data_frame,
    target_clusters,
    featurizer,
    model_name,
    parameters,
    log_file,
):
    """Fit one model configuration, score it and append the result to a log.

    Args:
        full_train_covariate_matrix: Training covariates; passed through
            ``featurizer`` before fitting.
        complete_target: Training targets; must provide ``to_numpy``.
        new_valid_covariate_data_frames: Validation covariates handed to
            ``location_wise_metric``.
        new_valid_target_data_frame: Validation targets handed to
            ``location_wise_metric``.
        std_data_frame: Handed unchanged to ``location_wise_metric``.
        target_clusters: Stored in the baseline description; its length is
            written to the log.
        featurizer: Callable applied to covariates before fit and predict.
        model_name: ``"multi_task_lasso"`` or ``"xgboost"``.
        parameters: Keyword arguments forwarded to the chosen estimator.
        log_file: Path of the text file the result line is appended to.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    model_baseline = dict()
    model_baseline["type"] = model_name
    model_baseline["target_clusters"] = target_clusters

    if model_name == "multi_task_lasso":
        model = MultiTaskLasso(max_iter=5000, **parameters)
    elif model_name == "xgboost":
        model = MultiOutputRegressor(
            XGBRegressor(n_jobs=10, objective="reg:squarederror",
                         verbosity=0, **parameters))
    else:
        # Previously an unknown name left ``model`` unbound and surfaced as
        # an UnboundLocalError at the ``fit`` call below.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected "
            "'multi_task_lasso' or 'xgboost'.")

    model.fit(featurizer(full_train_covariate_matrix),
              complete_target.to_numpy(copy=True))
    model_baseline["model"] = lambda x: model.predict(featurizer(x))

    skill, _, _, _ = location_wise_metric(
        new_valid_target_data_frame,
        new_valid_covariate_data_frames,
        std_data_frame,
        model_baseline,
        "skill",
    )
    cos_sim, _, _, _ = location_wise_metric(
        new_valid_target_data_frame,
        new_valid_covariate_data_frames,
        std_data_frame,
        model_baseline,
        "cosine-sim",
    )

    with open(log_file, "a") as f:
        f.write(f"{len(target_clusters)} {parameters} {skill} {cos_sim}\n")
def first_stage():
    """Return a ``GridSearchCVList`` over four candidate regressors."""
    seed = 123
    boosted = GradientBoostingRegressor(n_estimators=20,
                                        max_depth=3,
                                        min_samples_leaf=10,
                                        random_state=seed)
    candidates = [
        LinearRegression(),
        WeightedMultiTaskLasso(alpha=0.05, fit_intercept=True, tol=1e-6,
                               random_state=seed),
        RandomForestRegressor(n_estimators=100, max_depth=3,
                              min_samples_leaf=10, random_state=seed),
        MultiOutputRegressor(boosted),
    ]
    # One (empty) parameter grid per candidate estimator.
    grids = [{} for _ in candidates]
    return GridSearchCVList(candidates, param_grid_list=grids, cv=3, iid=True)
def load_SVM():
    """Build the Support Vector Machine models and their display name.

    Parameters: None

    Returns:
        model_name : (str) Name of the model for output file.
        clf : (Classifier) Building and Floor classifier; a linear ``SVC``
            wrapped in ``MultiOutputClassifier``.
        regr : (Regressor) Longitude and Latitude regressor; a linear ``SVR``
            wrapped in ``MultiOutputRegressor``.
    """
    # Both estimators share the same SVM settings.
    svm_settings = dict(C=100, kernel="linear", max_iter=1000)

    classifier = MultiOutputClassifier(SVC(**svm_settings))
    regressor = MultiOutputRegressor(SVR(**svm_settings))

    return "Support Vector Machine", classifier, regressor
def randomSearch(base_model, random_grid):
    """Run a randomized search and print scores on train, test and dev data.

    ``base_model`` is wrapped in ``MultiOutputRegressor``; the data sets are
    read from module-level variables (``train_X``, ``train_y``, ...).
    """
    search = RandomizedSearchCV(MultiOutputRegressor(base_model),
                                param_distributions=random_grid,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                random_state=42,
                                n_jobs=-1)
    search.fit(train_X, train_y)
    print(search.best_params_)

    winner = search.best_estimator_
    # Score the best estimator on each split, in train / test / dev order.
    splits = (
        (train_X, train_y_array),
        (test_X, test_y_array),
        (dev_X, dev_y_array),
    )
    for features, truth in splits:
        print_scores(truth, winner.predict(features))
def crossValidationMLPR(X, Y):
    """Train several randomly sized MLP regressors and print their scores."""
    # Split the data into a training set and a validation set
    print("***Decoupe le set de validation***")
    x_train, x_validation, y_train, y_validation_txt = train_test_split(
        X, Y, stratify=Y, test_size=0.2, shuffle=True)
    y_train = transformerGranuArgi(y_train)
    y_validation = transformerGranuArgi(y_validation_txt)

    print('***Definition des parametres a tester***')
    # Five random architectures: the number of layers is drawn first, then
    # one width per layer.
    layer_choices = []
    for _ in range(5):
        n_layers = np.random.randint(3, 5, 1)
        layer_choices.append(tuple(np.random.randint(20, 35, n_layers)))
    param = {'hidden_layer_sizes': layer_choices}

    print('***Definition des modeles a entrainer***')
    mlpr = [
        MLPRegressor(solver='adam',
                     max_iter=1000,
                     alpha=1e-5,
                     activation='tanh',
                     hidden_layer_sizes=layers)
        for layers in param['hidden_layer_sizes']
    ]
    multioutput_rna = [MultiOutputRegressor(net) for net in mlpr]

    # Placeholder for per-model validation scores (never filled in here)
    resultat_sur_validation = [0] * len(param['hidden_layer_sizes'])

    for i, (layers, modele) in enumerate(
            zip(param['hidden_layer_sizes'], multioutput_rna)):
        print(
            f"[Entrainement du modele {i}] Couches de neurones : {layers}"
        )
        modele.fit(x_train, y_train)
        print(modele.score(x_validation, y_validation))

        # Convert the numeric predictions and score them against the
        # untransformed validation targets.
        y_res = conversionPredictionSol(modele.predict(x_validation))
        print(scorePrediction(y_res, np.array(y_validation_txt)))
        print('\n')
def create_model(self, C=-1, gamma=-1, epsilon=-1):
    """Build the SVR model, store it on ``self.model`` and return it.

    Args:
        C: SVR ``C`` parameter; ``-1`` means "use ``self.C``".
        gamma: SVR ``gamma`` parameter; ``-1`` means "use ``self.gamma``".
        epsilon: SVR ``epsilon`` parameter; ``-1`` means "use
            ``self.epsilon``".

    Returns:
        The created model: a plain ``SVR``, or that ``SVR`` wrapped in a
        ``MultiOutputRegressor`` when ``self.output_multi`` is truthy.
    """
    # -1 is the sentinel for "not given": fall back to the defaults stored
    # on the instance, otherwise use the values passed as parameters.
    if C == -1:
        C = self.C
    if gamma == -1:
        gamma = self.gamma
    if epsilon == -1:
        epsilon = self.epsilon

    self.model = SVR(C=C, gamma=gamma, epsilon=epsilon)

    if self.output_multi:
        # With several target columns the SVR has to be wrapped so that
        # they can be handled.
        self.model = MultiOutputRegressor(estimator=self.model)

    # Function-call form: valid on Python 3 and prints the same text as the
    # former Python 2 ``print`` statement for a single argument.
    print(self.model)
    return self.model
def gbr_model(yvar, n_estimators, max_depth, min_samples_leaf,
              min_samples_split, max_features, loss):
    """Create a gradient-boosting regressor sized for the targets in ``yvar``.

    Numeric hyperparameters are cast to ``int``; ``max_features`` is cast
    unless it is the string ``'auto'``. When ``yvar`` has exactly one column
    the bare regressor is returned, otherwise it is wrapped in a
    ``MultiOutputRegressor`` with ``n_jobs=-1``.
    """
    if max_features != 'auto':
        max_features = int(max_features)

    n_estimators = int(n_estimators)
    min_samples_leaf = int(min_samples_leaf)
    min_samples_split = int(min_samples_split)
    max_depth = int(max_depth)

    settings = {
        'random_state': 42,
        'max_depth': max_depth,
        'n_estimators': n_estimators,
        'max_features': max_features,
        'min_samples_leaf': min_samples_leaf,
        'loss': loss,
        'min_samples_split': min_samples_split,
    }
    reg = GradientBoostingRegressor(**settings)

    if yvar.shape[1] == 1:
        return reg
    return MultiOutputRegressor(reg, n_jobs=-1)
def train_consumer():
    """Train the consumer model and pickle it to ``CMODEL_FILE``.

    Reads the training table from ``CONSUMER_TRAINING``, fits one
    ``GradientBoostingRegressor`` per target column through a
    ``MultiOutputRegressor`` and overwrites ``CMODEL_FILE`` with the pickled
    model.
    """
    cdf = pd.read_csv(CONSUMER_TRAINING)
    xs = ['risk', 'delta_risk', 'grat_payoff', 'delta_grat_payoff',
          'inv_payoff', 'delta_inv_payoff', 'surface_area_risk_factor',
          'delta_surface_area_risk_factor']
    ys = ['GREED', 'FOCUS', 'SPEND', 'INVEST']
    cx, cy = cdf[xs], cdf[ys]

    # Several targets are predicted at once, hence the multi-output wrapper.
    model = MultiOutputRegressor(
        GradientBoostingRegressor(random_state=0)).fit(cx, cy)

    # Mode 'wb' already truncates an existing file, so no separate clearing
    # step is needed; the context manager closes the handle even if
    # pickling fails.
    with open(CMODEL_FILE, 'wb') as model_file:
        pickle.dump(model, model_file)
def _check_arguments(self, base_estimator, n_initial_points, acq_optimizer,
                     dimensions):
    """Validate the constructor arguments and store the derived settings.

    Sets ``base_estimator_``, ``_n_initial_points``, ``n_initial_points_``
    and ``acq_optimizer`` on the instance. Raises ``ValueError`` when the
    estimator is not a regressor, ``n_initial_points`` is negative, the
    optimizer name is unknown, or a gradient-based optimizer is requested
    for an estimator without gradients.
    """
    # A string names an estimator that still has to be built.
    if isinstance(base_estimator, str):
        base_estimator = cook_estimator(
            base_estimator,
            space=dimensions,
            random_state=self.rng.randint(0, np.iinfo(np.int32).max))

    if not is_regressor(base_estimator) and base_estimator is not None:
        raise ValueError("%s has to be a regressor." % base_estimator)

    # Acquisition functions whose name contains "ps" get a multi-output
    # wrapper unless the estimator already is one.
    already_multi = isinstance(base_estimator, MultiOutputRegressor)
    needs_wrapper = "ps" in self.acq_func and not already_multi
    self.base_estimator_ = (MultiOutputRegressor(base_estimator)
                            if needs_wrapper else base_estimator)

    if n_initial_points < 0:
        raise ValueError("Expected `n_initial_points` >= 0, got %d" %
                         n_initial_points)
    self._n_initial_points = n_initial_points
    self.n_initial_points_ = n_initial_points

    # "auto" picks lbfgs whenever the estimator exposes gradients.
    if acq_optimizer == "auto":
        acq_optimizer = ("lbfgs" if has_gradients(self.base_estimator_)
                         else "sampling")

    if acq_optimizer not in ["lbfgs", "sampling"]:
        raise ValueError("Expected acq_optimizer to be 'lbfgs' or "
                         "'sampling', got {0}".format(acq_optimizer))

    if (not has_gradients(self.base_estimator_)
            and acq_optimizer != "sampling"):
        raise ValueError("The regressor {0} should run with "
                         "acq_optimizer"
                         "='sampling'.".format(type(base_estimator)))

    self.acq_optimizer = acq_optimizer
def simulate(self, x_train, y_train, regression=True):
    """Evaluate every hyperparameter set and log the results to CSV.

    For each entry of ``self.HyperParameterArray`` an SVM is built (an
    ``SVR`` wrapped in ``MultiOutputRegressor`` when ``regression`` is true,
    an ``SVC`` otherwise), evaluated with ``validation.kFoldCross`` and the
    outcome is stored on the entry, printed and appended as a row to
    ``CSVResult/CSVResultDuringSimulation.csv``.

    Args:
        x_train: Training inputs handed to ``validation.kFoldCross``.
        y_train: Training targets handed to ``validation.kFoldCross``.
        regression: Selects the regression (default) or classification model.
    """
    # The context manager guarantees that the rows written so far are
    # flushed and the handle is closed, even if a simulation raises.
    with open("CSVResult/CSVResultDuringSimulation.csv", 'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(self.title)

        for simulation in self.HyperParameterArray:
            start_time = time.time()

            if regression:
                svr = svm.SVR(kernel=simulation.kernel,
                              gamma=simulation.gamma,
                              coef0=simulation.coef,
                              degree=simulation.degree,
                              C=simulation.C,
                              epsilon=simulation.epsilon)
                SVRegressor = MultiOutputRegressor(svr, n_jobs=8)
            else:
                SVRegressor = svm.SVC(kernel=simulation.kernel,
                                      gamma=simulation.gamma,
                                      coef0=simulation.coef,
                                      degree=simulation.degree,
                                      C=simulation.C)

            # The model is evaluated with k-fold validation
            # (cross_val_score would be an alternative).
            valScore, TrainingScore = validation.kFoldCross(
                SVRegressor.fit,
                SVRegressor.predict,
                x_train,
                y_train,
                n_splits=self.kfoldDim)
            valScore = np.array(valScore)
            TrainingScore = np.array(TrainingScore)

            timeSimulation = abs(time.time() - start_time)
            simulation.SaveResult(valScore, TrainingScore, timeSimulation)

            print("\n")
            print("Validation error: %0.2f (+/- %0.2f)" %
                  (valScore.mean(), valScore.std() * 2))
            print("Training Error: %0.2f (+/- %0.2f)" %
                  (TrainingScore.mean(), TrainingScore.std() * 2))
            print("time = %0.2f" % timeSimulation)

            param = simulation.getValue()
            writer.writerow(param)
def test_sklearn_multioutput_regressor(self):
    """Converted multi-output regressors must predict like the originals."""
    base_classes = [DecisionTreeRegressor, ExtraTreesRegressor,
                    RandomForestRegressor, LinearRegression]

    for n_targets in [2, 3, 4]:
        for model_class in base_classes:
            seed = random.randint(0, 2**32 - 1)

            # LinearRegression is built without a random_state argument.
            if model_class != LinearRegression:
                base = model_class(random_state=seed)
            else:
                base = model_class()
            model = MultiOutputRegressor(base)

            X, y = datasets.make_regression(n_samples=50,
                                            n_features=10,
                                            n_informative=5,
                                            n_targets=n_targets,
                                            random_state=seed)
            X = X.astype("float32")
            y = y.astype("float32")
            model.fit(X, y)

            torch_model = hummingbird.ml.convert(
                model, "torch",
                extra_config={constants.TREE_OP_PRECISION_DTYPE: "float64"})
            self.assertTrue(torch_model is not None)

            # The failure message identifies the failing combination.
            label = "{}/{}/{}".format(n_targets, model_class, seed)
            np.testing.assert_allclose(model.predict(X),
                                       torch_model.predict(X),
                                       rtol=1e-5,
                                       atol=1e-4,
                                       err_msg=label)
def _random_state_from_config(config):
    """Return the truthy ``random_state`` from a JSON config string, else 0."""
    if not config:
        return 0
    data = json.loads(config)
    return data.get('random_state') or 0


def create_model(model_type, shape=None, config=None):
    """Create the model identified by ``model_type``.

    ``config`` is an optional JSON string. For the gradient-boosting types
    only its ``random_state`` entry is read here; for the other types it is
    handed to the matching factory function. ``shape`` is not used in this
    function. An unrecognised ``model_type`` yields ``None``.
    """
    if model_type == 'Multi-Output Gradient Boosting Regressor':
        seed = _random_state_from_config(config)
        return MultiOutputRegressor(
            GradientBoostingRegressor(random_state=seed))

    if model_type == 'Gradient Boosting Regressor':
        seed = _random_state_from_config(config)
        return GradientBoostingRegressor(random_state=seed)

    if model_type == "SVM Classification":
        # Pros:
        #   It works really well with clear margin of separation
        #   It is effective in high dimensional spaces
        #   It is effective in cases where number of dimensions is greater
        #   than the number of samples.
        #   It uses a subset of training points in the decision function
        #   (called support vectors), so it is also memory efficient
        # Cons:
        #   It doesnt perform well, when we have large data set because the
        #   required training time is higher.
        #   It also doesnt perform very well, when the data set has more
        #   noise i.e. target classes are overlapping.
        #   SVM doesn't directly provide probability estimates, these are
        #   calculated using an expensive five-fold cross validation.
        #   It is related SVC method of Python scikit learn library
        return __create_svm_classifier__(config)

    if model_type == "SVM Regression":
        return __create_svm_regressor__(config)

    if model_type == "KNN Classifier":
        return __create_KNN_classifier(config)

    if model_type == 'Keras Sequential Model':
        return __create_sequential_model(config)
def generate_joint_model(single_model):
    """Fit a multi-output wrapper around ``single_model`` and save it.

    Training and test data as well as ``model_folder`` are read from
    module-level variables. Returns ``(model, model_path)``.
    """
    model = MultiOutputRegressor(single_model)
    model.fit(X_train, Y_train)

    score_train = model.score(X_train, Y_train)
    print('Score of train', round(score_train * 100, 1), "%")

    score = model.score(X_test, Y_test)
    print('Score of test', round(score * 100, 1), "%")

    # File name: <rounded test score with '_' for '.'>_<estimator name>.joblib
    score_tag = str(round(score, 3)).replace('.', '_')
    estimator_name = str(model.get_params()['estimator']).split('(')[0]
    model_path = "".join(
        [model_folder, r"/", score_tag, r"_", estimator_name, '.joblib'])

    joblib.dump(model, model_path)
    print("Save model file", model_path)
    return model, model_path
def base_estimator(self, value):
    """Validate ``value`` and store it as ``self._base_estimator``.

    A string is first turned into an estimator with ``cook_estimator``.
    Anything that is neither ``None`` nor a regressor raises ``ValueError``.
    """
    # Build `base_estimator` if string given
    if isinstance(value, str):
        value = cook_estimator(
            value,
            space=self.space,
            random_state=self.rng.randint(0, np.iinfo(np.int32).max))

    # Check if regressor
    if not is_regressor(value) and value is not None:
        raise ValueError(
            f"`base_estimator` must be a regressor. Got {value}")

    # Per-second acquisition functions (name ending in "ps") get a
    # multi-output wrapper unless the estimator already is one.
    already_multi = isinstance(value, MultiOutputRegressor)
    if not already_multi and self.acq_func.endswith("ps"):
        value = MultiOutputRegressor(value)

    self._base_estimator = value
def test_multi_target_regression_partial_fit():
    """The wrapper's ``partial_fit`` must match per-target SGD regressors."""
    X, y = datasets.make_regression(n_targets=3)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    half_index = 25
    first_half = slice(None, half_index)
    second_half = slice(half_index, None)

    # Reference: one SGDRegressor per target, trained in two batches.
    references = np.zeros_like(y_test)
    for target_idx in range(3):
        single = SGDRegressor(random_state=0)
        single.partial_fit(X_train[first_half],
                           y_train[first_half, target_idx])
        single.partial_fit(X_train[second_half],
                           y_train[second_half, target_idx])
        references[:, target_idx] = single.predict(X_test)

    # Same two batches through the multi-output wrapper.
    wrapped = MultiOutputRegressor(SGDRegressor(random_state=0))
    wrapped.partial_fit(X_train[first_half], y_train[first_half])
    wrapped.partial_fit(X_train[second_half], y_train[second_half])

    y_pred = wrapped.predict(X_test)
    assert_almost_equal(references, y_pred)
def test_diff_detector_threshold(n_features_y: int, n_features_x: int):
    """
    Basic construction logic of thresholds_ attribute in the
    DiffBasedAnomalyDetector
    """
    threshold_attrs = (
        "feature_thresholds_",
        "aggregate_threshold_",
        "feature_thresholds_per_fold_",
        "aggregate_thresholds_per_fold_",
    )

    X = np.random.random((100, n_features_x))
    y = np.random.random((100, n_features_y))

    model = DiffBasedAnomalyDetector(
        base_estimator=MultiOutputRegressor(estimator=LinearRegression()))

    # Model has own implementation of cross_validate
    assert hasattr(model, "cross_validate")

    # When initialized it should not have a threshold calculated.
    for attr in threshold_attrs:
        assert not hasattr(model, attr)

    model.fit(X, y)

    # Until it has done cross validation, it has no threshold.
    for attr in threshold_attrs:
        assert not hasattr(model, attr)

    # Calling cross validate should set the threshold for it.
    model.cross_validate(X=X, y=y)

    # Now we have calculated thresholds based on cross validation folds
    for attr in threshold_attrs:
        assert hasattr(model, attr)

    assert isinstance(model.feature_thresholds_, pd.Series)
    assert len(model.feature_thresholds_) == y.shape[1]
    assert all(model.feature_thresholds_.notna())
    assert isinstance(model.feature_thresholds_per_fold_, pd.DataFrame)
    assert isinstance(model.aggregate_thresholds_per_fold_, dict)
def grid_search_gbr(xvar, yvar, n_estimators, max_depth, min_samples_leaf,
                    min_samples_split, cv, n_iter):
    """Randomized hyperparameter search for a gradient-boosting regressor.

    The four hyperparameter arguments are comma-separated strings of
    integers. With a single target column a plain
    ``GradientBoostingRegressor`` is tuned on the flattened target;
    otherwise the regressor is wrapped in ``MultiOutputRegressor`` and the
    parameter names get the ``estimator__`` prefix. Returns the best
    parameters as a list of ``(name, value)`` pairs.
    """
    n_estimators = [int(tok) for tok in n_estimators.split(',')]
    max_depth = [int(tok) for tok in max_depth.split(',')]
    min_samples_leaf = [int(tok) for tok in min_samples_leaf.split(',')]
    min_samples_split = [int(tok) for tok in min_samples_split.split(',')]
    n_iter = int(n_iter)
    cv = int(cv)

    if yvar.shape[1] == 1:
        prefix = ''
        yvar_ravel = yvar.values.ravel()
        gbr = GradientBoostingRegressor(random_state=42)
    else:
        prefix = 'estimator__'
        yvar_ravel = yvar
        gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state=42),
                                   n_jobs=-1)

    parameters = {
        prefix + 'n_estimators': n_estimators,
        prefix + 'max_depth': max_depth,
        prefix + 'min_samples_leaf': min_samples_leaf,
        prefix + 'min_samples_split': min_samples_split,
    }

    splitter = ShuffleSplit(n_splits=cv, test_size=0.25, random_state=42)
    random_cv = RandomizedSearchCV(estimator=gbr,
                                   param_distributions=parameters,
                                   cv=splitter,
                                   n_iter=n_iter,
                                   scoring='neg_mean_squared_error',
                                   n_jobs=-1,
                                   random_state=42)
    random_cv.fit(xvar, yvar_ravel)
    return list(random_cv.best_params_.items())
def make_bayesian_pred(df, next_week, debug=0):
    """Create predictions using Bayesian ridge regression.

    The training data comes from ``process_data(df, next_week)``. One
    ``BayesianRidge`` per target column is fitted through a
    ``MultiOutputRegressor`` and the predictions are written into
    ``next_week`` (in place) under the target column names.

    Args:
        df: Historical data handed to ``process_data``.
        next_week: Frame to predict for; must contain the feature columns.
        debug: When truthy, print the frame plus training score, training
            MSE and 10-fold cross-validation scores.

    Returns:
        ``next_week`` with the predicted target columns filled in.
    """
    X_train, _, Y_train, _ = process_data(df, next_week)

    multi_bay = MultiOutputRegressor(BayesianRidge())
    multi_bay.fit(X_train, Y_train)
    next_week[Y_train.columns] = multi_bay.predict(next_week[X_train.columns])

    if debug:
        y_pred_untrain = multi_bay.predict(X_train)
        print(next_week)
        print("Score: ", multi_bay.score(X_train, Y_train) * 100)
        print("MSE: ", metrics.mean_squared_error(Y_train, y_pred_untrain))
        # Cross-validate the estimator on (features, targets); the previous
        # call passed the targets as features and the training predictions
        # as targets.
        print(
            "CV: ",
            ms.cross_val_score(multi_bay,
                               X_train,
                               Y_train,
                               cv=10,
                               scoring='neg_mean_squared_error'))
    return next_week
def test_multiple_treatments(self):
    """Fit the continuous-treatment forest on two synthetic treatments."""
    np.random.seed(123)
    fixture = TestOrthoForest

    # Only applicable to continuous treatments
    # Generate data for 2 treatments
    TE = np.array([[fixture._exp_te(x), fixture._const_te(x)]
                   for x in fixture.X])
    coefs_T = uniform(0, 1, size=(fixture.support_size, 2))
    treatment_noise = uniform(-1, 1, size=(fixture.n, 2))
    T = np.matmul(fixture.W[:, fixture.support], coefs_T) + treatment_noise

    # Per-sample effect: dot product of treatment effects and treatments.
    delta_Y = np.array([np.dot(TE[i], T[i]) for i in range(fixture.n)])
    baseline = np.dot(fixture.W[:, fixture.support], fixture.coefs_Y)
    Y = delta_Y + baseline + fixture.epsilon_sample(fixture.n)

    # Test multiple treatments with controls
    est = ContinuousTreatmentOrthoForest(
        n_trees=50,
        min_leaf_size=10,
        max_depth=50,
        subsample_ratio=0.30,
        bootstrap=False,
        n_jobs=4,
        model_T=MultiOutputRegressor(Lasso(alpha=0.024)),
        model_Y=Lasso(alpha=0.024),
        model_T_final=WeightedLassoCVWrapper(),
        model_Y_final=WeightedLassoCVWrapper())
    est.fit(Y, T, fixture.X, fixture.W)

    expected_te = np.array([fixture.expected_exp_te,
                            fixture.expected_const_te]).T
    self._test_te(est, expected_te, tol=0.5, treatment_type='multi')
def fit(
    experiment,
    x_train,
    y_train,
    parameters,
    alpha: float = 0.5,
    delta_e_loss: bool = True,
):
    """Cross-validate a multi-output LightGBM quantile regressor.

    ``parameters`` are forwarded to ``LGBMRegressor``. When ``delta_e_loss``
    is true the module-level ``scorer`` is used for scoring, otherwise the
    default scoring of ``cross_val_score`` applies. ``experiment`` is not
    used in this function.

    Returns:
        ``(abs(mean of the CV scores), std of the CV scores)``.
    """
    base = LGBMRegressor(objective='quantile', alpha=alpha, **parameters)
    regressor = MultiOutputRegressor(base)

    cv_options = {'n_jobs': -1}
    if delta_e_loss:
        cv_options['scoring'] = scorer

    cv = cross_val_score(regressor, x_train, y_train, **cv_options)
    return np.abs(cv.mean()), cv.std()
def _approximate(self, X, y):
    """Fit a ridge regression of ``y`` on ``X`` and return the fitted model.

    Args:
        X: Feature matrix handed to ``fit``.
        y: Target(s) handed to ``fit``.

    Returns:
        The fitted estimator: a ``MultiOutputRegressor`` wrapping the ridge
        model when ``self.target == 'multi'``, the ridge model itself when
        ``self.target == 'variate'``.

    Raises:
        ValueError: If ``self.target`` is neither ``'multi'`` nor
            ``'variate'``, or if ``self.reg`` is ``None`` (no regularisation
            strength to build the regressor from).
    """
    if self.target not in ('multi', 'variate'):
        raise ValueError(
            "target must be 'multi' or 'variate', got {!r}".format(
                self.target))

    if self.reg is None:
        # Previously this case fell through and failed later with an
        # UnboundLocalError because no regressor had been created.
        raise ValueError("reg must not be None: a regularisation strength "
                         "is required to build the Ridge regressor")

    regressor = Ridge(alpha=self.reg, solver='auto', normalize=True,
                      tol=1e-10)

    if self.target == 'multi':
        return MultiOutputRegressor(regressor).fit(X, y)
    return regressor.fit(X, y)
def grant_predictor(onu_id, onu_df, window, predict, features, model, metric):
    """Score sliding-window predictions of the 'start'/'end' columns.

    A window of ``window`` rows is used to fit ``model`` (wrapped in
    ``MultiOutputRegressor``); the following rows, at most ``predict`` of
    them, are predicted and scored with ``metric``. The window then moves
    forward by ``predict`` rows. ``onu_id`` is not used in this function.

    Returns:
        List with one metric value per evaluated window.
    """
    target_cols = ['start', 'end']
    total_rows = len(onu_df)
    single_feature = len(features) == 1

    # list with metrics of each prediction in different observation windows
    metric_list = []
    reg = MultiOutputRegressor(model)

    window_start = 0
    while window_start + window < total_rows:
        window_end = window_start + window
        train_df = onu_df.iloc[window_start:window_end]

        # Clamp the end of the prediction range to the input data.
        if window_end + predict < total_rows:
            pred_end = window_end + predict
        else:
            pred_end = total_rows - 1

        X_pred = onu_df[features].iloc[window_end:pred_end]
        if single_feature:
            # A single feature (counter only) is reshaped into one column.
            X_pred = np.array(X_pred).reshape(-1, 1)
        if len(X_pred) == 0:
            break

        X_fit = train_df[features]
        if single_feature:
            X_fit = np.array(X_fit).reshape(-1, 1)
        reg.fit(X_fit, train_df[target_cols])

        pred = reg.predict(X_pred)
        # real values to compare with prediction
        Y_true = onu_df[target_cols].iloc[window_end:pred_end]
        metric_list.append(
            metric(Y_true, pred, multioutput='uniform_average'))

        # shift past observations window in p positions
        window_start += predict

    return metric_list
def train(alpha=0.5, delta_e_loss=True):
    """Cross-validate a LightGBM quantile regressor and log it to wandb.

    The training data (``X_train``, ``y_train``) and ``scorer`` are read
    from module-level variables.
    """
    # Hyperparameters registered with the wandb run as its config
    configs = dict(
        n_estimators=100,
        max_depth=10,
        num_leaves=50,
        reg_alpha=0.00001,
        reg_lambda=0.00001,
        subsample=0.2,
        colsample_bytree=0.2,
        min_child_weight=0.001,
    )

    # Initialize a new wandb run
    wandb.init(project='colorml', config=configs)
    config = wandb.config

    regressor = MultiOutputRegressor(
        LGBMRegressor(objective='quantile', alpha=alpha, **config))

    # The custom scorer runs with 5 workers, the default scoring with 2.
    if delta_e_loss:
        cv_options = {'n_jobs': 5, 'scoring': scorer, 'cv': 5}
    else:
        cv_options = {'n_jobs': 2, 'cv': 5}
    cv = cross_val_score(regressor, X_train, y_train, **cv_options)

    mean = np.abs(cv.mean())
    std = np.abs(cv.std())

    wandb.log({'cv_mean': mean})
    wandb.log({'cv_std': std})

    wandb.run.summary['cv_mean'] = mean
    wandb.run.summary['cv_std'] = std
def multiouput_regressor(input, target, input_test, target_test, output):
    """Fit a multi-output linear model, evaluate it and save predictions.

    The model is cross-validated with 5 folds on ``input``/``target``; the
    estimator fitted on the second fold predicts ``input_test``. Rows whose
    second predicted component exceeds 1 are dropped before the R^2 score
    is printed, both components are plotted and the remaining predictions
    are written to ``output``.
    """
    model = MultiOutputRegressor(LinearRegression())

    # Cross-validate and keep the estimators fitted on each fold
    scores = cross_validate(model, input, target, cv=5, return_estimator=True)
    fold_model = scores['estimator'][1]
    predictions = fold_model.predict(input_test)

    # Remove extreme values
    keep = predictions[:, 1] <= 1
    y_test = target_test[keep]
    predictions = predictions[keep]

    accuracy = metrics.r2_score(y_test, predictions)
    print("Cross-Predicted Accuracy: {}".format(accuracy))

    # Scatter plot: true values in red, predictions in blue
    fig, ax = plt.subplots()
    ax.scatter(y_test[:, 0], y_test[:, 1], color='red', alpha=0.5)
    ax.scatter(predictions[:, 0], predictions[:, 1], color='blue', alpha=0.5)
    ax.set_xlabel('P')
    ax.set_ylabel('Q')
    plt.show()

    np.savetxt(output, predictions)
def runBaseLineRegression(model_params, data, estimator):
    """Fit a multi-output baseline and report train/dev errors.

    ``data[0]`` and ``data[1]`` are the ``(AP, TRP)`` pairs of the training
    and dev sets. With ``model_params["DirectionForward"]`` the model maps
    TRP to AP; otherwise it maps AP to TRP and the "InputNames" and
    "OutputNames" entries of ``model_params`` are swapped in place. The
    parameters are written to ``model_params/<model_name>.json`` and
    ``run_eval_base`` is called for the train, test and dev data sets.

    Returns:
        ``(train errors as list, dev errors as list, sum of train errors,
        sum of dev errors)``.
    """
    regr = MultiOutputRegressor(estimator)

    # data
    AP_train, TRP_train = data[0]
    AP_dev, TRP_dev = data[1]

    forward = model_params["DirectionForward"]
    if forward:
        X_train, Y_train, X_dev, Y_dev = TRP_train, AP_train, TRP_dev, AP_dev
    else:
        X_train, Y_train, X_dev, Y_dev = AP_train, TRP_train, AP_dev, TRP_dev
        swapped = (model_params["InputNames"], model_params["OutputNames"])
        model_params["OutputNames"], model_params["InputNames"] = swapped

    regr.fit(X_train, Y_train)
    Y_dev_pred = regr.predict(X_dev)
    Y_train_pred = regr.predict(X_train)

    if forward:
        # train
        mse_totoal_train = customUtils.mse_p(ix=(3, 6),
                                             Y_pred=Y_train_pred,
                                             Y_true=Y_train)
        # dev
        mse_totoal_dev = customUtils.mse_p(ix=(3, 6),
                                           Y_pred=Y_dev_pred,
                                           Y_true=Y_dev)
    else:
        mse_totoal_train = mse(Y_train, Y_train_pred, multioutput='raw_values')
        mse_totoal_dev = mse(Y_dev, Y_dev_pred, multioutput='raw_values')

    json_name = model_params["model_name"] + '.json'
    model_location = os.path.join('models', json_name)
    with open(os.path.join('model_params', json_name), 'w') as fp:
        json.dump(model_params, fp, sort_keys=True)

    # The return values of the evaluation runs are not used here.
    for dataset in ("train", "test", "dev"):
        run_eval_base(model_location,
                      dataset=dataset,
                      email=model_params["email"])

    return (mse_totoal_train.tolist(), mse_totoal_dev.tolist(),
            mse_totoal_train.sum(), mse_totoal_dev.sum())
def _init_gbd(self):
    """Set up the multi-output XGBoost model and its grid search.

    Stores the wrapped regressor on ``self.model`` and an unfitted
    ``GridSearchCV`` over it on ``self.best_model``.
    """
    # Grid explored by the search; the keys address the wrapped estimator.
    cv_params = {
        'estimator__n_estimators': [500, 800, 1000, 1600, 2400],
        'estimator__max_depth': [3, 6, 8, 10],
    }

    # Settings of the base regressor
    other_params = dict(
        learning_rate=self.learning_rate,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        seed=0,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0,
        reg_alpha=0,
        reg_lambda=1,
    )

    self.model = xgb.XGBRegressor(**other_params)
    self.model = MultiOutputRegressor(self.model)
    self.best_model = GridSearchCV(estimator=self.model,
                                   param_grid=cv_params,
                                   scoring='r2',
                                   cv=5,
                                   verbose=2)
def test_diff_detector_require_thresholds(require_threshold: bool):
    """
    Should fail if requiring thresholds, but not calling cross_validate
    """
    X = pd.DataFrame(np.random.random((100, 5)))
    y = pd.DataFrame(np.random.random((100, 2)))

    model = DiffBasedAnomalyDetector(
        base_estimator=MultiOutputRegressor(LinearRegression()),
        require_thresholds=require_threshold,
    )
    model.fit(X, y)

    if not require_threshold:
        # thresholds not required
        model.anomaly(X, y)
        return

    # FAIL: Forgot to call .cross_validate to calculate thresholds.
    with pytest.raises(AttributeError):
        model.anomaly(X, y)

    # Once the thresholds exist the same call goes through.
    model.cross_validate(X=X, y=y)
    model.anomaly(X, y)
def neural_network(num_of_layers, is_multi=False):
    """Train an MLP five times and summarise training/validation MSE.

    Features are selected with a linear fit computed once up front. With
    ``is_multi`` the network is fitted on (usa_gross, rating) pairs,
    otherwise on usa_gross alone; the error is always measured on the
    first output against usa_gross.

    Returns:
        ``(mean train MSE, std train MSE, mean validation MSE,
        std validation MSE)``.
    """
    print("getting training data")
    X, usa_gross, _ = get_set("training")
    linear = get_linear_fit(X, usa_gross, [default_alpha_linear])[0]

    training_res, test_res = [], []
    hidden_layers = (100, ) * num_of_layers

    for _ in range(5):
        X, usa_gross, rating = get_set("training")
        X_picked = pick_needed_features(linear, X)

        net = MultiOutputRegressor(
            MLPRegressor(hidden_layer_sizes=hidden_layers))

        # Targets are given as a list of per-sample tuples.
        target_columns = (usa_gross, rating) if is_multi else (usa_gross, )
        net.fit(X_picked, list(zip(*target_columns)))

        predicts = net.predict(X_picked)
        training_res.append(mean_squared_error(predicts[:, 0], usa_gross))

        X, usa_gross, rating = get_set("validation")
        X_picked = pick_needed_features(linear, X)
        predicts = net.predict(X_picked)
        test_res.append(mean_squared_error(predicts[:, 0], usa_gross))

    return (np.mean(training_res), np.std(training_res),
            np.mean(test_res), np.std(test_res))
def get_model() -> AdaBoostRegressor:
    """
    Full pipeline for getting the trained model.

    Returns:
        clf: the fitted estimator, an ``AdaBoostRegressor`` wrapped in a
            ``MultiOutputRegressor``
    """
    # Read the dataframe of hand-marked labels, dropping excess columns and
    # bad images
    excess_columns = ["Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.1.1"]
    bad_images = [104, 908, 906, 907, 905, 904] + list(range(905, 1000))
    df_augmented = read_data_frame(HAND_MARKED_LABELS, excess_columns,
                                   bad_images)

    # precompute stats and brute force (l,r) boundaries; the image ids are
    # taken from the dataframe rather than from precompute_stats
    images, labels, _ = precompute_stats(df_augmented)
    image_ids = df_augmented.ImageId.to_numpy()
    labels = np.array(labels)
    _, train_x, train_y = brute_force_bounds(images, labels, image_ids)

    # images on which we tested model
    test_ids = [776, 675, 42, 3, 714, 312, 127, 653, 592, 205, 179, 191]
    held_out = np.in1d(df_augmented.ImageId.to_numpy(), test_ids).nonzero()[0]

    # delete test images from train
    x_train_aug = np.delete(train_x, held_out, axis=0)[:]
    y_train_aug = np.delete(train_y, held_out, axis=0)[:]

    # train
    booster = AdaBoostRegressor(random_state=10, n_estimators=5)
    return MultiOutputRegressor(booster).fit(x_train_aug, y_train_aug)
def train_model(self, params):
    '''
    Train one NuSVR combination and report its loss.

    Input a dict, params, containing:
        nu: Float, fraction of support vectors (0,1]
        C: Float, penalty parameter of error (~1.0)
        kernel: String, 'linear', 'poly', 'rbf', sigmoid'
        degree: Int, degree of polynomial for poly
        gamma: String, 'scale'/'auto' for 'rbf', 'poly', 'sigmoid'

    Returns:
        Dict with the keys 'loss' (error on the test data) and 'status'.

    Raises:
        ValueError: If params['kernel'] is not one of the supported kernels.
    '''
    kernel = params['kernel']
    nu = params['nu']
    C = params['C']

    # Instantiate SVR; only the arguments relevant for the kernel are passed
    if kernel in ['linear']:
        model = MOR(NuSVR(C=C, nu=nu, kernel=kernel))
    elif kernel in ['rbf', 'sigmoid']:
        gamma = params['gamma']
        model = MOR(NuSVR(C=C, nu=nu, kernel=kernel, gamma=gamma))
    elif kernel in ['poly']:
        gamma = params['gamma']
        degree = params['degree']
        model = MOR(
            NuSVR(C=C, nu=nu, kernel=kernel, degree=degree, gamma=gamma))
    else:
        # Previously an unknown kernel left ``model`` unbound and failed
        # later with an UnboundLocalError at the ``fit`` call.
        raise ValueError(
            "Unsupported kernel {!r}; expected 'linear', 'rbf', 'sigmoid' "
            "or 'poly'.".format(kernel))

    # Print current combination
    print('Current SVR combination: {}'.format(params))

    # Flat versions of y (power/flux distribution)
    y_tr_fl, y_te_fl = self.flat_y()

    # Fit
    model.fit(self.x_train, y_tr_fl)

    # Hyperopt loss for each combination
    y_predict = model.predict(self.x_test)
    hyp_loss = sklmse(y_te_fl, y_predict)

    self.tr_hist.update_history(params, hyp_loss, model)
    return {'loss': hyp_loss, 'status': STATUS_OK}
mplpyplot.show()
# nodebox section end

# Random dataset: 600 sorted inputs and two noisy targets
# (pi * sin(x) and pi * cos(x))
rng = np.random.RandomState(1)
X = np.sort(200 * rng.rand(600, 1) - 100, axis=0)
y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
y += 0.5 - rng.rand(*y.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=400, random_state=4)

max_depth = 30

# One forest per target through the multi-output wrapper ...
regr_multirf = MultiOutputRegressor(
    RandomForestRegressor(max_depth=max_depth, random_state=0))
regr_multirf.fit(X_train, y_train)

# ... and a single forest fitted on both targets at once.
regr_rf = RandomForestRegressor(max_depth=max_depth, random_state=2)
regr_rf.fit(X_train, y_train)

# Predict on new data
y_multirf = regr_multirf.predict(X_test)
y_rf = regr_rf.predict(X_test)

# Plot the results
s = 50
a = 0.4
plt.figure()
plt.scatter(y_test[:, 0],
            y_test[:, 1],
            edgecolor='k',
            c="navy",
            s=s,
            marker="s",
            alpha=a,
            label="Data")
# Name of the target column in train.csv
feature = "Diabetes"

# get X and y data
train = pd.read_csv("train.csv", delimiter=",")
train = train.drop_duplicates()  # ensure no duplicates
y_train = train[feature].to_frame()
names = y_train[feature].unique()
# ``axis`` is passed by keyword: pandas 2.x no longer accepts it as a
# positional argument of ``DataFrame.drop``.
X_train = train.drop(feature, axis=1)
X_names = list(X_train)

# Get test data
test = pd.read_csv("test.csv", delimiter=",")
X_test = test

max_depth = 3

# Forest wrapped in the multi-output regressor (target kept as a
# one-column frame)
regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=max_depth))
regr_multirf.fit(X_train, y_train)

# Plain forest with 20 trees, fitted on the same data
regr_rf = RandomForestRegressor(n_estimators=20, max_depth=max_depth)
regr_rf.fit(X_train, y_train)

# Predict on new data
y_multirf = regr_multirf.predict(X_test)
y_rf = regr_rf.predict(X_test)

# put predictions into csv (only the multi-output predictions are written)
IDs = pd.DataFrame(X_test["ID"])
y_pred = pd.DataFrame(y_multirf)
pred_data = IDs.join(y_pred)
pred_data.columns = ['ID', 'Prediction']
pred_data.to_csv(path_or_buf="prediction_multirf.csv", index=False)