def OMP_cv(problem, **kwargs):
    r"""High level description.

    Requirements
    ------------
    kwargs['choose'] must be a positive integer
    kwargs['coef_tolerance'] must be a nonnegative float

    Returns
    -------
    output : tuple
        (optimum, maximum)
    """
    # Stack each datum's value vector into a single 2-D array.
    design = numpy.array([datum['data']['values'] for datum in problem.data])
    goal_values = problem.goal['data']['values']

    # Cross-validated OMP, capped at kwargs['choose'] iterations.
    selector = OrthogonalMatchingPursuitCV(max_iter=kwargs['choose'])
    selector.fit(design.T, goal_values)

    # Keep only the data whose fitted coefficient is meaningfully non-zero.
    optimum = []
    for position, weight in enumerate(selector.coef_):
        if abs(weight) > kwargs['coef_tolerance']:
            optimum.append(problem.data[position])

    # R^2 of the fit on the training design.
    maximum = selector.score(design.T, goal_values)
    return (optimum, maximum)
def plot_omp():
    """Demonstrate sparse-signal recovery with OMP on a synthetic problem."""
    n_components, n_features = 512, 100
    n_nonzero_coefs = 17

    # generate the data: y = Xw with |w|_0 = n_nonzero_coefs
    y, X, w = make_sparse_coded_signal(n_samples=1,
                                       n_components=n_components,
                                       n_features=n_features,
                                       n_nonzero_coefs=n_nonzero_coefs,
                                       random_state=0)
    idx, = w.nonzero()

    # distort the clean signal
    y_noisy = y + 0.05 * np.random.randn(len(y))

    def _stem_panel(panel, title, positions, heights):
        # One subplot: stem plot of the non-zero coefficients.
        plt.subplot(4, 1, panel)
        plt.xlim(0, 512)
        plt.title(title)
        plt.stem(positions, heights, use_line_collection=True)

    plt.figure(figsize=(7, 7))

    # panel 1: the true sparse signal
    _stem_panel(1, "Sparse signal", idx, w[idx])

    # panel 2: noise-free reconstruction
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs)
    omp.fit(X, y)
    coef = omp.coef_
    idx_r, = coef.nonzero()
    _stem_panel(2, "Recovered signal from noise-free measurements",
                idx_r, coef[idx_r])

    # panel 3: reconstruction from noisy measurements
    omp.fit(X, y_noisy)
    coef = omp.coef_
    idx_r, = coef.nonzero()
    _stem_panel(3, "Recovered signal from noisy measurements",
                idx_r, coef[idx_r])

    # panel 4: sparsity level chosen by cross-validation
    omp_cv = OrthogonalMatchingPursuitCV()
    omp_cv.fit(X, y_noisy)
    coef = omp_cv.coef_
    idx_r, = coef.nonzero()
    _stem_panel(4, "Recovered signal from noisy measurements with CV",
                idx_r, coef[idx_r])

    plt.subplots_adjust(0.06, 0.04, 0.94, 0.90, 0.20, 0.38)
    plt.suptitle('Sparse signal recovery with Orthogonal Matching Pursuit',
                 fontsize=16)
    plt.show()
def test_omp_cv():
    # CV should recover the true sparsity level on the first target column.
    target = y[:, 0]
    expected_coef = gamma[:, 0]
    ompcv = OrthogonalMatchingPursuitCV(normalize=True, fit_intercept=False,
                                        max_iter=10, cv=5)
    ompcv.fit(X, target)
    assert_equal(ompcv.n_nonzero_coefs_, n_nonzero_coefs)
    assert_array_almost_equal(ompcv.coef_, expected_coef)

    # A plain OMP refit with the CV-chosen sparsity must agree with the CV fit.
    omp = OrthogonalMatchingPursuit(normalize=True, fit_intercept=False,
                                    n_nonzero_coefs=ompcv.n_nonzero_coefs_)
    omp.fit(X, target)
    assert_array_almost_equal(ompcv.coef_, omp.coef_)
class _OrthogonalMatchingPursuitCVImpl:
    """Thin adapter that forwards fit/predict to a wrapped ``Op`` estimator."""

    def __init__(self, **hyperparams):
        # Keep the raw hyperparameters and build the wrapped model once.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when provided."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        return self._wrapped_model.predict(X)
def test_omp_cv():
    # FIXME: This test is unstable on Travis, see issue #3190 for more detail.
    check_skip_travis()
    target = y[:, 0]
    expected_coef = gamma[:, 0]

    # CV should recover the true sparsity level on the first target column.
    ompcv = OrthogonalMatchingPursuitCV(normalize=True, fit_intercept=False,
                                        max_iter=10, cv=5)
    ompcv.fit(X, target)
    assert_equal(ompcv.n_nonzero_coefs_, n_nonzero_coefs)
    assert_array_almost_equal(ompcv.coef_, expected_coef)

    # A plain OMP refit with the CV-chosen sparsity must agree with the CV fit.
    omp = OrthogonalMatchingPursuit(normalize=True, fit_intercept=False,
                                    n_nonzero_coefs=ompcv.n_nonzero_coefs_)
    omp.fit(X, target)
    assert_array_almost_equal(ompcv.coef_, omp.coef_)
def train_regression_model(X, y, model_type='elastic cv', cv=3, extra_params=None):
    '''Wrapper function to train various regression models with X,y input,
    where extra params can be passed to override any default parameters.

    Parameters
    ----------
    X, y : array-like
        Training features and target.
    model_type : str
        One of 'linear', 'elastic cv', 'omp cv', 'lars cv', 'ridge cv' or
        'full lightgbm' (case-insensitive).
    cv : int
        Number of cross-validation folds for the CV estimators.
    extra_params : dict, optional
        Extra keyword arguments forwarded to ``Train_Light_GBM`` (only used
        for 'full lightgbm').

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If ``model_type`` is not recognised.
    '''
    # BUG FIX: the original used a mutable default (``extra_params={}``),
    # which is shared across calls; use None and create a fresh dict.
    if extra_params is None:
        extra_params = {}
    model_type = model_type.lower()
    if model_type == 'linear':
        model = LinearRegression(fit_intercept=True)
    elif model_type == 'elastic cv':
        model = ElasticNetCV(cv=cv)
    elif model_type == 'omp cv':
        model = OrthogonalMatchingPursuitCV(cv=cv)
    elif model_type == 'lars cv':
        model = LarsCV(cv=cv)
    elif model_type == 'ridge cv':
        model = RidgeCV(cv=cv)
    elif model_type == 'full lightgbm':
        # Train_Light_GBM fits internally, so return it without calling fit().
        return Train_Light_GBM(X, y, int_cv=cv, regression=True, **extra_params)
    else:
        # Previously an unknown model_type crashed with UnboundLocalError.
        raise ValueError('Unknown model_type: {!r}'.format(model_type))
    model.fit(X, y)
    return model
def createOrthogonalMatchingPursuitRegressor(params=None):
    """Build an OMP regressor (plain or CV variant) plus its tuning params."""
    info("Creating Orthogonal Matching Pursuit Regressor", ind=4)

    ## Params — merge defaults from both estimator flavours
    params = mergeParams(OrthogonalMatchingPursuit(), params)
    params = mergeParams(OrthogonalMatchingPursuitCV(), params)
    tuneParams = getOrthogonalMatchingPursuitRegressorParams()

    ## estimator — only an explicit ``cv: True`` selects the CV variant
    use_cv = params.get('cv') is True
    if use_cv:
        info("Using Built-In Cross Validation With Parameters", ind=4)
        estimator = OrthogonalMatchingPursuitCV()
    else:
        info("Without Parameters", ind=4)
        estimator = OrthogonalMatchingPursuit()

    return {"estimator": estimator, "params": tuneParams}
def get_model_by_name(model_name):
    """Look up a fresh, unfitted estimator by its display name.

    Raises KeyError for unknown names, exactly as direct dict indexing did.
    """
    catalogue = {
        'Linear Regression': LinearRegression(),
        'Lars CV': LarsCV(cv=10),
        'Lasso CV': LassoCV(cv=10),
        'Ridge CV': RidgeCV(cv=10),
        'Elastic Net CV': ElasticNetCV(cv=10),
        'Orthogonal Matching Pursuit CV': OrthogonalMatchingPursuitCV(cv=10),
        'Decision Tree Regressor': DecisionTreeRegressor(max_depth=3),
    }
    return catalogue[model_name]
def predict(self):
    """
    trains the scikit-learn python machine learning algorithm library function
    https://scikit-learn.org

    then passes the trained algorithm the features set and returns the
    predicted y test values form

    the function then compares the y_test values from scikit-learn predicted
    to y_test values passed in then returns the accuracy

    Returns
    -------
    float
        Accuracy of the predictions against ``self.y_test`` as computed by
        ``OneHotPredictor.get_accuracy``; also stored on ``self.acc``.
    """
    # BUG FIX: removed the unused local ``n_nonzero_coefs = 17`` — the CV
    # estimator selects the sparsity level itself, so the constant was dead.
    algorithm = OrthogonalMatchingPursuitCV()
    algorithm.fit(self.X_train, self.y_train)
    y_pred = list(algorithm.predict(self.X_test))
    self.acc = OneHotPredictor.get_accuracy(y_pred, self.y_test)
    return self.acc
def test_model_orthogonal_matching_pursuit_cv(self):
    # Fit on the shared regression fixture, convert to ONNX, then round-trip.
    fitted, X = fit_regression_model(OrthogonalMatchingPursuitCV())
    onnx_model = convert_sklearn(
        fitted,
        "orthogonal matching pursuit cv",
        [("input", FloatTensorType([None, X.shape[1]]))])
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(X,
                        fitted,
                        onnx_model,
                        verbose=False,
                        basename="SklearnOrthogonalMatchingPursuitCV-Dec4")
def get_feature_coefficients(self, norm_prior=1):
    """
    get feature coefficients using linear regression. Linear models penalized
    with the L1 norm have sparse solutions: many of their estimated
    coefficients are zero.

    Args:
        norm_prior: 1 for L1-norm as default. use L0 to get the sparsest
            result. 2 selects an L2 (Ridge) penalty.

    Returns:
        The coefficient array of the selected model (also stored on
        ``self.coef_``), or None for an invalid ``norm_prior``.
    """
    alphas = np.logspace(-4, -0.5, 30)
    tuned_parameters = [{'alpha': alphas}]
    coefficient_value = None
    if norm_prior == 0:
        # L0-norm: OMP with cross-validated sparsity level.
        model = OrthogonalMatchingPursuitCV()
        model.fit(self.X_df.values, self.y_df.values)
        coefficient_value = model.coef_
    elif norm_prior == 1:
        # L1-norm: Lasso with grid-searched alpha.
        lasso = Lasso(random_state=0)
        n_folds = 3
        # BUG FIX: refit must be True — with refit=False GridSearchCV does
        # not expose best_estimator_, and the access below raised
        # AttributeError.
        gridsearch = GridSearchCV(lasso,
                                  tuned_parameters,
                                  cv=n_folds,
                                  refit=True)
        gridsearch.fit(self.X_df.values, self.y_df.values)
        coefficient_value = gridsearch.best_estimator_.coef_
    elif norm_prior == 2:
        # L2-norm: Ridge with grid-searched alpha.
        ridge = Ridge(random_state=0)
        n_folds = 3
        # Same refit fix as the L1 branch above.
        gridsearch = GridSearchCV(ridge,
                                  tuned_parameters,
                                  cv=n_folds,
                                  refit=True)
        gridsearch.fit(self.X_df.values, self.y_df.values)
        coefficient_value = gridsearch.best_estimator_.coef_
    else:
        print("invalid norm!")
    self.coef_ = coefficient_value
    return coefficient_value
def solve_preconditioned_orthogonal_matching_pursuit(basis_matrix_func,
                                                     samples,
                                                     values,
                                                     precond_func,
                                                     tol=1e-8):
    """Solve a preconditioned sparse regression problem with OMP.

    The basis matrix and right-hand side are row-weighted by the
    preconditioner before fitting; the fitted intercept is folded back into
    the constant-basis coefficient.
    """
    basis_matrix = basis_matrix_func(samples)
    weights = precond_func(basis_matrix, samples)
    basis_matrix = basis_matrix * weights[:, np.newaxis]
    rhs = values * weights[:, np.newaxis]

    # A single basis column (or a positive tolerance) makes plain OMP
    # sufficient; otherwise cross-validation picks the sparsity level.
    use_plain = basis_matrix.shape[1] == 1 or tol > 0
    if use_plain:
        solver = OrthogonalMatchingPursuit(tol=tol)
    else:
        solver = OrthogonalMatchingPursuitCV(cv=min(samples.shape[1], 10))
    fitted = solver.fit(basis_matrix, rhs)
    coef = solver.coef_
    coef[0] += fitted.intercept_
    return coef[:, np.newaxis]
def train_regression_model(X, y, model_type='elastic', cv=3):
    """Train one of several regression models on ``X``/``y``.

    Parameters
    ----------
    X, y : array-like
        Training features and target.
    model_type : str
        One of 'linear', 'elastic cv', 'omp cv', 'lars cv', 'ridge cv',
        'simple xgboost', 'simple lightgbm' or 'full lightgbm'.
    cv : int
        Number of cross-validation folds for the CV estimators.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If ``model_type`` is not recognised.
    """
    # NOTE(review): the default model_type 'elastic' matches no branch below
    # ('elastic cv' is the closest) — previously the default call path
    # crashed with UnboundLocalError; it now raises a clear ValueError.
    # Confirm with callers whether 'elastic' was meant to map to ElasticNetCV.
    if model_type == 'linear':
        model = LinearRegression(fit_intercept=True)
    elif model_type == 'elastic cv':
        model = ElasticNetCV(cv=cv)
    elif model_type == 'omp cv':
        model = OrthogonalMatchingPursuitCV(cv=cv)
    elif model_type == 'lars cv':
        model = LarsCV(cv=cv)
    elif model_type == 'ridge cv':
        model = RidgeCV(cv=cv)
    elif model_type == 'simple xgboost':
        model = XGBRegressor()
    elif model_type == 'simple lightgbm':
        model = LGBMRegressor()
    elif model_type == 'full lightgbm':
        # train_light_gbm_regressor fits internally; return it directly.
        return train_light_gbm_regressor(X, y, cv, n_params=10, test_size=.2)
    else:
        raise ValueError('Unknown model_type: {!r}'.format(model_type))
    model.fit(X, y)
    return model
def fit_linear_model(basis_matrix, train_vals, solver_type, **kwargs):
    """Fit a cross-validated sparse linear model to the first output column.

    Returns the coefficient column vector (with the intercept folded into
    the constant-basis entry) and the training R^2 score.
    """
    # kwargs['cv'] is required for every solver; a missing key raises
    # KeyError here, just as building the dict did in the original.
    cv = kwargs['cv']
    solvers = {
        'lasso_lars': LassoLarsCV(cv=cv).fit,
        'lasso': LassoCV(cv=cv).fit,
        'lars': LarsCV(cv=cv).fit,
        'omp': OrthogonalMatchingPursuitCV(cv=cv, verbose=5).fit
    }
    assert train_vals.ndim == 2

    if solver_type not in solvers:
        msg = f'Solver type {solver_type} not supported\n'
        msg += 'Supported solvers are:\n'
        for key in solvers.keys():
            msg += f'\t{key}\n'
        raise Exception(msg)

    fit = solvers[solver_type]
    res = fit(basis_matrix, train_vals[:, 0])
    cv_score = res.score(basis_matrix, train_vals[:, 0])
    coef = res.coef_[:, np.newaxis]
    coef[0] = res.intercept_
    return coef, cv_score
def check_w(w=[12, 24, 36, 48, 60]):
    '''
    robustness check for w_min, save the prediction results (Avew window)
    and OOS R_square

    Parameters
    ----------
    w: possible w_min (list)
    '''
    for w_min in w:
        # linear ML predictions, one per cross-validated estimator
        ridge_pred = linear_prediction(RidgeCV(), w_min=w_min,
                                       window_type="Avew")
        lasso_pred = linear_prediction(LassoCV(cv=5), w_min=w_min,
                                       window_type="Avew")
        enet_pred = linear_prediction(ElasticNetCV(cv=5), w_min=w_min,
                                      window_type="Avew")
        lars_pred = linear_prediction(LarsCV(cv=5), w_min=w_min,
                                      window_type="Avew")
        omp_pred = linear_prediction(OrthogonalMatchingPursuitCV(cv=5),
                                     w_min=w_min, window_type="Avew")
        # kitchen-sink multiple-regression benchmark
        sink_pred = MR(w_min=w_min, window_type="Avew")

        all_pre = pd.DataFrame({
            'Kintchen Sink': sink_pred,
            "ridge": ridge_pred,
            "lasso": lasso_pred,
            "elasticnet": enet_pred,
            "lars": lars_pred,
            "OMP": omp_pred,
        })
        # forecast combination: mean of the ML columns (all but the first)
        all_pre['FC'] = all_pre.iloc[:, 1:].mean(axis=1)

        # save the prediction results
        all_pre.to_csv(
            os.path.join(path, "稳健性检验", "w_min", "预测结果",
                         "w_min=" + str(w_min) + ".csv"))
        # R2 test
        R2_test(all_pre, name="w_min=" + str(w_min) + ".csv")
        # then you need move the result on your own
def choose_ML_alg(self):
    """Return the list of candidate regressors to evaluate."""
    # Linear-family estimators, in the original evaluation order.
    linear_family = [
        RANSACRegressor(),
        HuberRegressor(),
        LinearRegression(),
        ElasticNet(),
        ElasticNetCV(),
        Lars(),
        Lasso(),
        LassoLars(),
        LassoLarsIC(),
        OrthogonalMatchingPursuit(),
        OrthogonalMatchingPursuitCV(),
        Ridge(),
        SGDRegressor(),
    ]
    # Tree/ensemble-family estimators.
    ensemble_family = [
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        AdaBoostRegressor(),
        NGBRegressor(Dist=Normal),
        DecisionTreeRegressor(),
    ]
    return linear_family + ensemble_family
def _ompcv(*, train, test, x_predict=None, metrics, copy=True,
           fit_intercept=True, normalize=True, max_iter=None, cv=None,
           n_jobs=None, verbose=False):
    """For more info visit :
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.OrthogonalMatchingPursuitCV.html#sklearn.linear_model.OrthogonalMatchingPursuitCV

    Fits OrthogonalMatchingPursuitCV on ``train`` and scores its predictions
    on ``test`` with the requested metric ('mse', 'rmse' or 'mae').

    Returns
    -------
    tuple
        (model name, accuracy, predictions for ``x_predict`` or None).

    Raises
    ------
    ValueError
        If ``metrics`` is not one of 'mse', 'rmse', 'mae'.
    """
    model = OrthogonalMatchingPursuitCV(fit_intercept=fit_intercept,
                                        copy=copy,
                                        normalize=normalize,
                                        max_iter=max_iter,
                                        cv=cv,
                                        n_jobs=n_jobs,
                                        verbose=verbose)
    model.fit(train[0], train[1])
    model_name = 'OrthogonalMatchingPursuitCV'
    y_hat = model.predict(test[0])

    # BUG FIX: an unrecognised metric previously left ``accuracy`` unbound
    # and crashed later with UnboundLocalError; dispatch and validate here.
    scorers = {'mse': _mse, 'rmse': _rmse, 'mae': _mae}
    if metrics not in scorers:
        raise ValueError("metrics must be one of 'mse', 'rmse', 'mae'")
    accuracy = scorers[metrics](test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)
    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
# Run every candidate regressor through test_regressor, threading the
# results frame ``df`` through each call in the original order.
# (sklearn estimator constructors only stash parameters, so building the
# list up front is equivalent to constructing each one just before its call.)
_regressors = [
    MLPRegressor(random_state=1,
                 activation='logistic',
                 solver='sgd',
                 learning_rate='adaptive',
                 learning_rate_init=0.013000000000000001,
                 early_stopping=True,
                 hidden_layer_sizes=(140, 140),
                 max_iter=10000,
                 momentum=0.9697272727272728),
    LassoCV(cv=5),
    LassoLarsCV(cv=5),
    RidgeCV(cv=5),
    LinearRegression(),
    ElasticNetCV(cv=5),
    OrthogonalMatchingPursuitCV(cv=5),
    ARDRegression(compute_score=True, copy_X=True),
    # test_regressor(LogisticRegressionCV(cv=5)) - it's used for classification
    SGDRegressor(),
    PassiveAggressiveRegressor(),
    RANSACRegressor(),
    TheilSenRegressor(copy_X=True),
    HuberRegressor(),
    AdaBoostRegressor(n_estimators=1000),
    BaggingRegressor(n_estimators=1000),
    ExtraTreesRegressor(n_estimators=1000),
    GradientBoostingRegressor(n_estimators=1000),
    RandomForestRegressor(n_estimators=1000),
    GaussianProcessRegressor(),
    # df = test_regressor(IsotonicRegression(), df) - has errors
    LinearSVR(),
]
for _reg in _regressors:
    df = test_regressor(_reg, df)
dat_l, alg_l, f_l, ab_l, sq_l, cp_l = run_gridSearch(dataname, fold, model_fn, algname, modelCV) dataset_l += dat_l algoritmo_l += alg_l fold_l += f_l mae_l += ab_l rmse_l += sq_l cplx_l += cp_l mse = np.mean(sq_l) print(f'{algname}: {mse}') print('\n') algname = 'IT-ELM (OMP)' modelCV = OrthogonalMatchingPursuitCV(n_jobs=-1) dat_l, alg_l, f_l, ab_l, sq_l, cp_l = run_gridSearch(dataname, fold, model_fn, algname, modelCV) dataset_l += dat_l algoritmo_l += alg_l fold_l += f_l mae_l += ab_l rmse_l += sq_l cplx_l += cp_l mse = np.mean(sq_l) print(f'{algname}: {mse}') print('\n')
coef = omp.coef_ (idx_r, ) = coef.nonzero() plt.subplot(4, 1, 2) plt.xlim(0, 512) plt.title("Recovered signal from noise-free measurements") plt.stem(idx_r, coef[idx_r], use_line_collection=True) # plot the noisy reconstruction omp.fit(X, y_noisy) coef = omp.coef_ (idx_r, ) = coef.nonzero() plt.subplot(4, 1, 3) plt.xlim(0, 512) plt.title("Recovered signal from noisy measurements") plt.stem(idx_r, coef[idx_r], use_line_collection=True) # plot the noisy reconstruction with number of non-zeros set by CV omp_cv = OrthogonalMatchingPursuitCV(normalize=False) omp_cv.fit(X, y_noisy) coef = omp_cv.coef_ (idx_r, ) = coef.nonzero() plt.subplot(4, 1, 4) plt.xlim(0, 512) plt.title("Recovered signal from noisy measurements with CV") plt.stem(idx_r, coef[idx_r], use_line_collection=True) plt.subplots_adjust(0.06, 0.04, 0.94, 0.90, 0.20, 0.38) plt.suptitle("Sparse signal recovery with Orthogonal Matching Pursuit", fontsize=16) plt.show()
explain_weights(clf, unknown_argument=True) @pytest.mark.parametrize(['reg'], [ [ElasticNet(random_state=42)], [ElasticNetCV(random_state=42)], [HuberRegressor()], [Lars()], [LarsCV(max_n_alphas=10)], [Lasso(random_state=42)], [LassoCV(random_state=42)], [LassoLars(alpha=0.01)], [LassoLarsCV(max_n_alphas=10)], [LassoLarsIC()], [OrthogonalMatchingPursuit(n_nonzero_coefs=10)], [OrthogonalMatchingPursuitCV()], [PassiveAggressiveRegressor(C=0.1, random_state=42)], [Ridge(random_state=42)], [RidgeCV()], [SGDRegressor(random_state=42)], [LinearRegression()], [LinearSVR(random_state=42)], [TheilSenRegressor(random_state=42)], ]) def test_explain_linear_regression(boston_train, reg): assert_explained_weights_linear_regressor(boston_train, reg) @pytest.mark.parametrize(['reg'], [ [Lasso(random_state=42)], [Lasso(fit_intercept=False, random_state=42)],
'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'Medu', 'famsup' ] # convert categorical data to dummy variables student_data = handle_cat_data(cat_data, student_data) # split testing and training data X_train, X_test, y_train, y_test = train_test_split( student_data.drop('failures', axis=1), student_data.failures, test_size=0.25, stratify=student_data.failures) reg_algs_names = [ 'Linear Regression', 'Ridge Regression', 'Lasso Regression', 'Elastic Net Regression', 'Orthongonal Matching Pursuit CV', 'MLP Regressor' ] reg_algs = [ LinearRegression(normalize=True), Ridge(alpha=0, normalize=True), Lasso(alpha=0.01, normalize=False), ElasticNet(random_state=0), OrthogonalMatchingPursuitCV(cv=8, normalize=True), MLPRegressor(max_iter=1000) ] run_reg_models(reg_algs_names, reg_algs, X_train, X_test, y_train, y_test)
def run(seed):
    """Train the full 'age' target pipeline for one random seed.

    Builds five intermediate score features (FNC, AGG, PCA, IM, DL), trains
    a stacked ensemble on top of them, and writes all models, OOF scores and
    test predictions to disk.

    Parameters
    ----------
    seed : int
        Random seed threaded through model names, filenames and estimators.
    """
    # create folders for scores models and preds
    folder_models = './models/age/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/age/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Loading data...')
    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competiton data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')
    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())

    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')
    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)

    # merge data
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])
    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    # BUG FIX: this merge call was truncated in the source
    # ("df = df.merge(dl_df, " with no arguments/close paren); restored the
    # arguments to match every other feature merge and the parallel
    # domain1_var1 pipeline.
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')
    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()

    # split train and test: rows with labels are train, the rest are test
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1
    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['age'].copy().reset_index(drop=True)

    # apply biases (site2 rows get the site-specific bias)
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]
    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]
    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # I. Create fnc score
    print('Creating FNC score...')
    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge', 'Huber', 'OMP']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000, reg_depth=5, normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge(),
        HuberRegressor(epsilon=2.5, alpha=1),
        OrthogonalMatchingPursuit(n_nonzero_coefs=300)
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 5, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 5, names)
    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # II. Create agg score
    print('Creating AGG score...')
    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)
    # define models
    names = ['RGF', 'ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # III. Create pca score
    print('Creating PCA score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge', 'OMP']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 4, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 4, names)
    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # IV. Create im score
    print('Creating IM score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge', 'OMP']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 4, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 4, names)
    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # V. Create dl score
    print('Creating DL score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # VI. Training and predicting procedure
    print('Training has started...')
    print('Reading scores from ', folder_preds)
    # add scores
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train[prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test[prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # create differents datasets
    # linear
    linear_cols = sorted(
        list(
            set(ic_cols + fnc_cols + pca_cols + agg_cols + im_cols) -
            set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)
    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_cols=pca_cols)
    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)
    # dl
    dict_cols = sorted(
        list(
            set(ic_cols + fnc_cols + dl_cols + im_cols + agg_cols) -
            set(['IC_20'])))
    train_dl, test_dl = scale_select_data(train, test, df_scale, dict_cols)

    # learning process on different datasets
    names = ['MLP', 'RGF', 'SVM', 'BR', 'OMP', 'EN', 'KR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        MLPRegressor(activation='tanh', random_state=0),
        RGFRegressor(max_leaf=1500, loss='Abs'),
        NuSVR(C=10, nu=0.4, kernel='rbf'),
        BayesianRidge(),
        OrthogonalMatchingPursuitCV(),
        ElasticNet(alpha=0.5, l1_ratio=0.7, random_state=0),
        KernelRidge(kernel='poly', alpha=0.5)
    ]
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2 +
            [train_dl] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2 +
                        [test_dl] * 2, names, is_blend=False)

    # rewrite folders for models and preds
    folder_models = './models/age/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/age/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)
    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions
    print('Stacking predictions...')
    folds = KFold(n_splits=10, shuffle=True, random_state=0)
    stack = pd.DataFrame(zoo.oof_preds).T
    stack.columns = names
    model_stacker_rgf = RGFRegressor(max_leaf=1000,
                                     reg_depth=25,
                                     verbose=False)
    rgf_pred = cross_val_predict(model_stacker_rgf,
                                 stack,
                                 y.dropna(),
                                 cv=folds,
                                 n_jobs=-1)
    model_stacker_br = BayesianRidge()
    br_pred = cross_val_predict(model_stacker_br,
                                stack,
                                y.dropna(),
                                cv=folds,
                                n_jobs=-1)
    model_stacker_rgf.fit(stack, y.dropna())
    model_stacker_br.fit(stack, y.dropna())

    # save models
    save_pickle(model_stacker_br,
                folder_models + 'BRidge_stack_seed{}'.format(seed))
    save_pickle(model_stacker_rgf,
                folder_models + 'RGF_stack_seed{}'.format(seed))

    # final blended CV metric (0.75 BayesianRidge + 0.25 RGF)
    print('Final age NMAE: {:.5f}'.format(
        NMAE(y, 0.75 * br_pred + 0.25 * rgf_pred)))

    test_preds = pd.DataFrame(preds).T
    test_preds.columns = names
    age_prediction = pd.DataFrame()
    age_prediction['Id'] = test['Id'].values
    age_prediction['pred'] = 0.25 * model_stacker_rgf.predict(
        test_preds) + 0.75 * model_stacker_br.predict(test_preds)
    age_prediction.to_csv(folder_preds + 'age_stack_seed{}.csv'.format(seed),
                          index=False)
    print('age seed pred is saved as',
          folder_preds + 'age_stack_seed{}.csv'.format(seed))
def run(seed):
    """Train, blend and stack domain1_var1 models for one random seed.

    Pipeline:
      1. Load bias tables, detected Site2 ids, raw competition tables and
         pre-computed feature sets; merge everything on 'Id'.
      2. Split into train/test and add site-dependent biases to the test
         features.
      3. Build five intermediate "score" features (fnc/agg/pca/im/dl), each
         the OOF blend of two linear models, saved as .npy artifacts.
      4. Train a heterogeneous model zoo on several feature subsets and
         blend its predictions into the final per-seed submission CSV.

    Parameters
    ----------
    seed : int
        Random seed; also embedded in every saved model/prediction filename.

    Side effects: creates folders, writes .npy/.csv/model files, prints
    progress.  Returns None.
    """
    # create folders for scores models and preds
    folder_models = './models/domain1_var1/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/domain1_var1/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)
    print('Loading data...')
    # load biases (per-column additive corrections, keyed by feature name)
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')
    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')
    # load competiton data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')
    # NOTE(review): DataFrame.append is removed in pandas>=2.0 — this code
    # presumably targets pandas<2; confirm the pinned version.
    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())
    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')
    # concatenate the six 200-PCA chunks column-wise (k0 keeps the 'Id' column)
    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)
    # merge data
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])
    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')
    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()
    # split train and test: rows present in the labels file are train
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1
    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['domain1_var1'].copy().reset_index(drop=True)
    # row positions with a non-NaN target — used later to align OOF scores
    d11_index = list(train['domain1_var1'].dropna().index)
    # apply biases (Site2 rows get the *_site variant, the rest the default)
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]
    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]
    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]
    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)
    # I. Create fnc score
    print('Creating FNC score...')
    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)
    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)
    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # II. Create agg score
    print('Creating AGG score...')
    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)
    # define models
    names = ['ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)
    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # III. Create pca score
    print('Creating PCA score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)
    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)
    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # IV. Create im score
    print('Creating IM score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)
    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)
    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # V. Create dl score
    print('Creating DL score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)
    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)
    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # VI. Training and predicting procedure
    print('Training has started...')
    print('Reading scores from ', folder_preds)
    # add scores: OOF scores align with rows that had a target (d11_index)
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train.loc[d11_index, prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test.loc[:, prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]
    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)
    # create differents datasets
    # linear
    linear_cols = sorted(
        list(set(ic_cols + fnc_cols + pca_cols) - set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)
    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_factor=0.2,
                                                  scale_cols=pca_cols,
                                                  sc=MinMaxScaler())
    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)
    # learning process on different datasets
    names = ['GP', 'SVM1', 'SVM2', 'OMP', 'KR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        GaussianProcessRegressor(DotProduct(), random_state=0),
        NuSVR(C=5, kernel='rbf'),
        NuSVR(C=5, kernel='rbf'),
        OrthogonalMatchingPursuitCV(),
        KernelRidge(kernel='poly', degree=2, alpha=10)
    ]
    # dataset list is positional: 2x score feats, 1x kernel, 2x linear
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2,
                        names,
                        is_blend=True)
    # rewrite folders for models and preds
    folder_models = './models/domain1_var1/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/domain1_var1/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)
    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)
    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)
    # stacking predictions
    print('Stacking predictions...')
    d11_prediction = pd.DataFrame()
    d11_prediction['Id'] = test['Id'].values
    d11_prediction['pred'] = preds
    d11_prediction.to_csv(folder_preds +
                          'domain1_var1_stack_seed{}.csv'.format(seed),
                          index=False)
    print('domain1_var1 seed pred is saved as',
          folder_preds + 'domain1_var1_stack_seed{}.csv'.format(seed))
def __init__(self, **hyperparams):
    """Record the hyperparameters and instantiate the wrapped estimator.

    Every keyword argument is kept on ``self._hyperparams`` and forwarded
    unchanged to the underlying ``Op`` model.
    """
    self._hyperparams = hyperparams
    self._wrapped_model = Op(**hyperparams)
models_summary.append(evaluate_model('Ridge', Ridge(alpha=alpha, max_iter=max_iter))) models_summary.append(evaluate_model('Ridge CV', RidgeCV(alphas=alphas))) models_summary.append(evaluate_model('Kernel Ridge', KernelRidge(alpha=alpha))) models_summary.append(evaluate_model('Elastic Net', ElasticNet(alpha=alpha, max_iter=max_iter))) models_summary.append(evaluate_model('Elastic Net CV', ElasticNetCV(alphas=alphas, max_iter=max_iter))) models_summary.append(evaluate_model('Bayesian Ridge', BayesianRidge(n_iter=max_iter))) models_summary.append(evaluate_model('Orthogonal Matching Pursuit', OrthogonalMatchingPursuit())) models_summary.append(evaluate_model('Orthogonal Matching Pursuit CV', OrthogonalMatchingPursuitCV())) print('Models sorted by confidence') for model_summary in sorted(models_summary, key=itemgetter('confidence'), reverse=True): print('| {} | {}% | {} | {} | {} |'.format( model_summary['name'], round(model_summary['confidence'], 4), round(model_summary['mae'], 3), round(model_summary['mse'], 3), round(model_summary['rmse'], 3), )) print('Models sorted by RSME') for model_summary in sorted(models_summary, key=itemgetter('rmse')): print('| {} | {}% | {} | {} | {} |'.format( model_summary['name'],
idx_r, = coef.nonzero() plt.subplot(4, 1, 2) plt.xlim(0, 512) plt.title("Recovered signal from noise-free measurements") plt.stem(idx_r, coef[idx_r]) # plot the noisy reconstruction ############################### omp.fit(X, y_noisy) coef = omp.coef_ idx_r, = coef.nonzero() plt.subplot(4, 1, 3) plt.xlim(0, 512) plt.title("Recovered signal from noisy measurements") plt.stem(idx_r, coef[idx_r]) # plot the noisy reconstruction with number of non-zeros set by CV ################################################################## omp_cv = OrthogonalMatchingPursuitCV() omp_cv.fit(X, y_noisy) coef = omp_cv.coef_ idx_r, = coef.nonzero() plt.subplot(4, 1, 4) plt.xlim(0, 512) plt.title("Recovered signal from noisy measurements with CV") plt.stem(idx_r, coef[idx_r]) plt.subplots_adjust(0.06, 0.04, 0.94, 0.90, 0.20, 0.38) plt.suptitle('Sparse signal recovery with Orthogonal Matching Pursuit', fontsize=16) plt.show()
def GetAllModelsForComparison(X_train, Y_train):
    """Return a mapping from model name to a freshly constructed estimator.

    Parameters
    ----------
    X_train, Y_train : unused
        Kept for interface compatibility with callers; building the
        (unfitted) estimators does not require the data.

    Returns
    -------
    dict
        ``{name: estimator_instance}`` for a broad sweep of sklearn-style
        models (plus a few mixins/utility classes the original included).

    Notes
    -----
    The original dict literal repeated several keys (``BaseEstimator``,
    ``ClassifierMixin``, ``RegressorMixin``, ``SGDClassifier``,
    ``SGDRegressor``, ``LabelBinarizer``, ``TransformerMixin``,
    ``MetaEstimatorMixin``, ``Parallel``).  Python keeps only the last
    occurrence of a duplicated key, so the earlier instances were built
    and silently discarded.  The duplicates are removed here; the
    resulting mapping is unchanged.
    """
    models = {
        'ARDRegression': ARDRegression(),
        'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(),
        'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),
        #'Huber': Huber(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(),
        'LarsCV': LarsCV(),
        'Lasso': Lasso(),
        'LassoCV': LassoCV(),
        'LassoLars': LassoLars(),
        'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(),
        'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(),
        'RANSACRegressor': RANSACRegressor(),
        #'RandomizedLasso': RandomizedLasso(),
        #'RandomizedLogisticRegression': RandomizedLogisticRegression(),
        'Ridge': Ridge(),
        'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(),
        'TheilSenRegressor': TheilSenRegressor(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'KernelRidge': KernelRidge(),
        'RegressorMixin': RegressorMixin(),
        'LinearSVC': LinearSVC(),
        'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(),
        'NuSVR': NuSVR(),
        'OneClassSVM': OneClassSVM(),
        'SVC': SVC(),
        'SVR': SVR(),
        #'BallTree': BallTree(),
        #'DistanceMetric': DistanceMetric(),
        #'KDTree': KDTree(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),
        #'LSHForest': LSHForest(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        #'GaussianProcess': GaussianProcess(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        'CCA': CCA(),
        'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(),
        'PLSSVD': PLSSVD(),
        #'ABCMeta': ABCMeta(),
        #'BaseDiscreteNB': BaseDiscreteNB(),
        #'BaseNB': BaseNB(),
        'BernoulliNB': BernoulliNB(),
        'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(),
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        #'BaseEnsemble': BaseEnsemble(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        #'VotingClassifier': VotingClassifier(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        #'OneVsOneClassifier': OneVsOneClassifier(),
        #'OneVsRestClassifier': OneVsRestClassifier(),
        #'OutputCodeClassifier': OutputCodeClassifier(),
        'Parallel': Parallel(),
        #'ABCMeta': ABCMeta(),
        #'ClassifierChain': ClassifierChain(),
        #'MultiOutputClassifier': MultiOutputClassifier(),
        #'MultiOutputEstimator': MultiOutputEstimator(),
        #'MultiOutputRegressor': MultiOutputRegressor(),
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        'IsotonicRegression': IsotonicRegression(),
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(),
        'MLPRegressor': MLPRegressor()
    }
    return models
pl.subplot(4, 1, 2) pl.xlim(0, 512) pl.title("Recovered signal from noise-free measurements") pl.stem(idx_r, coef[idx_r]) # plot the noisy reconstruction ############################### omp.fit(X, y_noisy) coef = omp.coef_ idx_r, = coef.nonzero() pl.subplot(4, 1, 3) pl.xlim(0, 512) pl.title("Recovered signal from noisy measurements") pl.stem(idx_r, coef[idx_r]) # plot the noisy reconstruction with number of non-zeros set by CV ################################################################## omp_cv = OrthogonalMatchingPursuitCV() omp_cv.fit(X, y_noisy) coef = omp_cv.coef_ idx_r, = coef.nonzero() pl.subplot(4, 1, 4) pl.xlim(0, 512) pl.title("Recovered signal from noisy measurements with CV") pl.stem(idx_r, coef[idx_r]) pl.subplots_adjust(0.06, 0.04, 0.94, 0.90, 0.20, 0.38) pl.suptitle('Sparse signal recovery with Orthogonal Matching Pursuit', fontsize=16) pl.show()
# Regression model zoo for the "Auto" dataset: each call fits a regressor and
# exports it under the given PMML name via build_auto().
# NOTE(review): in similar upstream test scripts these calls sit inside an
# `if "Auto" in datasets:` guard that is outside this view — confirm the
# intended indentation/context before moving this code.
build_auto(DecisionTreeRegressor(min_samples_leaf = 2, random_state = 13), "DecisionTreeAuto", compact = False)
build_auto(BaggingRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAuto")
build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
build_auto(ElasticNetCV(cv = 3, random_state = 13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(n_estimators = 10, min_samples_leaf = 5, random_state = 13), "ExtraTreesAuto")
build_auto(GBDTLMRegressor(RandomForestRegressor(n_estimators = 7, max_depth = 6, random_state = 13), LinearRegression()), "GBDTLMAuto")
build_auto(GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 6, random_state = 13), ElasticNet(random_state = 13)), "XGBRFLMAuto")
build_auto(GradientBoostingRegressor(init = None, random_state = 13), "GradientBoostingAuto")
build_auto(HistGradientBoostingRegressor(max_iter = 31, random_state = 13), "HistGradientBoostingAuto")
build_auto(HuberRegressor(), "HuberAuto")
build_auto(LarsCV(cv = 3), "LarsAuto")
build_auto(LassoCV(cv = 3, random_state = 13), "LassoAuto")
build_auto(LassoLarsCV(cv = 3), "LassoLarsAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(BaggingRegressor(LinearRegression(), max_features = 0.75, random_state = 13), "LinearRegressionEnsembleAuto")
build_auto(OrthogonalMatchingPursuitCV(cv = 3), "OMPAuto")
build_auto(RandomForestRegressor(n_estimators = 10, min_samples_leaf = 3, random_state = 13), "RandomForestAuto", flat = True)
build_auto(RidgeCV(), "RidgeAuto")
build_auto(StackingRegressor([("ridge", Ridge(random_state = 13)), ("lasso", Lasso(random_state = 13))], final_estimator = GradientBoostingRegressor(n_estimators = 7, random_state = 13)), "StackingEnsembleAuto")
build_auto(TheilSenRegressor(n_subsamples = 31, random_state = 13), "TheilSenAuto")
build_auto(VotingRegressor([("dt", DecisionTreeRegressor(random_state = 13)), ("knn", KNeighborsRegressor()), ("lr", LinearRegression())], weights = [3, 1, 2]), "VotingEnsembleAuto")
build_auto(XGBRFRegressor(n_estimators = 31, max_depth = 6, random_state = 13), "XGBRFAuto")

if "Auto" in datasets:
    # Target-transformed variants are only built when the Auto dataset is
    # selected in this run's dataset list.
    build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto")
    build_auto(TransformedTargetRegressor(LinearRegression(), func = numpy.log, inverse_func = numpy.exp), "TransformedLinearRegressionAuto")

def build_auto_isotonic(regressor, auto_isotonic_X, name):
    # Wrap the given regressor in a single-step PMML pipeline.
    # NOTE(review): only the pipeline construction is visible here; fitting /
    # export presumably follows in the full source — confirm.
    pipeline = PMMLPipeline([
        ("regressor", regressor)
    ])
def RunMP(aligned_data_root_path, output_path):
    """Run Matrix-Profile (stumpy) analyses over aligned TILES fitbit data.

    Four independent stages are toggled by the boolean flags below; as
    committed only the greedy k-motif discovery + OMP reconstruction stage
    is enabled.

    Parameters
    ----------
    aligned_data_root_path : str
        Root folder handed to LoadAlignedTILESData().
    output_path : str
        Unused in the visible code path — TODO confirm it can be removed.

    Side effects: opens matplotlib windows; returns None.
    """
    do_compute_individual_k_motifs = True
    do_compute_anchored_chains = False
    do_compute_semantic_segmentation = False
    do_compute_multimodal_mp = False

    window_size = 1300
    #window_size = 1500

    data_dict = LoadAlignedTILESData(aligned_data_root_path)

    #plt.ion()

    # NOTE(review): only the first participant is processed — presumably a
    # development shortcut; confirm before a full run.
    pids = list(data_dict.keys())[0:1]
    streams = ['HeartRatePPG', 'StepCount']

    # Compute motifs from the individual MP using a greedy method
    if do_compute_individual_k_motifs:
        num_motifs = 2
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            fitbit_df = fitbit_df.iloc[0:10000, :]  # HACK
            for stream in streams:
                exclusion_signal = fitbit_df[stream].copy()
                # Keep a NaN'd version for MP and interpolated one for OMP
                #nan_replace_value = -1000000
                #fitbit_df[stream][np.isnan(fitbit_df[stream])] = nan_replace_value
                #fitbit_df_smooth = fitbit_df[stream].interpolate(method='linear', axis=0, inplace=False)
                #fitbit_df_smooth = fitbit_df[stream].copy()
                fitbit_df_smooth = exclusion_signal.copy()
                if np.isnan(fitbit_df_smooth[0]
                            ):  # Fill NaNs at the beginning and end
                    idx = 0
                    while np.isnan(fitbit_df_smooth[idx]):
                        idx += 1
                    fitbit_df_smooth[0:idx] = fitbit_df_smooth[idx]
                if np.isnan(fitbit_df_smooth[fitbit_df_smooth.shape[0] - 1]):
                    idx = fitbit_df_smooth.shape[0] - 1
                    while np.isnan(fitbit_df_smooth[idx]):
                        idx -= 1
                    fitbit_df_smooth[idx:] = fitbit_df_smooth[idx]

                # Use Matrix Profile methods to learn a motif dictionary
                motifs = []
                while len(motifs) < num_motifs:
                    #fitbit_mp = stumpy.stump(fitbit_df[stream], m=window_size) # TODO - use the exclusion_signal
                    fitbit_mp = stumpy.stump(
                        exclusion_signal,
                        m=window_size)  # TODO - use the exclusion_signal
                    # Column 0 of stump's output is the matrix-profile
                    # distance; argsort ranks candidate motifs best-first.
                    fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
                    for motif_idx in range(len(fitbit_mp_argsort)):
                        stream_motif_idx = fitbit_mp_argsort[motif_idx]
                        num_nan = np.sum(
                            np.isnan(exclusion_signal.
                                     values[stream_motif_idx:stream_motif_idx +
                                            window_size]))
                        # Avoid finding bad motifs
                        if num_nan >= 5.0 * window_size / 6.0:
                            continue
                        if stream == 'HeartRatePPG':
                            pass
                        break
                    motif_left_idx = fitbit_mp_argsort[motif_idx]
                    motif = fitbit_df_smooth[motif_left_idx:motif_left_idx +
                                             window_size]
                    motif[motif == 0] = 1e-12  # OMP requires non-zeros in the support
                    motifs.append(motif)
                    plt.plot(range(motif_left_idx,
                                   motif_left_idx + window_size),
                             motifs[-1], 'g-', linewidth=5)

                # Build a redundant dictionary from the motifs: every motif
                # is replicated at every possible time shift.
                num_repetitions = len(fitbit_df_smooth) - window_size
                dictionary_mat = csr_matrix(
                    (len(motifs) * num_repetitions, len(fitbit_df_smooth)))
                for motif_idx in range(len(motifs)):
                    motif_values = motifs[motif_idx].values
                    for repeat_idx in range(num_repetitions):
                        # SLOW: TODO - find better way of generating this
                        # matrix. Maybe I can change the sparse encoding
                        # directly and just push extra zeros in front of the
                        # motif sequence? Better yet, why not abandon the
                        # matrix representation and just use a list of motifs
                        # and their starting index in the signal
                        dictionary_mat[motif_idx * num_repetitions +
                                       repeat_idx, repeat_idx:repeat_idx +
                                       window_size] = motif_values

                # Reconstruct the signal using the motif dictionary
                # TODO : Write my own OMP with exclusion of each atom's support. Gram mat?
                # TODO : Use L1 optimization (Lasso)?
                #omp = OrthogonalMatchingPursuit(n_nonzero_coefs=2, fit_intercept=False)
                omp = OrthogonalMatchingPursuitCV(fit_intercept=False)
                omp.fit(dictionary_mat.T, fitbit_df_smooth)
                intercept = omp.intercept_
                coef = omp.coef_
                idx_r = coef.nonzero()
                num_nonzero = omp.n_nonzero_coefs_

                # Hand-rolled greedy OMP alternative, kept for reference:
                #max_nonzero = 20
                #skip_nan_percent = 0.1
                #coef = np.zeros((dictionary_mat.T.shape[1],1))
                #intercept = np.zeros((dictionary_mat.T.shape[0],1))
                #for num_nonzero in range(1,max_nonzero+1):
                #    # Reconstruct the signal using the motif dictionary
                #    best_dict_idx = -1
                #    best_error = np.inf
                #    best_dict_support = None
                #    for dict_idx in range(dictionary_mat.shape[0]):
                #        # SLOW
                #        dict_vec = dictionary_mat[dict_idx,:].toarray().reshape(-1,)
                #        # Find the support
                #        left_support_idx = 0
                #        right_support_idx = len(dict_vec)-1
                #        while dict_vec[left_support_idx] == 0 and left_support_idx < len(dict_vec):
                #            left_support_idx += 1
                #        while dict_vec[right_support_idx] == 0 and right_support_idx >= 0:
                #            right_support_idx -= 1
                #        # Skip mostly NaN regions
                #        if np.sum(np.isnan(exclusion_signal[left_support_idx:right_support_idx+1])) > skip_nan_percent*(right_support_idx-left_support_idx+1):
                #            continue
                #        # Find the best match
                #        residual = exclusion_signal[left_support_idx:right_support_idx+1] - dict_vec[left_support_idx:right_support_idx+1]
                #        np.nan_to_num(residual, copy=False) # Replace NaN with zero
                #        error = np.dot(residual, residual)
                #        if error < best_error:
                #            best_error = error
                #            coef_val = 1 # TODO - constrain between 0.5 and 2?
                #            best_dict_idx = dict_idx
                #            best_dict_support = (left_support_idx, right_support_idx)
                #    if best_dict_idx < 0:
                #        print("No best next dictionary element found")
                #        break
                #    # Update coef
                #    coef_nonzero = (coef != 0).reshape(-1,)
                #    if np.sum(coef_nonzero) > 0:
                #        dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                #        coef_reduced = coef[coef_nonzero]
                #        #prev_fit_signal = np.matmul(dictionary_mat.T, coef)
                #        prev_fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #        prev_residual = fitbit_df_smooth - prev_fit_signal.reshape(-1,)
                #        np.nan_to_num(prev_residual, copy=False) # Replace NaN with zero
                #        prev_error = np.dot(prev_residual, prev_residual)
                #        coef[best_dict_idx] = coef_val
                #        #fit_signal = np.matmul(dictionary_mat.T, coef)
                #        fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #        fit_residual = fitbit_df_smooth - fit_signal.reshape(-1,)
                #        np.nan_to_num(fit_residual, copy=False) # Replace NaN with zero
                #        fit_error = np.dot(fit_residual, fit_residual)
                #    else:
                #        prev_residual = fitbit_df_smooth - np.zeros(len(fitbit_df_smooth))
                #        np.nan_to_num(prev_residual, copy=False) # Replace NaN with zero
                #        prev_error = np.dot(prev_residual, prev_residual)
                #        coef[best_dict_idx] = coef_val
                #        coef_nonzero = (coef != 0).reshape(-1,)
                #        dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                #        coef_reduced = coef[coef_nonzero]
                #        fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #        fit_residual = fitbit_df_smooth - fit_signal.reshape(-1,)
                #        np.nan_to_num(fit_residual, copy=False) # Replace NaN with zero
                #        fit_error = np.dot(fit_residual, fit_residual)
                #    if best_dict_support is not None:
                #        exclusion_signal[best_dict_support[0]:best_dict_support[1]+1] = np.inf
                #    if prev_error < fit_error:
                #        print("Avoiding overfitting...")
                #        coef[best_dict_idx,0] = 0
                #        break

                coef_nonzero = (coef != 0).reshape(-1, )
                dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                coef_reduced = coef[coef_nonzero]
                fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(),
                                       coef_reduced) + intercept

                plt.plot(range(fitbit_df[stream].shape[0]),
                         fitbit_df[stream], 'b-')
                #plt.plot(range(fitbit_df_smooth.shape[0]), fitbit_df_smooth, 'k-')
                plt.plot(range(fitbit_df[stream].shape[0]), fit_signal, 'r--')
                plt.title('OMP (%d coefs) + MP Motifs (%d motifs)' %
                          (num_nonzero, num_motifs))
                plt.xlabel('Time')
                plt.ylabel(stream)
                plt.show()

        # NOTE(review): early return inside this stage; the pdb hook below is
        # unreachable leftover debugging. Indentation reconstructed from the
        # collapsed source — confirm level against the original file.
        return
        pdb.set_trace()

    # Compute individual matrix profiles (stump)
    if do_compute_anchored_chains or do_compute_semantic_segmentation:
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            for stream in streams:
                fitbit_mp = stumpy.stump(fitbit_df[stream], m=window_size)
                if do_compute_anchored_chains:
                    left_mp_idx = fitbit_mp[:, 2]
                    right_mp_idx = fitbit_mp[:, 3]
                    #atsc_idx = 10
                    #anchored_chain = stumpy.atsc(left_mp_idx, right_mp_idx, atsc_idx)
                    all_chain_set, unanchored_chain = stumpy.allc(
                        left_mp_idx, right_mp_idx)
                if do_compute_semantic_segmentation:
                    subseq_len = window_size
                    correct_arc_curve, regime_locations = stumpy.fluss(
                        fitbit_mp[:, 1],
                        L=subseq_len,
                        n_regimes=2,
                        excl_factor=5)
                # Find the first motif with nearly no NaN values in the stream signal
                fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
                for motif_idx in range(len(fitbit_mp_argsort)):
                    stream_motif_idx = fitbit_mp_argsort[motif_idx]
                    num_nan = np.sum(
                        np.isnan(fitbit_df[stream].
                                 values[stream_motif_idx:stream_motif_idx +
                                        window_size]))
                    # Avoid finding bad motifs
                    if num_nan >= 5.0 * window_size / 6.0:
                        continue
                    if stream == 'HeartRatePPG':
                        pass
                        # Check for flat heart rate
                        #nan_like_value = 70
                        #num_valid = np.count_nonzero((fitbit_df[stream] - nan_like_value)[stream_motif_idx:stream_motif_idx+window_size])
                        #if num_valid < window_size - 2:
                        #    continue
                        # Check for linear heart rate over time
                        #residual_threshold = window_size*(4.0**2)
                        #p, res, rank, sing_vals, rcond = np.polyfit(range(window_size), fitbit_df[stream][stream_motif_idx:stream_motif_idx+window_size], deg=1, full=True)
                        #if res < residual_threshold:
                        #    continue
                    break
                num_subplots = 3 if do_compute_semantic_segmentation else 2
                fig, axs = plt.subplots(num_subplots,
                                        sharex=True,
                                        gridspec_kw={'hspace': 0})
                plt.suptitle('Matrix Profile, %s, PID: %s' % (stream, pid),
                             fontsize='30')
                axs[0].plot(fitbit_df[stream].values)
                # Shade the top-2 motif windows on the raw signal
                rect = plt.Rectangle((fitbit_mp_argsort[motif_idx], 0),
                                     window_size,
                                     2000,
                                     facecolor='lightgrey')
                axs[0].add_patch(rect)
                rect = plt.Rectangle((fitbit_mp_argsort[motif_idx + 1], 0),
                                     window_size,
                                     2000,
                                     facecolor='lightgrey')
                axs[0].add_patch(rect)
                axs[0].set_ylabel(stream, fontsize='20')
                axs[1].plot(fitbit_mp[:, 0])
                axs[1].axvline(x=fitbit_mp_argsort[motif_idx],
                               linestyle="dashed")
                axs[1].axvline(x=fitbit_mp_argsort[motif_idx + 1],
                               linestyle="dashed")
                axs[1].set_ylabel('Matrix Profile', fontsize='20')
                if do_compute_anchored_chains:
                    for i in range(unanchored_chain.shape[0]):
                        y = fitbit_df[stream].iloc[
                            unanchored_chain[i]:unanchored_chain[i] +
                            window_size]
                        x = y.index.values
                        axs[0].plot(x, y, linewidth=3)
                if do_compute_semantic_segmentation:
                    axs[2].plot(range(correct_arc_curve.shape[0]),
                                correct_arc_curve,
                                color='C1')
                    axs[0].axvline(x=regime_locations[0], linestyle="dashed")
                    axs[2].axvline(x=regime_locations[0], linestyle="dashed")
                plt.show()

    # Compute multi-dimensional matrix profiles (mstump)
    if do_compute_multimodal_mp:
        # NOTE(review): this stage references `fitbit_mp`/`stream` that are
        # not defined on this path — acknowledged unfinished per the TODO
        # below; it will raise NameError if the flag is enabled.
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            data = fitbit_df.loc[:, streams].values
            mp, mp_indices = stumpy.mstump(data.T, m=window_size)
            #print("Stumpy's mstump function does not handle NaN values. Skipping multi-dimensional MP")
            #break
            # TODO - This code is copied from above. Fix and finish it once mstump supports NaN
            # Find the first motif with nearly no NaN values in the stream signal
            fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
            for motif_idx in range(len(fitbit_mp_argsort)):
                stream_motif_idx = fitbit_mp_argsort[motif_idx]
                num_nan = np.sum(
                    np.isnan(fitbit_df[stream].
                             values[stream_motif_idx:stream_motif_idx +
                                    window_size]))
                # Avoid finding bad motifs
                if num_nan >= 2:
                    continue
                if stream == 'HeartRatePPG':
                    # Check for flat heart rate
                    nan_like_value = 70
                    num_valid = np.count_nonzero(
                        (fitbit_df[stream] -
                         nan_like_value)[stream_motif_idx:stream_motif_idx +
                                         window_size])
                    if num_valid < window_size - 2:
                        continue
                    # Check for linear heart rate over time
                    residual_threshold = window_size * (4.0**2)
                    p, res, rank, sing_vals, rcond = np.polyfit(
                        range(window_size),
                        fitbit_df[stream][stream_motif_idx:stream_motif_idx +
                                          window_size],
                        deg=1,
                        full=True)
                    if res < residual_threshold:
                        continue
                break
            fig, axs = plt.subplots(2, sharex=True, gridspec_kw={'hspace': 0})
            plt.suptitle('Matrix Profile, %s, PID: %s' % (stream, pid),
                         fontsize='30')
            axs[0].plot(fitbit_df[stream].values)
            rect = plt.Rectangle((fitbit_mp_argsort[motif_idx], 0),
                                 window_size,
                                 2000,
                                 facecolor='lightgrey')
            axs[0].add_patch(rect)
            rect = plt.Rectangle((fitbit_mp_argsort[motif_idx + 1], 0),
                                 window_size,
                                 2000,
                                 facecolor='lightgrey')
            axs[0].add_patch(rect)
            axs[0].set_ylabel(stream, fontsize='20')
            axs[1].plot(fitbit_mp[:, 0])
            axs[1].axvline(x=fitbit_mp_argsort[motif_idx], linestyle="dashed")
            axs[1].axvline(x=fitbit_mp_argsort[motif_idx + 1],
                           linestyle="dashed")
            axs[1].set_ylabel('Matrix Profile', fontsize='20')
            plt.show()

    plt.ioff()
    plt.figure()
    plt.plot()
    plt.title('Dummy plot')
    plt.show()
    return