Example #1
def apply_algorithm(algorithm, reductions, x_features, y_labels, model_tasks,
                    components_count, graphs, params):
    model = None
    if algorithm == 'random_forest':
        model = RandomForestClassifier()  # has methods: decision_path(X) & apply
    elif algorithm == 'xgb':
        model = XGBClassifier()
    elif algorithm == 'knn':
        model = KNeighborsClassifier()
    elif algorithm == 'kmeans':
        model = KMeans()
    elif algorithm == 'linear_regression':
        model = LinearRegression()
    elif algorithm == 'logreg':
        model = LogisticRegression()
    elif algorithm == 'ridge':
        model = Ridge()
    elif algorithm == 'lasso':
        model = Lasso()
    elif algorithm == 'elastic_net':
        model = ElasticNet()
    elif algorithm == 'dirichlet':
        model = LatentDirichletAllocation()
    elif algorithm == 'lda':
        model = LinearDiscriminantAnalysis()
    elif algorithm == 'mlp':
        model = MLPClassifier()
    else:
        print('unhandled algorithm', algorithm)
        ''' algorithm will also be null if we're just applying reductions '''

    if model is not None or 'reduce' in model_tasks:
        results = {}
        if algorithm and model:
            for k, v in params[algorithm].items():
                model = model.set_params(**{k: v})
            ''' train the original model '''
            result = model_train(algorithm, model, x_features, y_labels,
                                 model_tasks, components_count, graphs)
            if result:
                results['original'] = result
        if 'reduce' in model_tasks:
            ''' reduce features and train a new model on each reduced feature set '''
            iterative_reduced_features = x_features if 'iterative_reduce' in model_tasks else None
            for reduction in reductions:
                features_to_reduce = iterative_reduced_features if 'iterative_reduce' in model_tasks else x_features
                reduced_feature_result = reduce_features(
                    features_to_reduce, y_labels, components_count, algorithm)
                if reduced_feature_result:
                    print('starting features to reduce', features_to_reduce,
                          'reduced features',
                          reduced_feature_result['features'])
                    ''' update iteratively reduced feature set '''
                    iterative_reduced_features = reduced_feature_result[
                        'features'] if 'iterative_reduce' in model_tasks else iterative_reduced_features
                    ''' train new model on reduced features '''
                    if 'score' in model_tasks or 'train' in model_tasks:
                        reduced_feature_model = model_train(
                            algorithm, model,
                            reduced_feature_result['features'], y_labels,
                            model_tasks, components_count, graphs)
                        if reduced_feature_model:
                            iteration_name = ''.join(['iteration', reduction])
                            results[iteration_name] = reduced_feature_model
        if results:
            if 'apply_most_reduced_features' in model_tasks:
                ''' find most reduced feature set across all reduced feature sets & train new model on that most reduced set'''
                reduced_features = filter_reduced_features(results)
                if reduced_features:
                    reduced_feature_model = model_train(
                        algorithm, model, reduced_features, y_labels,
                        model_tasks, components_count, graphs)
                    if reduced_feature_model:
                        results[
                            'iteration_most_reduced'] = reduced_feature_model
            return results
    return False
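A hypothetical invocation of apply_algorithm could look like the sketch below; the reduction names, task list, and params values are illustrative assumptions, since the original snippet does not show the caller.

# Illustrative call only - X_train / y_train and the reduction names are assumed.
results = apply_algorithm(
    algorithm='random_forest',
    reductions=['pca', 'kbest'],
    x_features=X_train,
    y_labels=y_train,
    model_tasks=['train', 'score', 'reduce'],
    components_count=10,
    graphs=False,
    params={'random_forest': {'n_estimators': 100, 'max_depth': 8}})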
Example #2
    def run_fit(self):
        # Display ConvergenceWarning only once, not for every item for which it occurs
        warnings.simplefilter("once", category=ConvergenceWarning)

        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=1e-4,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            if y.sum() == 0.0:
                continue

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
            URM_train.data[start_pos:end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

            relevant_items_partition = (
                -nonzero_model_coef_value
            ).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(
                -nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[
                relevant_items_partition_sorting]

            for index in range(len(ranking)):

                if numCells == len(rows):
                    rows = np.concatenate(
                        (rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate(
                        (cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate(
                        (values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]

                numCells += 1

            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            if time.time() - start_time_printBatch > 300 or currentItem == n_items - 1:
                print(
                    "Processed {} ( {:.2f}% ) in {:.2f} {}. Items per second: {:.2f}"
                    .format(currentItem + 1,
                            100.0 * float(currentItem + 1) / n_items,
                            new_time_value, new_time_unit,
                            float(currentItem) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix

        self.W_sparse = sps.csr_matrix(
            (values[:numCells], (rows[:numCells], cols[:numCells])),
            shape=(n_items, n_items),
            dtype=np.float32)
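Once W_sparse is built, recommendations are typically produced by multiplying a user's interaction row by the learned item-item weight matrix. A minimal sketch, assuming self.URM is a SciPy CSR matrix of user-item interactions; the recommend method below is not part of the original snippet.

    def recommend(self, user_id, at=10):
        # score every item as a weighted sum of the items the user interacted with
        user_profile = self.URM[user_id]  # 1 x n_items sparse row (assumed CSR)
        scores = user_profile.dot(self.W_sparse).toarray().ravel()
        # exclude items the user has already interacted with
        scores[user_profile.indices] = -np.inf
        return np.argsort(-scores)[:at]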
Example #3
def test_coefficients_graph_regression():
    model = ElasticNet()
    model.fit(X_train, y_train)
    mr.coefficients_graph(X_train, X_test, model, "regression",
                          "regression_test")
Example #4
    # Root mean squared error (RMSE) on the test set is used to evaluate model quality.
    test_Y_pred = lassoLarsIC.predict(test_X)
    print("Test set score:", lassoLarsIC.score(test_X, test_Y))
    print("Test set MSE:", mean_squared_error(test_Y, test_Y_pred))
    print("Test set RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)))
    print("Test set R2:", r2_score(test_Y, test_Y_pred))

    tss, rss, ess, r2 = xss(Y, lassoLarsIC.predict(X))
    print("TSS (Total Sum of Squares):", tss)
    print("RSS (Residual Sum of Squares):", rss)
    print("ESS (Explained Sum of Squares):", ess)
    print("R^2:", r2)

    print("\n********** Testing the ElasticNet class **********")
    # When initializing ElasticNet, specify the hyperparameters alpha and rho;
    # their defaults are 1.0 and 0.5 respectively.
    elasticNet = ElasticNet(alpha=1.0, l1_ratio=0.5)
    # Fit the training set
    elasticNet.fit(train_X, train_Y)
    # Print the model coefficients
    print("Coefficients:", elasticNet.coef_)
    print("Intercept:", elasticNet.intercept_)
    print("Training set R2:", r2_score(train_Y, elasticNet.predict(train_X)))

    # For linear regression models, the mean squared error (MSE) or
    # root mean squared error (RMSE) on the test set is typically used to evaluate model quality.
    test_Y_pred = elasticNet.predict(test_X)
    print("Test set score:", elasticNet.score(test_X, test_Y))
    print("Test set MSE:", mean_squared_error(test_Y, test_Y_pred))
    print("Test set RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)))
    print("Test set R2:", r2_score(test_Y, test_Y_pred))
Example #5
 def set_linear_regressors(self):
     self.estimators = [
         # LinearRegression(),
         Ridge(),
         RidgeCV(),
         Lasso(),
         # MultiTaskLasso(),
         ElasticNet(),
         ElasticNetCV(),
         # MultiTaskElasticNet(),
         Lars(),
         LassoLars(),
         OrthogonalMatchingPursuit(),
         BayesianRidge(),
         # ARDRegression(),
         SGDRegressor(),
         PassiveAggressiveRegressor(),
         HuberRegressor(),
         RandomForestRegressor(),
         GradientBoostingRegressor()
     ]
     self.estimator_params = {}
     self.estimator_params['LinearRegression'] = {
         'fit_intercept': [False, True],
         'normalize': [False, True],
         'n_jobs': [1, 2, 3, 4]
     }
     self.estimator_params['Ridge'] = {
         'alpha': [1, 3, 6, 10],
         'fit_intercept': [False, True],
         'normalize': [False, True]
     }
     self.estimator_params['Lasso'] = {
         'alpha': [1, 3, 6, 10],
         'fit_intercept': [False, True],
         'normalize': [False, True],
         'precompute': [False, True]
     }
     self.estimator_params['Lars'] = {
         'fit_intercept': [False, True],
         'verbose': [1, 3, 6, 10],
         'normalize': [False, True],
         'precompute': [False, True]
     }
     self.estimator_params['LassoLars'] = {
         'alpha': [1, 3, 6, 10],
         'fit_intercept': [False, True],
         'verbose': [1, 3, 6, 10],
         'normalize': [False, True],
         'precompute': [False, True]
     }
     self.estimator_params['OrthogonalMatchingPursuit'] = {
         'n_nonzero_coefs': [1, 3, 6, 10],
         'fit_intercept': [False, True],
         'normalize': [False, True],
         'precompute': [False, True]
     }
     self.estimator_params['BayesianRidge'] = {
         'alpha': [0.0000001, 0.00001, 0.001, 0.1],
         'fit_intercept': [False, True],
         'normalize': [False, True],
         'precompute': [False, True]
     }
     self.estimator_params['SGDRegressor'] = {
         'alpha': [0.0000001, 0.00001, 0.001, 0.1],
         'penalty': ['none', 'l2', 'l1', 'elasticnet'],
         'fit_intercept': [False, True]
     }
     self.estimator_params['HuberRegressor'] = {
         'alpha': [0.0000001, 0.00001, 0.001, 0.1],
         'epsilon': [1, 1.35, 2, 5],
         'fit_intercept': [False, True]
     }
     self.estimator_params['RandomForestRegressor'] = {
         'n_estimators': [60, 100, 150],
         'max_features': ['log2', 'sqrt', 'auto'],
         'criterion': ['mse', 'mae'],
         # 'max_depth': [None, 8, 32, 64],
         # 'min_samples_split': [0.1, 0.2, 0.5, 0.7, 1.0],
         # 'min_samples_leaf': [1,2,5]
     }
     self.estimator_params['GradientBoostingRegressor'] = {
         'n_estimators': [150],
         # 'loss': ['ls', 'lad','huber','quantile'],
         # 'criterion': ['mse', 'mae'],
         # 'max_depth': [None, 3, 5, 8],
         # 'min_samples_split': [0.2, 0.5, 0.9, 1.0],
         # 'min_samples_leaf': [1, 2, 3, 5],
         # 'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.5],
         # 'alpha': [0.5, 0.7, 0.9, 1.0, 1.5]
     }
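The estimator list and the per-class parameter grids above are presumably consumed by a search routine elsewhere in the class; a minimal sketch of such a loop is shown below, keying the grid by the estimator's class name. The method name grid_search_all and the scoring choice are assumptions, not part of the original class.

 def grid_search_all(self, X, y):
     # Hypothetical helper: run a grid search for every configured estimator.
     from sklearn.model_selection import GridSearchCV
     best = {}
     for estimator in self.estimators:
         name = estimator.__class__.__name__
         grid = self.estimator_params.get(name, {})
         search = GridSearchCV(estimator, grid, cv=5,
                               scoring='neg_mean_squared_error')
         search.fit(X, y)
         best[name] = (search.best_score_, search.best_params_)
     return best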
Example #6
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import sklearn
import tensorflow as tf
from tensorflow import keras

## regressor set-up
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
XGBoost = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
LightgBoost = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
Example #7
def objective_function(args):
    n_components = args['n_components']
    quantiles = args['quantiles']
    if args['preprocessing'] == 'NoTransform':
        X, Y, scaler = transform(dataset)
    elif args['preprocessing'] == 'MinMaxScaler':
        X, Y, scaler = transform(dataset)
    elif args['preprocessing'] == 'StandardScaler':
        X, Y, scaler = standard_scaler(dataset)
    elif args['preprocessing'] == 'RobustScaler':
        X, Y, scaler = robust_scaler(dataset)
    elif args['preprocessing'] == 'QuantileTransformer':
        X, Y, scaler = quantile_transformer(dataset, quantiles)
    elif args['preprocessing'] == 'PowerTransformer':
        X, Y, scaler = power_transformer(dataset)
    elif args['preprocessing'] == 'PCA':
        X, Y, scaler = pca_transform(dataset, n_components)
    if args['preprocessing'] != 'PCA':
        k_features = args['k_features']
    else:
        k_features = X.shape[1]
    if args['model'] == RandomForestRegressor:
        n_estimators = args['params']['n_estimators']
        max_depth = args['params']['max_depth']
        min_samples_split = args['params']['min_samples_split']
        min_samples_leaf = args['params']['min_samples_leaf']
        min_weight_fraction_leaf = args['params']['min_weight_fraction_leaf']
        max_features = args['params']['max_features']
        max_leaf_nodes = args['params']['max_leaf_nodes']
        estimator = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth,
                  min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf,
                  max_leaf_nodes = max_leaf_nodes, min_weight_fraction_leaf = min_weight_fraction_leaf,
                  max_features = max_features, n_jobs = -1)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == AdaBoostRegressor:
        learning_rate = args['params']['learning_rate']
        n_estimators = args['params']['n_estimators']
        loss = args['params']['loss']
        max_depth = args['params']['base_estimator']['max_depth']
        min_samples_split = args['params']['base_estimator']['min_samples_split']
        min_samples_leaf = args['params']['base_estimator']['min_samples_leaf']
        min_weight_fraction_leaf = args['params']['base_estimator']['min_weight_fraction_leaf']
        max_features = args['params']['base_estimator']['max_features']
        estimator = AdaBoostRegressor(base_estimator = DecisionTreeRegressor(max_depth = max_depth, min_samples_split = min_samples_split,
                  min_samples_leaf = min_samples_leaf, min_weight_fraction_leaf = min_weight_fraction_leaf,
                  max_features = max_features), learning_rate = learning_rate, n_estimators = n_estimators, loss = loss)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == ExtraTreesRegressor:
        n_estimators = args['params']['n_estimators']
        max_depth = args['params']['max_depth']
        min_samples_split = args['params']['min_samples_split']
        max_features = args['params']['max_features']
        min_samples_leaf = args['params']['min_samples_leaf']
        min_weight_fraction_leaf = args['params']['min_weight_fraction_leaf']
        max_leaf_nodes = args['params']['max_leaf_nodes']
        estimator = ExtraTreesRegressor(n_estimators = n_estimators, max_depth = max_depth,
                  min_samples_split = min_samples_split, max_features = max_features,
                  max_leaf_nodes = max_leaf_nodes, min_weight_fraction_leaf = min_weight_fraction_leaf,
                  min_samples_leaf = min_samples_leaf, n_jobs = -1)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == GradientBoostingRegressor:
        loss = args['params']['loss']
        learning_rate = args['params']['learning_rate']
        n_estimators = args['params']['n_estimators']
        subsample = args['params']['subsample']
        min_samples_split = args['params']['min_samples_split']
        max_depth = args['params']['max_depth']
        tol = args['params']['tol']
        estimator = GradientBoostingRegressor(loss = loss, n_estimators = n_estimators,
                  subsample = subsample, min_samples_split = min_samples_split, learning_rate = learning_rate,
                  max_depth = max_depth, tol = tol)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == SGDRegressor:
        loss = args['params']['loss']
        penalty = args['params']['penalty']
        alpha = args['params']['alpha']
        l1_ratio = args['params']['l1_ratio']
        tol = args['params']['tol']
        learning_rate = args['params']['learning_rate']
        power_t = args['params']['power_t']
        estimator = SGDRegressor(loss = loss, penalty = penalty, alpha = alpha, max_iter = 13000,
                  l1_ratio = l1_ratio, tol = tol, learning_rate = learning_rate, power_t = power_t)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == ElasticNet:
        alpha = args['params']['alpha']
        l1_ratio = args['params']['l1_ratio']
        tol = args['params']['tol']
        estimator = ElasticNet(alpha = alpha, l1_ratio = l1_ratio, tol = tol)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == Ridge:
        alpha = args['params']['alpha']
        tol = args['params']['tol']
        solver = args['params']['solver']
        estimator = Ridge(alpha = alpha, tol = tol, solver = solver)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == KNeighborsRegressor:
        n_neighbors = args['params']['n_neighbors']
        weights = args['params']['weights']
        algorithm = args['params']['algorithm']
        leaf_size = args['params']['leaf_size']
        p = args['params']['p']
        estimator = KNeighborsRegressor(n_neighbors = n_neighbors, weights = weights,
                                        algorithm = algorithm, leaf_size = leaf_size, p = p, n_jobs = -1)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == GaussianProcessRegressor:
        alpha = args['params']['alpha']
        estimator = GaussianProcessRegressor(alpha = alpha)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == SVR:
        kernel = args['params']['kernel']
        if kernel == 'poly':
            degree = args['params']['degree']
        else:
            degree = 3
        if kernel in ('rbf', 'poly', 'sigmoid'):
            gamma = args['params']['gamma']
        else:
            gamma = 'auto'
        tol = args['params']['tol']
        C = args['params']['C']
        shrinking = args['params']['shrinking']
        estimator = SVR(kernel = kernel, degree = degree, gamma = gamma, tol = tol, C = C, shrinking = shrinking)
        reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False)
    elif args['model'] == xgb:
        booster = args['params']['booster']
        eta = args['params']['eta']
        gamma = args['params']['gamma']
        max_depth = args['params']['max_depth']
        n_estimators = args['params']['n_estimators']
        min_child_weight = args['params']['min_child_weight']
        subsample = args['params']['subsample']
        alpha = args['params']['alpha']
        random_state = args['params']['random_state']
        colsample_bytree = args['params']['colsample_bytree']
        colsample_bylevel = args['params']['colsample_bylevel']
        colsample_bynode = args['params']['colsample_bynode']
        reg_lambda = args['params']['reg_lambda']
        grow_policy = args['params']['grow_policy']
        if booster == 'dart':
            sample_type = args['params']['sample_type']
            normalize_type = args['params']['normalize_type']
            rate_drop = args['params']['rate_drop']
            skip_drop = args['params']['skip_drop']
        if args['preprocessing'] != 'PCA':
            k_features = args['k_features']
        else:
            k_features = sample(scope.int(hp.quniform('k_features', 1, X.shape[1], 1)))
        if booster == 'gbtree':
            estimator = xgb.XGBRegressor(booster = booster, eta = eta, gamma = gamma, max_depth = max_depth, n_estimators = n_estimators,
                              min_child_weight = min_child_weight, subsample = subsample, alpha = alpha, random_state = random_state,
                              colsample_bytree = colsample_bytree, colsample_bylevel = colsample_bylevel, grow_policy = grow_policy,
                              colsample_bynode = colsample_bynode, reg_lambda = reg_lambda, n_jobs = -1)
            reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False, scoring = metrics_names[eval_metric])
        elif booster == 'dart':
            num_round = 50
            estimator = xgb.XGBRegressor(booster = booster, eta = eta, gamma = gamma, max_depth = max_depth, n_estimators = n_estimators,
                              min_child_weight = min_child_weight, subsample = subsample, alpha = alpha, random_state = random_state,
                              colsample_bytree = colsample_bytree, sample_type = sample_type, normalize_type = normalize_type,
                              rate_drop = rate_drop, skip_drop = skip_drop, colsample_bylevel = colsample_bylevel, grow_policy = grow_policy,
                              colsample_bynode = colsample_bynode, reg_lambda = reg_lambda, n_jobs = -1)
            reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False, scoring = metrics_names[eval_metric])
    if eval_metric == 'mse':
        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 1 - percent_train, random_state = 1, shuffle = False)
        sfsl = reg.fit(X, Y)
        x_sfs = sfsl.transform(X)
        x_train_sfs = x_sfs[:length_train]
        x_test_sfs = x_sfs[length_train:]
        estimator.fit(x_train_sfs, y_train)
        if args['model'] == xgb:
            if booster == "gbtree":
                y_pred = estimator.predict(x_test_sfs)
            elif booster == "dart":
                y_pred = estimator.predict(x_test_sfs, ntree_limit = num_round)
        else:
            y_pred = estimator.predict(x_test_sfs)
        if args['preprocessing'] != 'NoTransform':
            predictions = y_pred.reshape(-1, 1)
            for i in range(predictions.shape[1]):
                if args['preprocessing'] != 'PCA':
                    tmp = np.zeros((predictions.shape[0], n_features))
                else:
                    tmp = np.zeros((predictions.shape[0], X.shape[1]))
                tmp[:, 0] = predictions[:, i]
                predictions[:, i] = scaler.inverse_transform(tmp)[:, 0]
            mse = mean_squared_error(dataset[target][length_train:], predictions)
            print('mse value: {}, model: {}'.format(mse, args['model']))
            return mse
        else:
            mse = mean_squared_error(dataset[target][length_train:], y_pred)
            print('mse value: {}, model: {}'.format(mse, args['model']))
            return mse
    else:
        reg.fit(X, Y)
        print('Model: {}, r2 value: {}, Selected variables {}'.format(args['model'], reg.k_score_, reg.k_feature_names_))
        loss_function = 1 - reg.k_score_
        return loss_function
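objective_function returns either a test-set MSE or 1 - R², so it can be minimized directly with hyperopt. A rough driver sketch follows, assuming a search space named space that yields the args dict expected above; it is not part of the original snippet.

# Assumed driver - `space` must be a hyperopt search space producing the args dict.
from hyperopt import Trials, fmin, tpe

trials = Trials()
best = fmin(fn=objective_function,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
print(best)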
Example #8
 def training_model(self, param):
     model = ElasticNet(**param)
     return model
Example #9
def simple_experiment(file_path):

    # Read data
    dta = pd.read_csv(file_path)
    dta_clean = dta
    # remove the null values, i.e. fill NaN with zeros - FIXME: Rihards, naive implementation
    dta_clean = dta_clean.fillna(value=0, axis=1)
    dta_clean = dta_clean.dropna()
    dta_clean = dta_clean.drop('Unnamed: 0', axis=1)

    #########################
    ####### Models ##########
    #########################
    models_class = [
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
        PassiveAggressiveClassifier(),
        LogisticRegression(),
        RidgeClassifier(),
        SGDClassifier(),
        GaussianNB(),
        MultinomialNB(),
        KNeighborsClassifier(),
        RadiusNeighborsClassifier(),
        NearestCentroid(),
        MLPClassifier(),
        SVC(),
        LinearSVC(),
        NuSVC(),
        DecisionTreeClassifier(),
        ExtraTreeClassifier()
    ]

    models_reg = [
        AdaBoostRegressor(),
        BaggingRegressor(),
        ExtraTreesRegressor(),
        GradientBoostingRegressor(),
        RandomForestRegressor(),
        ElasticNet(),
        HuberRegressor(),
        Lasso(),
        LassoLars(),
        LinearRegression(),
        PassiveAggressiveRegressor(),
        Ridge(),
        SGDRegressor(),
        OrthogonalMatchingPursuit(),
        RANSACRegressor(),
        KNeighborsRegressor(),
        RadiusNeighborsRegressor(),
        MLPRegressor(),
        SVR(),
        LinearSVR(),
        NuSVR(),
        DecisionTreeRegressor(),
        ExtraTreeRegressor()
    ]

    models_cfg = {}

    models_cfg[AdaBoostClassifier.__name__] = {}
    models_cfg[BaggingClassifier.__name__] = {}
    models_cfg[ExtraTreesClassifier.__name__] = {}
    models_cfg[GradientBoostingClassifier.__name__] = {}
    models_cfg[RandomForestClassifier.__name__] = {}
    models_cfg[PassiveAggressiveClassifier.__name__] = {}
    models_cfg[LogisticRegression.__name__] = {}
    models_cfg[RidgeClassifier.__name__] = {}
    models_cfg[SGDClassifier.__name__] = {}
    models_cfg[GaussianNB.__name__] = {}
    models_cfg[MultinomialNB.__name__] = {}
    models_cfg[KNeighborsClassifier.__name__] = {}
    models_cfg[RadiusNeighborsClassifier.__name__] = {}
    models_cfg[NearestCentroid.__name__] = {}
    models_cfg[MLPClassifier.__name__] = {}
    models_cfg[SVC.__name__] = {}
    models_cfg[LinearSVC.__name__] = {}
    models_cfg[NuSVC.__name__] = {}
    models_cfg[DecisionTreeClassifier.__name__] = {}
    models_cfg[ExtraTreeClassifier.__name__] = {}

    models_cfg[AdaBoostRegressor.__name__] = {}
    models_cfg[BaggingRegressor.__name__] = {}
    models_cfg[ExtraTreesRegressor.__name__] = {}
    models_cfg[GradientBoostingRegressor.__name__] = {}
    models_cfg[RandomForestRegressor.__name__] = {}
    models_cfg[BayesianRidge.__name__] = {}
    models_cfg[ElasticNet.__name__] = {}
    models_cfg[HuberRegressor.__name__] = {}
    models_cfg[Lars.__name__] = {}
    models_cfg[Lasso.__name__] = {}
    models_cfg[LassoLars.__name__] = {}
    models_cfg[LinearRegression.__name__] = {}
    models_cfg[PassiveAggressiveRegressor.__name__] = {}
    models_cfg[Ridge.__name__] = {}
    models_cfg[SGDRegressor.__name__] = {}
    models_cfg[OrthogonalMatchingPursuit.__name__] = {}
    models_cfg[RANSACRegressor.__name__] = {}
    models_cfg[TheilSenRegressor.__name__] = {}
    models_cfg[KNeighborsRegressor.__name__] = {}
    models_cfg[RadiusNeighborsRegressor.__name__] = {}
    models_cfg[MLPRegressor.__name__] = {}
    models_cfg[SVR.__name__] = {}
    models_cfg[LinearSVR.__name__] = {}
    models_cfg[NuSVR.__name__] = {}
    models_cfg[DecisionTreeRegressor.__name__] = {}
    models_cfg[ExtraTreeRegressor.__name__] = {}

    ## To run for multiple classes of data, add tuples of x, y and a description (used for logging)
    ## to the tuples_of_data array. For now it is set to run on all available samples. For instance:
    ## tuples_of_data = [(X, y, "all samples"), (X_1, y_1, "samples class1"), (X_2, y_2, "samples class2")]
    # For each tuple extracted from the array a new log file is generated, so that each run goes to a different log file.

    X_all = dta_clean.drop('worldwide_gross', axis=1)
    y_all = dta_clean['worldwide_gross']
    desc = "quickReg" + file_path.replace('.', '').replace('/', '').replace(
        'dataset', '').replace('csv', '')
    tuples_of_data = [(X_all, y_all, desc)]
    #########################
    ### Start Regress########
    #########################
    orig_stdout = sys.stdout  # save the original stdout and a timestamp for log file names
    time = datetime.datetime.now().strftime("%Y_%m_%d_%H%M%S")
    for ind, tupl in enumerate(tuples_of_data):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # restart the current iterator for each run
            global itter_current
            itter_current = 0
            x_crr, y_crr, dsc = tupl
            trg = "regressRes_" + dsc + "_" + time + ".log"
            new_file = open(trg, "w")
            sys.stdout = new_file
            # set the iterator run to start from
            global itter_start
            itter_start = 0
            run_for_many(x_crr, y_crr, dsc, models_reg, models_cfg)
            new_file.close()

    desc = "quickClass" + file_path.replace('.', '').replace('/', '').replace(
        'dataset', '').replace('csv', '')
    labels = [label_gross_3, label_gross_2]
    # save the original stdout and a fresh timestamp
    orig_stdout = sys.stdout
    time = datetime.datetime.now().strftime("%Y_%m_%d_%H%M%S")
    for ind, cb in enumerate(labels):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # restart the current iterator for each run
            global itter_current
            itter_current = 0
            trg = "classifyRes_" + desc + "_" + cb.__name__ + "_" + time + ".log"
            new_file = open(trg, "w")
            sys.stdout = new_file
            # set the iterator run to start from
            global itter_start
            itter_start = 0
            x_crr = dta_clean.drop('worldwide_gross', axis=1)
            y_crr = dta_clean.worldwide_gross.apply(lambda gross: cb(gross))
            dsc = desc + "_" + cb.__name__
            run_for_many(x_crr, y_crr, dsc, models_class, models_cfg)
            new_file.close()

    # restore the original stdout
    sys.stdout = orig_stdout
Example #10
#from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, cross_val_score
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.svm import SVR
#from sklearn.tree import DecisionTreeRegressor
#from xgboost import XGBRegressor


# In[85]:


models = [('LR', LinearRegression()),
          ("Ridge", Ridge()),
          ("Lasso", Lasso()),
          ("ElasticNet", ElasticNet())]


# In[86]:


for name, regressor in models:
    rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=5, scoring="neg_mean_squared_error")))
    print(f"RMSE: {round(rmse, 4)} ({name}) ")


# In[88]:


model = Ridge(alpha=1.0)
model.fit(X, y)
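GridSearchCV is imported above but unused in this excerpt; a minimal sketch of tuning the Ridge alpha with it follows. The grid values are illustrative assumptions.

# In[ ]:


# Illustrative alpha search; the grid values are assumptions.
ridge_params = {"alpha": [0.01, 0.1, 1.0, 10.0, 100.0]}
ridge_cv = GridSearchCV(Ridge(), ridge_params, cv=5,
                        scoring="neg_mean_squared_error").fit(X, y)
print(ridge_cv.best_params_)
model = Ridge(**ridge_cv.best_params_).fit(X, y)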
Example #11
    test_rmse = np.sqrt(test_mse)
    return test_rmse


def get_test_r2(model, test_x, test_y):
    """Return the r-squared of the trained model based on the test dataset."""
    test_y_hat = model.predict(test_x)
    test_r2 = r2_score(test_y, test_y_hat)
    return test_r2


# %%
dict_models_explore = {
    'ridge': Ridge(),
    'lasso': Lasso(),
    'elastic_net': ElasticNet(),
    'svr': SVR(),
    'decision_tree': DecisionTreeRegressor(),
    'random_forest': RandomForestRegressor(),
    'extra_trees': ExtraTreesRegressor(),
    'gradient_boosting': GradientBoostingRegressor()
}
dict_models = {
    'lasso': Lasso(),
    'elastic_net': ElasticNet(),
    'random_forest': RandomForestRegressor(),
    'gradient_boosting': GradientBoostingRegressor()
}

# %%
train_set, test_set = train_test_split(df_Austin_all_data_adj,
Example #12
def train_linear_model(X,
                       y,
                       random_state=1,
                       test_size=0.2,
                       regularization_type='elasticnet',
                       k_fold=5,
                       max_iter=1000000,
                       tol=0.0001,
                       l1_ratio=None):
    """
    Function to train linear model with regularization and cross-validation.

    Args:
        X (pandas.DataFrame): dataframe of descriptors.
        y (pandas.DataFrame): dataframe of cycle lifetimes.
        random_state (int): seed for train/test split.
        test_size (float): proportion of the dataset reserved for model evaluation.
        regularization_type (str): lasso or ridge or elastic-net (with cv).
        k_fold (int): k in k-fold cross-validation.
        max_iter (int): maximum number of iterations for model fitting.
        tol (float): tolerance for optimization.
        l1_ratio ([float]): list of lasso to ridge ratios for elasticnet.

    Returns:
        sklearn.linear_model.LinearModel: fitted model.
        mu (float): Mean value of descriptors used in training.
        s (float): Std dev of descriptors used in training.
        relative_prediction_error (numpy.ndarray): half-width of the 95% confidence
            interval on the predicted/actual ratio.
        Rsq (float): score of the fitted model on the test set.
        hyperparameters (dict): hyperparameters used for fitting.

    """
    if l1_ratio is None:
        l1_ratio = [.1, .5, .7, .9, .95, 1]
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize (training) data after train/test split
    mu = np.mean(X_train, axis=0)
    s = np.std(X_train, axis=0)
    X_scaled = (X_train - mu) / s
    hyperparameters = {
        'random_state': random_state,
        'test_size': test_size,
        'k_fold': k_fold,
        'tol': tol,
        'max_iter': max_iter
    }
    if regularization_type == 'lasso' and y.shape[1] == 1:
        lassocv = LassoCV(fit_intercept=True,
                          alphas=None,
                          tol=tol,
                          cv=k_fold,
                          max_iter=max_iter)
        lassocv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = lassocv.alpha_
        linear_model = Lasso(fit_intercept=True,
                             alpha=alpha_opt,
                             max_iter=max_iter)
        linear_model.fit(X_scaled, y_train.values)
        hyperparameters['l1_ratio'] = 1

    elif regularization_type == 'ridge' and y.shape[1] == 1:
        ridgecv = RidgeCV(fit_intercept=True, alphas=None, cv=k_fold)
        ridgecv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = ridgecv.alpha_
        linear_model = Ridge(fit_intercept=True, alpha=alpha_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = 0

    elif regularization_type == 'elasticnet' and y.shape[1] == 1:
        elasticnetcv = ElasticNetCV(fit_intercept=True,
                                    normalize=False,
                                    alphas=None,
                                    cv=k_fold,
                                    l1_ratio=l1_ratio,
                                    max_iter=max_iter)
        elasticnetcv.fit(X_scaled, y_train.values.ravel())

        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = elasticnetcv.alpha_
        l1_ratio_opt = elasticnetcv.l1_ratio_
        linear_model = ElasticNet(fit_intercept=True,
                                  normalize=False,
                                  l1_ratio=l1_ratio_opt,
                                  alpha=alpha_opt,
                                  max_iter=max_iter)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt

    # If more than 1 outcome present, perform multitask regression
    elif regularization_type == 'elasticnet' and y.shape[1] > 1:
        multi_elasticnet_CV = MultiTaskElasticNetCV(fit_intercept=True,
                                                    cv=k_fold,
                                                    normalize=False,
                                                    l1_ratio=l1_ratio,
                                                    max_iter=max_iter)
        multi_elasticnet_CV.fit(X_scaled, y_train)
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = multi_elasticnet_CV.alpha_
        l1_ratio_opt = multi_elasticnet_CV.l1_ratio_
        linear_model = MultiTaskElasticNet(fit_intercept=True,
                                           normalize=False,
                                           max_iter=max_iter)
        linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt
    else:
        raise NotImplementedError

    y_pred = linear_model.predict((X_test - mu) / s)
    Rsq = linear_model.score((X_test - mu) / s, y_test)
    # Compute 95% confidence interval
    # Multioutput = 'raw_values' provides prediction error per output
    pred_actual_ratio = [x / y for x, y in zip(y_pred, np.array(y_test))]
    relative_prediction_error = 1.96 * np.sqrt(
        mean_squared_error(np.ones(y_pred.shape),
                           pred_actual_ratio,
                           multioutput='raw_values') / y_pred.shape[0])
    hyperparameters['alpha'] = alpha_opt
    return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
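A hypothetical call of train_linear_model, assuming X is a descriptor DataFrame and y a single-column DataFrame of cycle lifetimes; this usage is not shown in the original snippet.

# Illustrative usage only; X and y are assumed pandas DataFrames.
model, mu, s, rel_err, r2, hyperparams = train_linear_model(
    X, y, regularization_type='elasticnet', k_fold=5)
print("Test R^2:", r2)
print("alpha / l1_ratio:", hyperparams['alpha'], hyperparams['l1_ratio'])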
Example #13
print(score)
print(score.mean())


# In[ ]:


X_train[f_selected].shape


# In[ ]:


from sklearn.linear_model import ElasticNet

elasticnet_reg = ElasticNet(l1_ratio=0.8, random_state=0)
score = cross_val_score(elasticnet_reg, X_train[f_selected], y_train, cv=10)
print(score)
print(score.mean())


# In[ ]:


from sklearn.ensemble import RandomForestRegressor

randomfor_reg = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
score = cross_val_score(randomfor_reg, X_train[f_selected], y_train, cv=10)
print(score)
print(score.mean())
Example #14
def chooseAlgorithm(problemType, features, targets):
    if 'Classification' in problemType:
        models = {
            'RFC':
            RandomForestClassifier(),
            'ETC':
            ExtraTreesClassifier(),
            'GNB':
            GaussianNB(),
            'MNB':
            MultinomialNB(),
            'KNC':
            KNeighborsClassifier(n_neighbors=round(sqrt(len(features.index)))),
            'SVC':
            SVC(),
            'LSVC':
            LinearSVC(),
            'LGR':
            LogisticRegression(),
            'LDA':
            LinearDiscriminantAnalysis(),
            'SDGC':
            SGDClassifier()
        }
    elif 'Regression' in problemType:
        models = {
            'RFR':
            RandomForestRegressor(),
            'ETR':
            ExtraTreesRegressor(),
            'LNR':
            LinearRegression(),
            'SDGR':
            SGDRegressor(),
            'KNR':
            KNeighborsRegressor(n_neighbors=round(sqrt(len(features.index)))),
            'SVR':
            SVR(),
            'LSVR':
            LinearSVR(),
            'Lasso':
            Lasso(),
            'ENET':
            ElasticNet(),
            'Ridge':
            Ridge()
        }
    else:
        raise TypeError(
            "expected either 'Classification' or 'Regression' in problem type")

    results = {}
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets.values.ravel())
    for name, model in models.items():
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        results[name] = score

    bestModelScore = sorted(results.items(), key=lambda x: x[1],
                            reverse=True)[0]

    model = models[bestModelScore[0]].fit(features, targets)

    return model
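A hypothetical call, assuming features and targets are pandas DataFrames; this usage is not part of the original snippet.

# Illustrative usage: score every candidate regressor and refit the best one.
best_model = chooseAlgorithm('Regression', features, targets)
print(type(best_model).__name__)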
    # print("The ALL data size before: {} ".format(all_data.shape))
    all_data = process_data(all_data)
    # print("The ALL data size after: {} \n".format(all_data.shape))

    df_train = all_data[:n_train]
    X_train = df_train.values
    df_test = all_data[n_train:]

    regressor = RandomForestRegressor(n_estimators=300, random_state=0)
    score = rmsle_cv(regressor, X_train, y_train)
    print("\nRandomForestRegressor score: {:.4f} ({:.4f})\n".format(
        score.mean(), score.std()))

    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
    ENet = make_pipeline(RobustScaler(),
                         ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    KRR = KernelRidge(alpha=0.6, kernel="polynomial", degree=2, coef0=2.5)
    GBoost = GradientBoostingRegressor(n_estimators=3000,
                                       learning_rate=0.05,
                                       max_depth=4,
                                       max_features="sqrt",
                                       min_samples_leaf=15,
                                       min_samples_split=10,
                                       loss="huber",
                                       random_state=5)
    stacked_averaged_models = StackingAveragedModels(base_models=(ENet, GBoost,
                                                                  KRR),
                                                     meta_model=lasso)

    score = rmsle_cv(lasso, X_train, y_train)
    print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Example #16

lasso.fit(x_train,y_train)


RLasso=['RL-Lasso',
        "%.4f" % lasso.score(x_train,y_train),
        "%.4f" % lasso.score(x_test,y_test)]

### Elastic Net Regression ###


#ENet = GridSearchCV(ElasticNet(random_state=3),param_grid={'alpha':np.logspace(-6,-2,10),
#                    'l1_ratio':np.linspace(0.5,0.9,5)}, cv=5)

ENet = ElasticNet(alpha=0.00046,l1_ratio=0.9,random_state=3)

ENet.fit(x_train,y_train)


ElasticN=['RL-ENet',
        "%.4f" % ENet.score(x_train,y_train),
        "%.4f" % ENet.score(x_test,y_test)]

### Kernel Ridge Regression ###

#KRidge = GridSearchCV(KernelRidge(kernel='polynomial',degree=2,coef0=2.5),
#                      param_grid={'alpha':np.linspace(0.7,0.9,10),
#                                  'gamma': np.logspace(-5,-3,10)}, cv=5)

KRidge = KernelRidge(kernel='polynomial', degree=2, coef0=2.5, alpha=0.9 ,gamma=1e-04)
Example #17
    data = pd.read_csv("wine-quality.csv")

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
    l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5

    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
Example #18
reg_features_2['intercept'] = 1.0
'''
'''
# =============================================================================
# from sklearn import linear_model
# reg = linear_model.Lasso(alpha=0.1, fit_intercept=True, normalize=True, precompute=True, copy_X=True, max_iter=1000, tol=0.0001, warm_start=True, positive=True, random_state=42, selection='cyclic')
# =============================================================================

from sklearn.linear_model import ElasticNet
reg_1 = ElasticNet(alpha=0.8,
                   l1_ratio=0.8,
                   fit_intercept=True,
                   normalize=False,
                   precompute=True,
                   copy_X=True,
                   max_iter=1000,
                   tol=0.001,
                   warm_start=True,
                   positive=True,
                   random_state=42,
                   selection='cyclic')
reg_2 = ElasticNet(alpha=0.7,
                   l1_ratio=1,
                   fit_intercept=True,
                   normalize=False,
                   precompute=True,
                   copy_X=True,
                   max_iter=1000,
                   tol=0.001,
                   warm_start=True,
                   positive=True,
Example #19
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

__author__ = 'lebaishi'

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

elastic_net = ElasticNet(alpha=0.0001, l1_ratio=0.15)
elastic_net.fit(X, y)
print(elastic_net.predict([[1.5]]))

sgd_reg = SGDRegressor(penalty='elasticnet', max_iter=1000)
sgd_reg.fit(X, y.ravel())
print(sgd_reg.predict([[1.5]]))
Example #20
    def fit(self, l1_ratio=0.1, alpha=1.0, positive_only=True, topK=100):

        assert l1_ratio >= 0 and l1_ratio <= 1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(
            self.RECOMMENDER_NAME, l1_ratio)

        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK

        # Display ConvergenceWarning only once, not for every item for which it occurs
        warnings.simplefilter("once", category=ConvergenceWarning)

        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
            URM_train.data[start_pos:end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            # self.model.coef_ contains the coefficient of the ElasticNet model
            # let's keep only the non-zero values

            # Select topK values
            # Sorting is done in three steps. Faster than plain np.argsort for a higher number of items
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

            relevant_items_partition = (
                -nonzero_model_coef_value
            ).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(
                -nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[
                relevant_items_partition_sorting]

            for index in range(len(ranking)):

                if numCells == len(rows):
                    rows = np.concatenate(
                        (rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate(
                        (cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate(
                        (values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]

                numCells += 1

            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            if time.time() - start_time_printBatch > 300 or currentItem == n_items - 1:
                self._print(
                    "Processed {} ( {:.2f}% ) in {:.2f} {}. Items per second: {:.2f}"
                    .format(currentItem + 1,
                            100.0 * float(currentItem + 1) / n_items,
                            new_time_value, new_time_unit,
                            float(currentItem) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix(
            (values[:numCells], (rows[:numCells], cols[:numCells])),
            shape=(n_items, n_items),
            dtype=np.float32)
Example #21
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures

np.random.seed(1335)  # for reproducibility


g_models = [LinearRegression(), Ridge(), Lasso(), ElasticNet(),
            KNeighborsRegressor(), DecisionTreeRegressor(), SVR(),
            RandomForestRegressor(), AdaBoostRegressor(),
            GradientBoostingRegressor()]
#g_models = [LinearRegression(), LinearRegression(),LinearRegression(),LinearRegression()]
g_idx = 0



NUM_LONG_POSITIONS=20
NUM_SHORT_POSITIONS=20

#start = '2016-8-10'   # must be a trading day on the Chinese market
#end   = '2017-8-11'  # must be a trading day on the Chinese market

c,_ = get_sector_class()
ONEHOTCLASS = tuple(c)

hs300 = ts.get_hs300s()['code']
Example #22
            # score = numpy.sqrt(-numpy.mean(cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)))
            score = numpy.sum(-cross_val_score(
                model, X, y, scoring='neg_mean_squared_error', cv=cv))  # v. 14

            scores.append(score)
            results.append({
                "model": "ridge",
                "fold": fold,
                "alpha": alpha,
                "rmse": score,
                "coefs": model.coef_
            })
            # print("ridge alpha:", alpha, "fold:", fold, "score:", score)
            # print("coefs:", model.coef_, "\n")

            model = ElasticNet(alpha=alpha).fit(X, y)
            # score = numpy.sqrt(-numpy.mean(cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)))
            score = numpy.sum(-cross_val_score(
                model, X, y, scoring='neg_mean_squared_error', cv=cv))  # v. 14

            scores.append(score)
            results.append({
                "model": "elastic",
                "fold": fold,
                "alpha": alpha,
                "rmse": score,
                "coefs": model.coef_
            })
            # print("elastic alpha:", alpha, "fold:", fold, "score:", score)
            # print("coefs:", model.coef_, "\n")
Example #23
# Lasso
from sklearn.linear_model import Lasso

alpha = 0.1
lasso = Lasso(alpha=alpha)

y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print(lasso)
print("r^2 on test data : %f" % r2_score_lasso)

# #############################################################################
# ElasticNet
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)

m, s, _ = plt.stem(np.where(enet.coef_)[0],
                   enet.coef_[enet.coef_ != 0],
                   markerfmt='x',
                   label='Elastic net coefficients')
plt.setp([m, s], color="#2ca02c")
m, s, _ = plt.stem(np.where(lasso.coef_)[0],
                   lasso.coef_[lasso.coef_ != 0],
                   markerfmt='x',
                   label='Lasso coefficients')
Example #24
ax = fig.add_subplot(111)
cax = ax.matshow(dataframe.corr(), vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = numpy.arange(0,31,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(dataframe.columns)
ax.set_yticklabels(dataframe.columns)

num_instances = len(X)

models = []
models.append(('LiR', LinearRegression()))
models.append(('Ridge', Ridge()))
models.append(('Lasso', Lasso()))
models.append(('ElasticNet', ElasticNet()))
models.append(('Bag_Re', BaggingRegressor()))
models.append(('RandomForest', RandomForestRegressor()))
models.append(('ExtraTreesRegressor', ExtraTreesRegressor()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVM', SVR()))

# Evaluations
results = []
names = []
scoring = []

for name, model in models:
    # Fit the model
    model.fit(X, Y)
Example #25
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    global keras_imported

    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {'n_jobs': -2, 'n_estimators': 30},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2, 'n_estimators': 30},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'presort': False, 'learning_rate': 0.1, 'warm_start': True},
        'GradientBoostingClassifier': {'presort': False, 'learning_rate': 0.1, 'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {},
        'LGBMRegressor': {'n_estimators': 2000, 'learning_rate': 0.15, 'num_leaves': 8, 'lambda_l2': 0.001, 'histogram_pool_size': 16384},
        'LGBMClassifier': {'n_estimators': 2000, 'learning_rate': 0.15, 'num_leaves': 8, 'lambda_l2': 0.001, 'histogram_pool_size': 16384},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500


    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the final params that will be used to initialize the model:')
        print(model_params)


    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),


        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),

        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    # Older scikit-learn versions do not accept max_iter/tol for these estimators,
    # hence the TypeError fallback below
    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier()
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(calc_feature_importance=True)

    if model_name[:12] == 'DeepLearning':
        if not keras_imported:
            # Suppress some level of logs if TF is installed (but allow it to not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except:
                pass

            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU, ThresholdedReLU, ELU
            global Sequential
            global keras_load_model
            global regularizers, optimizers
            global Activation
            global KerasRegressor, KerasClassifier

            from keras.constraints import maxnorm
            from keras.layers import Activation, Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers, optimizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print('It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize')
        raise(e)

    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
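
# A minimal usage sketch (an assumption, not part of the source): request a named estimator
# from the map above and override one of its stock parameters via training_params.
demo_model = get_model_from_name('RandomForestRegressor', training_params={'n_estimators': 100})
print(demo_model)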


def enet_solve(c, b):
    regr = ElasticNet(random_state=0,max_iter=10000)
    regr.fit(c,b)
    return regr.coef_
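
# A small usage sketch (an assumption, not part of the source): recover the coefficients of
# a linear system c @ w = b + noise with enet_solve defined above.
import numpy as np

rng = np.random.RandomState(0)
c_demo = rng.rand(50, 10)
w_true = np.zeros(10)
w_true[:3] = [1.5, -2.0, 0.7]
b_demo = c_demo @ w_true + 0.01 * rng.randn(50)
print(enet_solve(c_demo, b_demo))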
Example #27
0
def logloss(act, pred):  # the enclosing def is not shown in the snippet; the name is assumed
    ll = sum(act * sp.log(pred) +
             sp.subtract(1, act) * sp.log(sp.subtract(1, pred)))
    ll = ll * -1.0 / len(act)
    return ll


# add two columns for hour and weekday
def dayhour(timestr):
    d = datetime.strptime(str(timestr), "%y%m%d%H")
    return [float(d.weekday()), float(d.hour)]


fh = FeatureHasher(n_features=2**20, input_type="string")

# Train the model -- ElasticNet is a regressor, fit here directly on the 0/1 click label
clf = ElasticNet()
train = pd.read_csv("train/subtrain.csv", chunksize=100000, iterator=True)
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(
        pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace=True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    # note: fit() retrains from scratch on every chunk, so only the last chunk's fit is kept;
    # an estimator with partial_fit (e.g. SGDRegressor) would be needed to learn across chunks
    clf.fit(Xcat, y_train)

# Create a submission file
usecols = cols + ["id"]
X_test = pd.read_csv("test/mtest.csv", usecols=usecols)
X_test = X_test.join(
Example #28
0
#Lasso Regression Models
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X_train, y_train)
lasso_y_pred = reg.predict(X_test)
reg.coef_
reg.intercept_
print("Mean squared error: %.2f" %
      mean_squared_error(diabetes_y_test, lasso_y_pred))
print('Variance score: %.2f' % r2_score(diabetes_y_test, lasso_y_pred))

#Elastic Nets
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
X, y = make_regression(n_features=2, random_state=0)  # note: X, y are not used below; the model is fit on X_train/y_train
regr = ElasticNet(random_state=0)
regr.fit(X_train, y_train)
print(regr.coef_)
print(regr.intercept_)
y_pred_elas = regr.predict(X_test)

print("Mean squared error: %.2f" %
      mean_squared_error(diabetes_y_test, y_pred_elas))
print('Variance score: %.2f' % r2_score(diabetes_y_test, y_pred_elas))

#Ridge Regression
from sklearn import linear_model
reg = linear_model.Ridge(alpha=5)
reg.fit(X_train, y_train)
reg.coef_
reg.intercept_
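
# A hedged follow-up sketch (not in the source): compare the three penalised models above
# with 5-fold cross-validated R^2 on the training split instead of a single hold-out score.
from sklearn.model_selection import cross_val_score

for label, est in (("lasso", linear_model.Lasso(alpha=0.1)),
                   ("elastic_net", ElasticNet(random_state=0)),
                   ("ridge", linear_model.Ridge(alpha=5))):
    print(label, cross_val_score(est, X_train, y_train, cv=5, scoring='r2').mean())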
Example #29
0
    def __init__(self, mod_name, mod_params={}, param_grid={}, outdir='./'):
        self.mod_name = mod_name
        self.metric = None
        self.mod_params = mod_params
        self.outdir = outdir
        self.param_grid = param_grid

        # regression
        if self.mod_name == 'lm':
            self.model = LinearRegression()  # linear regression model
            self.metric = 'R2'
        elif self.mod_name == 'elasticNet':
            alpha = mod_params['alpha'] if 'alpha' in mod_params else 0.03
            l1_ratio = mod_params[
                'l1_ratio'] if 'l1_ratio' in mod_params else 0.5
            self.model = ElasticNet(
                alpha=alpha,
                l1_ratio=l1_ratio)  # l1_ratio blends the L1 (lasso) and L2 (ridge) penalties; l1_ratio=1 is pure L1
            self.metric = 'R2'
        elif self.mod_name == 'rf':
            n_estimators = mod_params[
                'n_estimators'] if 'n_estimators' in mod_params else 100
            max_depth = mod_params[
                'max_depth'] if 'max_depth' in mod_params else 10
            min_samples_leaf = mod_params[
                'min_samples_leaf'] if 'min_samples_leaf' in mod_params else 5
            max_features = mod_params[
                'max_features'] if 'max_features' in mod_params else 'sqrt'
            self.model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_leaf=min_samples_leaf,
                max_features=max_features,
                oob_score=True,
                random_state=42)  #random forest
            self.metric = 'R2'
        elif self.mod_name == 'svr':
            self.model = SVR(kernel=mod_params['kernel'],
                             C=mod_params['C'])  #SVR linear kernel
            self.metric = 'R2'
        elif self.mod_name == 'dummy_reg':
            self.model = DummyRegressor(quantile=0.5)
            self.metric = 'R2'
        elif self.mod_name == 'mlp':
            # TODO: neural-net based models were tested separately; some of that code is integrated
            #  here, but MLPs are not fully functional yet with the libraries written here
            K.clear_session()
            input_data = Input(shape=(mod_params['feat_size'], ))

            for n in range(mod_params['num_layers']):
                if (n == 0):
                    x = input_data
                #dense_name = 'Dense_%d' % n
                x = Dense(64 * (2**(2 * (mod_params['num_layers'] - n - 1))),
                          kernel_regularizer=regularizers.l1(0.01))(x)
                x = BatchNormalization()(x)
                x = Activation('relu')(x)
                x = Dropout(0.5)(x)
            encoded = Dense(1)(x)

            model = Model(input_data, encoded)
            model.compile(optimizer='adam', loss='mean_squared_error')

            self.model = model
            self.metric = 'R2'

        # classification
        elif self.mod_name == 'mlp_classify':
            K.clear_session()
            input_data = Input(shape=(mod_params['feat_size'], ))

            for n in range(mod_params['num_layers']):
                if (n == 0):
                    x = input_data
                # dense_name = 'Dense_%d' % n
                x = Dense(64 * (2**(2 * (mod_params['num_layers'] - n - 1))),
                          kernel_regularizer=regularizers.l1(0.01))(x)
                x = BatchNormalization()(x)
                x = Activation('relu')(x)
                x = Dropout(0.5)(x)
            encoded = Dense(1, activation='sigmoid')(x)

            model = Model(input_data, encoded)
            model.compile(optimizer='adam', loss='binary_crossentropy')

            self.model = model
            self.metric = 'AUC'
        elif self.mod_name == 'logit':
            self.model = LogisticRegression(penalty='l2',
                                            class_weight='balanced')
            self.metric = 'AUC'
        elif self.mod_name == 'rfc':
            n_estimators = mod_params[
                'n_estimators'] if 'n_estimators' in mod_params else 100
            max_depth = mod_params[
                'max_depth'] if 'max_depth' in mod_params else 10
            min_samples_leaf = mod_params[
                'min_samples_leaf'] if 'min_samples_leaf' in mod_params else 5
            max_features = mod_params[
                'max_features'] if 'max_features' in mod_params else 'sqrt'
            self.model = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_leaf=min_samples_leaf,
                max_features=max_features,
                oob_score=True,
                class_weight='balanced',
                random_state=42)
            self.metric = 'AUC'
        elif self.mod_name == 'xgboost':
            # Not in use - XGBClassifier has additional requirements that need to be configured
            # n_estimators = mod_params['n_estimators'] if 'n_estimators' in mod_params else 100
            # self.model = XGBClassifier(n_estimators=n_estimators)
            # self.metric = 'AUC'
            self.model = None
            self.metric = 'AUC'
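
# Side note (an observation, not from the source): the repeated pattern used above,
#     value = mod_params['key'] if 'key' in mod_params else default
# can be written more compactly with dict.get, e.g.
#     alpha = mod_params.get('alpha', 0.03)
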
regressors = [
    LinearRegression(fit_intercept=SET_FIT_INTERCEPT),
    Ridge(alpha=1,
          solver='cholesky',
          fit_intercept=SET_FIT_INTERCEPT,
          normalize=False,
          random_state=RANDOM_SEED),
    Lasso(alpha=0.1,
          max_iter=10000,
          tol=0.01,
          fit_intercept=SET_FIT_INTERCEPT,
          random_state=RANDOM_SEED),
    ElasticNet(alpha=0.1,
               l1_ratio=0.5,
               max_iter=10000,
               tol=0.01,
               fit_intercept=SET_FIT_INTERCEPT,
               normalize=False,
               random_state=RANDOM_SEED)
]

from sklearn.model_selection import KFold

N_FOLDS = 506  # leave-one-out cross-validation: N = number of observations

cv_results = np.zeros((N_FOLDS, len(names)))  # one row per fold, one column per model name

dv_results = np.zeros((N_FOLDS, len(names)))
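
# A hedged sketch (data, target, and error metric here are illustrative assumptions, not
# from the source) of the leave-one-out loop the arrays above are set up for: one row per
# fold, one column per regressor.
import numpy as np
from sklearn.metrics import mean_squared_error

rng = np.random.RandomState(0)
X_demo = rng.rand(N_FOLDS, 4)
y_demo = X_demo @ np.array([1.0, -2.0, 0.5, 0.0]) + 0.1 * rng.randn(N_FOLDS)
loo_errors = np.zeros((N_FOLDS, len(regressors)))
for fold, (tr, te) in enumerate(KFold(n_splits=N_FOLDS).split(X_demo)):
    for col, reg in enumerate(regressors):
        reg.fit(X_demo[tr], y_demo[tr])
        loo_errors[fold, col] = mean_squared_error(y_demo[te], reg.predict(X_demo[te]))
print(loo_errors.mean(axis=0))  # mean leave-one-out MSE per regressor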