Example #1
from sklearn import (dummy, ensemble, linear_model, multioutput,
                     neural_network, svm)


def build_model(model_type, num_targets=1):
    if model_type == 'linear_regression':
        base = linear_model.SGDRegressor()
    elif model_type == 'random_forests':
        base = ensemble.RandomForestRegressor()
    elif model_type == 'gradient_boosting':
        base = ensemble.GradientBoostingRegressor()
    elif model_type == 'extra_trees':
        base = ensemble.ExtraTreesRegressor()
    elif model_type == 'bagging':
        base = ensemble.BaggingRegressor()
    elif model_type == 'adaboost':
        base = ensemble.AdaBoostRegressor()
    elif model_type == 'neural_network':
        base = neural_network.MLPRegressor()
    elif model_type == 'svm':
        base = svm.SVR(verbose=1)
    elif model_type == 'constant_mean':
        base = dummy.DummyRegressor(strategy='mean')
    elif model_type == 'constant_median':
        base = dummy.DummyRegressor(strategy='median')
    elif model_type == 'constant_zero':
        base = dummy.DummyRegressor(strategy='constant', constant=0)
    else:
        raise ValueError('invalid model type: {}'.format(model_type))

    # multiple outputs in the dataset => fit a separate regressor to each
    if num_targets > 1:
        return multioutput.MultiOutputRegressor(base)
    else:
        return base
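A short usage sketch (not part of the original example): it assumes build_model and the sklearn imports above are available, and uses synthetic multi-target data from sklearn.datasets.make_regression.

from sklearn.datasets import make_regression

# Three regression targets, so build_model wraps the base estimator
# in a MultiOutputRegressor.
X, y = make_regression(n_samples=200, n_features=8, n_targets=3, random_state=0)
model = build_model('random_forests', num_targets=y.shape[1])
model.fit(X, y)
print(model.predict(X[:2]).shape)  # (2, 3)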
Example #2
import numpy as np
from sklearn import dummy
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import _gb_losses  # private module; present in older scikit-learn releases

# deserialize_decision_tree_regressor is assumed to be defined in the same module
def deserialize_gradient_boosting_regressor(model_dict):
    model = GradientBoostingRegressor(**model_dict['params'])
    trees = [
        deserialize_decision_tree_regressor(tree)
        for tree in model_dict['estimators_']
    ]
    model.estimators_ = np.array(trees).reshape(model_dict['estimators_shape'])
    if 'init_' in model_dict and model_dict['init_']['meta'] == 'dummy':
        model.init_ = dummy.DummyRegressor()
        model.init_.__dict__ = model_dict['init_']
        model.init_.__dict__.pop('meta')

    model.train_score_ = np.array(model_dict['train_score_'])
    model.max_features_ = model_dict['max_features_']
    model.n_features_ = model_dict['n_features_']
    if model_dict['loss_'] == 'ls':
        model.loss_ = _gb_losses.LeastSquaresError(1)
    elif model_dict['loss_'] == 'lad':
        model.loss_ = _gb_losses.LeastAbsoluteError(1)
    elif model_dict['loss_'] == 'huber':
        model.loss_ = _gb_losses.HuberLossFunction(1)
    elif model_dict['loss_'] == 'quantile':
        model.loss_ = _gb_losses.QuantileLossFunction(1)

    if 'priors' in model_dict:
        model.init_.priors = np.array(model_dict['priors'])
    return model
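The matching serializer is not part of this example; the sketch below (an assumption, not the original code) only illustrates which dict keys the deserializer above reads, using the older scikit-learn attributes it relies on. serialize_decision_tree_regressor is the assumed counterpart of deserialize_decision_tree_regressor.

def serialize_gradient_boosting_regressor(model):
    # Hypothetical counterpart to the deserializer above.
    return {
        'params': model.get_params(),
        'estimators_': [serialize_decision_tree_regressor(t)
                        for t in model.estimators_.ravel()],
        'estimators_shape': list(model.estimators_.shape),
        'init_': {**model.init_.__dict__, 'meta': 'dummy'},
        'train_score_': model.train_score_.tolist(),
        'max_features_': model.max_features_,
        'n_features_': model.n_features_,
        'loss_': model.loss,  # 'ls', 'lad', 'huber' or 'quantile' in older releases
    }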
Example #3
 def build_sklearn(self, model_id, model_params):
     """Method that builds models implemented in sklearn"""
     if model_id == 'sklearn_LogisticRegressionCV':
         return linear_model.LogisticRegressionCV(**model_params)
     elif model_id == 'sklearn_LogisticRegression':
         return linear_model.LogisticRegression(**model_params)
     elif model_id == 'sklearn_MLPClassifier':
         return neural_network.MLPClassifier(**model_params)
     elif model_id == 'sklearn_GaussianNB':
         return naive_bayes.GaussianNB(**model_params)
     elif model_id == 'sklearn_MultinomialNB':
         return naive_bayes.MultinomialNB(**model_params)
     elif model_id == 'sklearn_BernoulliNB':
         return naive_bayes.BernoulliNB(**model_params)
     elif model_id == 'sklearn_RandomForestClassifier':
         return ensemble.RandomForestClassifier(**model_params)
     elif model_id == 'sklearn_SVC':
         return svm.SVC(**model_params)
     elif model_id == 'sklearn_AdaBoostClassifier':
         return ensemble.AdaBoostClassifier(**model_params)
     elif model_id == 'sklearn_SGDClassifier':
         return linear_model.SGDClassifier(**model_params)
     elif model_id == 'sklearn_PassiveAggressiveClassifier':
         return linear_model.PassiveAggressiveClassifier(**model_params)
     elif model_id == 'sklearn_RidgeClassifier':
         return linear_model.RidgeClassifier(**model_params)
     elif model_id == 'sklearn_DummyClassifier':
         return dummy.DummyClassifier(**model_params)
     elif model_id == 'sklearn_KNeighborsClassifier':
         return neighbors.KNeighborsClassifier(**model_params)
     elif model_id == 'sklearn_DecisionTreeClassifier':
         return tree.DecisionTreeClassifier(**model_params)
     elif model_id == 'sklearn_LinearRegression':
         return linear_model.LinearRegression(**model_params)
     elif model_id == 'sklearn_LassoCV':
         return linear_model.LassoCV(**model_params)
     elif model_id == 'sklearn_RidgeCV':
         return linear_model.RidgeCV(**model_params)
     elif model_id == 'sklearn_Ridge':
         return linear_model.Ridge(**model_params)
     elif model_id == 'sklearn_DummyRegressor':
         return dummy.DummyRegressor(**model_params)
     elif model_id == 'sklearn_RandomForestRegressor':
         return ensemble.RandomForestRegressor(**model_params)
     elif model_id == 'sklearn_GradientBoostingRegressor':
         return ensemble.GradientBoostingRegressor(**model_params)
     elif model_id == 'sklearn_MLPRegressor':
         return neural_network.MLPRegressor(**model_params)
     elif model_id == 'sklearn_KNeighborsRegressor':
         return neighbors.KNeighborsRegressor(**model_params)
     elif model_id == 'sklearn_SVR':
         return svm.SVR(**model_params)
     elif model_id == 'sklearn_SGDRegressor':
         return linear_model.SGDRegressor(**model_params)
     elif model_id == 'sklearn_DecisionTreeRegressor':
         return tree.DecisionTreeRegressor(**model_params)
     return None
Example #4
def medDumb(X_train=[], X_test=[], y_train=[], y_test=[]):
    if len(X_train) == 0:
        return "Médiane (naïf)"  #string name
    else:
        dum = dummy.DummyRegressor(strategy='median')
        dum.fit(X_train, y_train)
        y_pred_dum = dum.predict(X_test)
        return np.sqrt(metrics.mean_squared_error(y_test,
                                                  y_pred_dum)), y_pred_dum
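A hedged usage sketch for medDumb, using synthetic data from sklearn.datasets.make_regression; it assumes numpy and sklearn.metrics are already imported as np and metrics, as the function above requires.

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=300, n_features=5, noise=10.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

print(medDumb())  # no data: returns the display name
rmse, y_pred = medDumb(X_train, X_test, y_train, y_test)
print(rmse)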
Example #5
def regBaseline(data):
    strategies = ['mean', 'median']
    baseDict = {}
    X, y, features = data.get_data(target=data.default_target_attribute,
                                   return_attribute_names=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    for strat in strategies:
        clf = dummy.DummyRegressor(strategy=strat)
        clf.fit(X_train, y_train)
        baseDict[strat] = clf.score(X_test, y_test)
    return baseDict, y
Example #6
    def __pred_randomly(self, X_train, y_train, X_test):

        dummyX_train = [[0] for x in X_train]
        dummyX_test = [[0] for x in X_test]

        clf = None

        if self.dataset.type == 'c':
            clf = dummy.DummyClassifier(strategy=self.dummy_strategy)
        else:
            clf = dummy.DummyRegressor()

        clf.fit(dummyX_train, y_train)
        return clf.predict(dummyX_test)
Example #7
 def zero_cost_model(self,X,y,add_to_model=False):
     if self.base_model._estimator_type=='classifier':
         model = dummy.DummyClassifier(strategy="prior")
     elif self.base_model._estimator_type=='regressor':
         model = dummy.DummyRegressor(strategy="mean")
     else: raise TypeError("sklearn Classifier or Regressor required!")
     cost = 0
     features = []
     model.fit(self.selectfeats(X,features),y)
     if add_to_model:
         self.model_costs.insert(0,cost)
         self.model_features.insert(0,features)
         self.models.insert(0,model)
     return (model, cost, features)
Example #8
 def fit_baseline(self, x, y):
     '''
     Fit the baseline for the MetaEstimator. That is, depending on the loss function,
     determine the optimal constant predictor based on the training outputs.
     '''
     
     # Determine if regression or classification problem
     if self.method_type is None:
         is_above = len(np.unique(y, axis=0)) > self.cutoff_categorical
         self.method_type = ('classif','regr')[is_above]
     
     # Fit a Dummy (constant) estimator
     if self.method_type == 'regr':
         self.fitted = dummy.DummyRegressor().fit(x, y)
     else:
         self.fitted = dummy.DummyClassifier().fit(x, y)
         self.classes = self.fitted.classes_  # reuse the fitted classifier instead of refitting
Example #9
from sklearn import preprocessing, dummy, svm
from sklearn import linear_model, neighbors, ensemble
print("PyPlot...", end='', flush=True)
import matplotlib.pyplot as plt
print("ALJI code...", end='', flush=True)
from Framing import getFrame, getEmpathCols
from ModelComparer import ModelComparer
print("Done!")
''' RUNNING OPTIONS FOR MODELS '''
visualizeCGI = True
folds = 5
scaler = preprocessing.StandardScaler(copy=False)
# scaler = preprocessing.MinMaxScaler(copy=False)

regressors = [
    dummy.DummyRegressor(),
    svm.LinearSVR(tol=1),
    svm.SVR(kernel='rbf', gamma='scale'),
    linear_model.Ridge(),
    linear_model.Lasso(),
    linear_model.ElasticNet(),
    ensemble.RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    ensemble.AdaBoostRegressor()
]

classifiers = [
    dummy.DummyClassifier(),
    svm.LinearSVC(tol=1),
    svm.SVC(kernel='rbf', gamma='scale'),
    neighbors.KNeighborsClassifier(),
Example #10
def fabrication_modele_feature_delay(input_X_Train,
                                     d_features,
                                     n_feature_cible,
                                     isRidge=False,
                                     isLasso=False,
                                     input_X_test=None):
    name = 'fabrication_modele_feature_delay'

    data = input_X_Train

    # Make sure the target feature is actually present
    if n_feature_cible not in data.columns:
        log_info('!!!! ERREUR dans {} : feature {} non presente'.format(
            name, n_feature_cible))
        return None, None

    tstamp1 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')

    # The relevant feature lists
    l_numerical = d_features['l_numerical']
    l_categoriel = d_features['l_categoriel']

    # Start
    log_info('-- DEBUT de {} : {} - {}'.format(name, 'Preparation', tstamp1))

    ## 1 - Fit the categorical encoder
    encoder = OneHotEncoder(sparse=True)
    #encoder.fit(data[l_categoriel])
    #### HACK: input_X_test is appended so that OneHotEncoder
    ## never runs into categories it has not seen yet !!!!

    #### I THINK THIS ADDS A LOT OF RUNTIME
    ###tmp_data = input_X_Train.copy()
    tmp_data = input_X_Train
    tmp_data = tmp_data.append(input_X_test)
    encoder.fit(tmp_data[l_categoriel])

    X_data = data[(l_numerical + l_categoriel)]
    Y_data = data[n_feature_cible]

    ## 3 - General modelling preparation
    ## 3_1 - Standardize the numerical features
    scaler = StandardScaler()

    #### Fit
    scaler.fit(X_data[l_numerical])
    #### Transform
    X_data_numerical = sparse.csr_matrix(scaler.transform(X_data[l_numerical]))

    ## 3_2 - Encode the categorical features
    X_data_categoriel = encoder.transform(X_data[l_categoriel])

    ## 3_3 - Build the optimized (sparse) data matrix
    #print('X_train_numerical.shape = ', X_train_numerical.shape)
    #print('X_train_categoriel.shape = ', X_train_categoriel.shape)

    Opt_X_data = sparse.hstack((X_data_numerical, X_data_categoriel))
    #Opt_X_test  = sparse.hstack(X_test_numerical, X_test_categoriel)

    tstamp2 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')
    log_info('-- FIN de {} : {} - {} -> {}\n'.format(name, 'Preparation',
                                                     tstamp2,
                                                     tstamp2 - tstamp1))

    ## 4 - Modelling
    ## 4_1 - Linear modelling
    log_info('-- DEBUT de {} : {} - {}'.format(name, 'Modélisation', tstamp2))

    tstamp_lr1 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    lr = LinearRegression()
    lr.fit(Opt_X_data, Y_data)
    tstamp_lr2 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')

    ## Naive (baseline) regression
    dum = dummy.DummyRegressor(strategy='mean')
    tstamp_dum1 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    dum.fit(Opt_X_data, Y_data)
    tstamp_dum2 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')

    tstamp3 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')
    log_info('-- FIN de {} : {} - {} -> {}\n'.format(name, 'Modélisation',
                                                     tstamp3,
                                                     tstamp3 - tstamp2))

    ## 5 - Save the results
    #N_Data = {'X_train': X_train, 'Y_train': Y_train,
    #          'X_test': X_test, 'Y_test': Y_test}
    F_Model_Optimisation = {'OneHotEncoder': encoder, 'StandardScaler': scaler}
    F_Model = {
        'LinearRegression': {
            'Model': lr,
            'Temps': tstamp_lr2 - tstamp_lr1
        }
    }
    F_Model['Naive'] = {'Model': dum, 'Temps': tstamp_dum2 - tstamp_dum1}

    if isRidge:
        ridge = RidgeCV(fit_intercept=False, cv=3)
        tstamp_ridge1 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        ridge.fit(Opt_X_data, Y_data)
        tstamp_ridge2 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        F_Model['RidgeCV'] = {
            'Model': ridge,
            'Temps': tstamp_ridge2 - tstamp_ridge1
        }

    if isLasso:
        lasso = LassoCV(fit_intercept=False, cv=3)
        tstamp_lasso1 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        lasso.fit(Opt_X_data, Y_data)
        tstamp_lasso2 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        F_Model['LassoCV'] = {
            'Model': lasso,
            'Temps': tstamp_lasso2 - tstamp_lasso1
        }

    return F_Model_Optimisation, F_Model
Example #11
    def _tune_classifiers(
            self,
            test_size,
            classifiers,
            min_pca,
            search_n_jobs):

        print('Tuning classifiers for output-feature "{0}" for path "{1}".'.format(
            self.output_feature.feature_name,
            self.path))

        if not self.subset_has_data('training'):
            print('    -> Missing training data...')
            raise KeyError('Missing training data')

        self.tune_params = {}
        self.tune_params['classifiers'] = {}

        if self.subset_has_data('testing'):
            X_tr = self.X
            y_tr = self.y
            X_te = self.data['testing']['X']
            y_te = self.data['testing']['y']
        else:
            # Create training and testing partition (validation
            # split is handled by the 'StratifiedKFold' object)
            X_tr, X_te, y_tr, y_te = skms.train_test_split(
                self.X,
                self.y,
                stratify=self.y,
                shuffle=True,
                random_state=0,
                test_size=test_size)

        print('    -> Train size: ({0}); Test size: ({1}) Number of classes ({2}).'.format(
            X_tr.shape[0],
            X_te.shape[0],
            len(self.classes)))

        if len(self.classes) > 1:

            for classifier_name, classifier_value in classifiers.items():

                print('    -> Tuning "{0}"'.format(classifier_name))

                # Search grid
                dict_grid = {}

                self.tune_params['classifiers'][classifier_name] = {
                    'results': {},
                    'parameters': {},
                    'best_estimator': None
                }

                # Add standard scaler
                steps = [
                    ('std', skp.StandardScaler())
                ]

                if min_pca is not None and X_tr.shape[0] > X_tr.shape[1] and X_tr.shape[1] > min_pca:
                    steps.append(
                        ('pca', skd.PCA(
                            random_state=0)))

                    dict_grid['pca__n_components'] = HierarchyElement.get_pca_nb_components(
                        min_pca, X_tr.shape[1], 3)

                if classifier_name == 'SGDClassifier':
                    steps.append(
                        (classifier_name, sklm.SGDClassifier(
                            shuffle=True,
                            random_state=0,
                            max_iter=1000,
                            penalty='l2',
                            loss='log',
                            class_weight='balanced',
                            n_jobs=2)))

                elif classifier_name == 'RandomForestClassifier':
                    steps.append(
                        (classifier_name, skle.RandomForestClassifier(
                            random_state=0,
                            max_depth=None,
                            class_weight='balanced',
                            n_jobs=2)))

                # Create a pipeline for the work to be done
                pipe = skpl.Pipeline(steps)

                for param_name, param_value in classifier_value.items():
                    # Add the search space to the grid
                    dict_grid['{0}__{1}'.format(
                        classifier_name, param_name)] = param_value

                # create the k-fold object
                kfold = skms.StratifiedKFold(
                    n_splits=5,
                    random_state=0,
                    shuffle=True)

                search = skms.GridSearchCV(
                    estimator=pipe,
                    param_grid=dict_grid,
                    scoring='f1_weighted',
                    refit=True,
                    cv=kfold,
                    n_jobs=2)

                # capture start time
                start_time = ti.time()

                search.fit(
                    X=X_tr,
                    y=y_tr)

                elapsed_time = dt.timedelta(
                    seconds=int(round(ti.time() - start_time)))

                # capture elapsed time
                self.tune_params['classifiers'][classifier_name]['fit_time'] = elapsed_time.total_seconds()

                # capture all tuning parameters
                self.tune_params['classifiers'][classifier_name]['parameters'].update(search.best_params_)

                # keep the best estimator
                self.tune_params['classifiers'][classifier_name]['best_estimator'] = search.best_estimator_

                # capture the scores
                self.tune_params['classifiers'][classifier_name]['results'] = {
                    'validation': search.best_score_,
                    'test': search.score(
                        X=X_te, 
                        y=y_te)
                }

                print('        -> Best validation score: {0:.4%}'.format(
                    self.tune_params['classifiers'][classifier_name]['results']['validation']))

                print('        -> Test score: {0:.4%}'.format(
                    self.tune_params['classifiers'][classifier_name]['results']['test']))

                print(
                    '        -> Tuning time: {0} ({1}s)'.format(elapsed_time, elapsed_time.total_seconds()))

        else:
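            # Only one class present in the target: skip the grid search and
            # fall back to a constant (always-zero) dummy estimator.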
            classifier_name = 'DummyRegressor'

            print('    -> Tuning "{0}"'.format(classifier_name))

            self.tune_params['classifiers'][classifier_name] = {}

            estimator = skpl.Pipeline(
                steps=[
                    (classifier_name, sky.DummyRegressor(
                        strategy='constant', 
                        constant=0))
                ])

            estimator.fit(
                X=X_tr,
                y=y_tr)

            self.tune_params['classifiers'][classifier_name]['results'] = {
                'validation': 1.0,
                'test': 1.0
            }

            self.tune_params['classifiers'][classifier_name]['fit_time'] = 0
            self.tune_params['classifiers'][classifier_name]['parameters'] = {}
            self.tune_params['classifiers'][classifier_name]['best_estimator'] = estimator

        # find the model with the best results.
        all_classifiers = list(self.tune_params['classifiers'].keys())
        all_results = [self.tune_params['classifiers'][classifier]
                       ['results']['test'] for classifier in all_classifiers]

        best_estimator_index = np.argmax(all_results)
        best_estimator_name = all_classifiers[best_estimator_index]

        best_estimator = self.tune_params['classifiers'][best_estimator_name]['best_estimator']

        # We need to check whether the best estimator implements the 'predict_proba' method.
        # If it does, we can 1) calibrate the estimator and 2) compute the optimized thresholds. 
        if ch.MulticlassClassifierOptimizer.optimizable_model(best_estimator):

            print('    -> Optimizing "{0}"'.format(best_estimator_name))

            # Create a calibrated estimator
            optimized_estimator = ch.MulticlassClassifierOptimizer(
                model=best_estimator,
                classes=self.classes,
                scoring_function=ch.BinaryClassifierHelper.f1_score)

            self.estimator = optimized_estimator.fit(
                X=X_tr,
                y=y_tr)

            self.tune_params['classifiers'][best_estimator_name]['results']['train_optimized'] = optimized_estimator.score(
                X=X_tr,
                y=y_tr)

            self.tune_params['classifiers'][best_estimator_name]['results']['test_optimized'] = optimized_estimator.score(
                X=X_te,
                y=y_te)
        else:
            self.estimator = self.tune_params['classifiers'][best_estimator_name]['best_estimator']

        self.tune_params['best_score'] = self.tune_params['classifiers'][best_estimator_name]['results']['test']
        self.tune_params['best_classifier'] = best_estimator_name

        print('    -> Best classifier is "{0}" with a test score of {1:.4%}'.format(
            best_estimator_name,
            self.tune_params['best_score']))

        if 'test_optimized' in self.tune_params['classifiers'][best_estimator_name]['results']:

            print('    -> Optimized test score: {0:.4%}'.format(
                self.tune_params['classifiers'][best_estimator_name]['results']['test_optimized']))
Example #12
 def __init__(self):
     self._regressor = dummy.DummyRegressor()
     self._regressor_name = 'mean'
     self._name = 'Naive Mean'
     self._color = 'purple'
     Model.__init__(self)
Example #13
def main():
    # Read raw data.
    # https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality
    raw_data = pd.read_csv('winequality-white.csv', sep=';')
    print('raw_data :\n', raw_data.head())

    # Extract data from dataset.
    x = raw_data[raw_data.columns[:-1]].values # Dataset: variables.
    y = raw_data['quality'].values # Dataset: labels.
    print('x :\n', x[:5])
    print('y :\n', y[:5])

    # Scale data to reduce weights.
    # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
    # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
    std_scale = preprocessing.StandardScaler().fit(x)
    x_scaled = std_scale.transform(x)
    std_scale = preprocessing.MinMaxScaler().fit(y.reshape(-1, 1))
    y_scaled = std_scale.transform(y.reshape(-1, 1)).ravel()

    for var, lbl in zip([x, x_scaled], ['not scaled', 'scaled']):
        fig, all_axis = plt.subplots(3, 4)
        for feat_idx in range(var.shape[1]):
            # variable alone.
            axis = all_axis.ravel()[feat_idx]
            axis.hist(var[:, feat_idx], bins=50)
            axis.set_title(raw_data.columns[feat_idx]+' - '+lbl, fontsize=14)
            # variable superimposed with others.
            last_axis = all_axis.ravel()[11]
            last_axis.hist(var[:, feat_idx], bins=50)
            last_axis.set_title('whole dataset - '+lbl, fontsize=14)
        plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.3, hspace=0.3)
        plt.show() # Show variable magnitude before / after scaling.

    # Split data set into training set and testing set.
    # https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning/4020631-exploitez-votre-jeu-de-donnees
    x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_scaled, test_size=0.3)

    # Set the hyper-parameters to test.
    param_grid = {
                'gamma': np.logspace(-2, 2, 6), # gamma coefficient between 10^-2 and 10^2.
                'alpha': np.logspace(-2, 2, 6), # alpha coefficient between 10^-2 and 10^2.
            }

    # Choose a score to optimize: r2 (coefficient of determination: regression score).
    score = 'r2'

    # Kernel ridge regressor: use cross validation to find the best hyper-parameters.
    clf = GridSearchCV(
        kernel_ridge.KernelRidge(kernel='rbf'), # Kernel ridge regressor.
        param_grid,     # hyper-parameters to test.
        cv=5,           # number of folds to test in cross validation.
        scoring=score   # score to optimize.
    )

    # Optimize best regressor on training set.
    clf.fit(x_train, y_train)

    # Print hyper-parameters.
    print("\nBest hyper-parameters on the training set:")
    print(clf.best_params_)

    # Print performances.
    print("\nCross validation results:")
    for mean, std, params in zip(clf.cv_results_['mean_test_score'], clf.cv_results_['std_test_score'], clf.cv_results_['params']):
        print("{} = {:.3f} (+/-{:.03f}) for {}".format(score, mean, std*2, params))

    # Print scores.
    # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308276-evaluez-un-algorithme-de-regression
    y_pred = clf.predict(x_train)
    print("\nBest regressor score on training set: {:.3f}".format(r2_score(y_train, y_pred)))
    y_pred = clf.predict(x_test)
    print("\nBest regressor score on testing set: {:.3f}".format(r2_score(y_test, y_pred)))

    # Compare with baseline dummy regressor.
    best_dclf, best_dclf_score = None, -float('inf')
    for s in ['mean', 'median', 'quantile']:
        dclf = dummy.DummyRegressor(strategy=s, quantile=0.25)
        dclf.fit(x_train, y_train)
        dclf_score = r2_score(y_test, dclf.predict(x_test))
        if dclf_score > best_dclf_score:
            best_dclf, best_dclf_score = dclf, dclf_score
    y_pred = best_dclf.predict(x_train)
    print("\nBest dummy regressor score on training set: {:.3f}".format(r2_score(y_train, y_pred)))
    y_pred = best_dclf.predict(x_test)
    print("\nBest dummy regressor score on testing set: {:.3f}".format(r2_score(y_test, y_pred)))
Example #14
# -*- coding: utf-8 -*-
"""
Created on Thu Mar  5 14:12:14 2020

@author: 766810
"""

from sklearn.datasets import make_regression, make_classification

X, y = make_regression()
from sklearn import dummy

fakeestimator = dummy.DummyRegressor(strategy='median')
fakeestimator.fit(X, y)
print(fakeestimator.predict(X)[:5])
Example #15
lr_best = lr_gs.best_estimator_
y_pred_lr = lr_best.predict(scaler.transform(X_test))
print(calculate_regression_metrics(y_test, y_pred_lr))

#Write the prediction of GLM model
meta_X["predictions"] = y_pred_lr
meta_X["labels"] = y_test
rev_output_df = meta_X.iloc[:, [0, 2, 4, 5]].copy()
rev_output_df.to_csv("../results/GLM_" + data_type_options[input_option] +
                     "_supervised_test_predictions.csv",
                     index=False)

# +
#Dummy mean regressor and median regressor
strategy = 'mean'
model = dummy.DummyRegressor(strategy=strategy)
model.fit(X_train, y_train)
y_pred_mean = model.predict(X_test)
calculate_regression_metrics(y_test, y_pred_mean)

strategy = 'median'
model = dummy.DummyRegressor(strategy=strategy)
model.fit(X_train, y_train)
y_pred_median = model.predict(X_test)
calculate_regression_metrics(y_test, y_pred_median)

# +
##Get results for SARS-COV-2
#big_X_test = pd.read_csv("../data/COVID-19/sars_cov_2_additional_drug_viral_interactions_to_predict_with_LS_v2.csv",header='infer',sep=",")
#total_length = len(big_X_test.columns)
#X_test = big_X_test.iloc[:,range(8,total_length)]
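calculate_regression_metrics comes from elsewhere in this project and is not shown; a minimal sketch of what such a helper might compute (an assumption, not the actual implementation):

import numpy as np
from sklearn import metrics

def calculate_regression_metrics(y_true, y_pred):
    # Hypothetical stand-in for the project's helper: a few common
    # regression metrics returned as a dict.
    return {
        'mae': metrics.mean_absolute_error(y_true, y_pred),
        'rmse': np.sqrt(metrics.mean_squared_error(y_true, y_pred)),
        'r2': metrics.r2_score(y_true, y_pred),
    }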
Example #16
from sklearn.datasets import make_regression, make_classification
from sklearn import dummy
x, y = make_regression()
fakeestimator = dummy.DummyRegressor()
#fakeestimator = dummy.DummyRegressor(strategy="median")
fakeestimator.fit(x, y)
print(fakeestimator.predict(x)[:5])
x, y = make_regression()
fakeestimator = dummy.DummyRegressor(strategy="median")
fakeestimator.fit(x, y)
print(fakeestimator.predict(x)[:5])
x, y = make_classification()
fakeestimator = dummy.DummyRegressor(strategy="median")
fakeestimator.fit(x, y)
print(fakeestimator.predict(x)[:5])
Example #17
        test_size=0.33,
        random_state=42)

    # reweight outliers
    weighter_scale = preprocessing.StandardScaler().fit(true_train)
    train_weight_outliers = 5.0 * np.abs(
        weighter_scale.transform(true_train)) + 1
    cv_weight_outliers = 5.0 * np.abs(weighter_scale.transform(true_cv)) + 1
    test_weight_outliers = 5.0 * np.abs(
        weighter_scale.transform(test_data['log_ret'].values)) + 1

    from sklearn.metrics import mean_squared_error
    scorer = mean_squared_error

    from sklearn import dummy
    dumreg = dummy.DummyRegressor()
    dumreg.fit(stock_train, true_train)
    dumguess = dumreg.predict(sctest_arr)
    plot_error(test_data['log_ret'].values, dumguess,
               'Dumb regression test set', scorer)
    plt.savefig('logregcorr_Dummy.png')

    clf = sklearn.linear_model.LinearRegression()
    clf.fit(stock_train, true_train)
    stock_data['predictchange'] = clf.predict(sctrain_arr)
    plot_error(test_data['log_ret'].values, clf.predict(sctest_arr),
               'linear regression test set', scorer)
    plt.savefig('logregcorr_linear.png')

    print "Decision Tree"
    from sklearn import tree
Example #18
        if best_i is None or \
           ( score - scores[best_i] > n_sigma * min(stderr, stderrs[best_i]) ):
            best_i = i
    return best_i


#############
"""
The simplest model: Median of *all* previous appraisals

This is also an elementary test of the pipeline setup and predictive interval wrapper.
"""

# (The selected feature here doesn't matter, use special 'dummy')
pl = Pipeline([('ColumnSelector', ColumnSelectTransformer('dummy')),
               ('Regressor', dummy.DummyRegressor(strategy='median'))])
r = PredictiveIntervalRegressor(pl,
                                n_resamplings=n_bootstrap,
                                save_models=False,
                                max_residuals=None)
r.fit([[0]] * len(df_global_train), df_global_train['LogAvg'].values)
global_median_model = {category: r for category in categories}

############
"""
Next-to-simplest model: Medians by category
"""

category_median_model = {}
for category in categories:
    pl = Pipeline([('ColumnSelector', ColumnSelectTransformer('dummy')),
Example #19
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, dummy, metrics
import pandas as pd
import numpy as np
from helpers import root_mean_squared_log_error

dataset = pd.read_csv('data/train_merged.csv')

# Extract the objective values
y = dataset['trip_duration'].values
# Delete irrelevant columns in training set
del dataset['trip_duration'], dataset["id"], dataset[
    "trip_duration_in_minutes"]
X = dataset.values

means = np.nanmean(X, axis=0)
nan_locations = np.where(np.isnan(X))
X[nan_locations] = np.take(means, nan_locations[1])

# Normalize X
X = preprocessing.scale(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

regressor = dummy.DummyRegressor()
regressor.fit(X_train, y_train)
print("RMSLE =", root_mean_squared_log_error(y_test,
                                             regressor.predict(X_test)))
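The helpers module is not shown; a minimal sketch of what root_mean_squared_log_error could look like (an assumption, not the actual helper):

import numpy as np
from sklearn import metrics

def root_mean_squared_log_error(y_true, y_pred):
    # RMSLE; assumes non-negative targets and predictions.
    return np.sqrt(metrics.mean_squared_log_error(y_true, y_pred))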
Example #20
# -

auto2.plot.scatter(x='horsepower', y='city_mpg')

# very simple
lr = linear_model.LinearRegression()
lr.fit(auto2[['horsepower']], auto2.city_mpg)
lr.score(auto2[['horsepower']], auto2.city_mpg)

ax = auto2.plot.scatter(x='horsepower', y='city_mpg')
xs = np.arange(40, 200)
ax.plot(xs, xs * lr.coef_ + lr.intercept_)

# Let's use all of the columns
# Baseline model - default strategy is to always predict mean
dm = dummy.DummyRegressor()
dm.fit(auto_X, auto_y)
dm.score(auto_X, auto_y)

# Score is the R2 score - coefficient of determination
# Usually between 0 and 1 - e.g. .92 means 92% of the variation in the answer is explained by the features
# 1 means 100% of the answer is explained by the features
lr = linear_model.LinearRegression()
lr.fit(auto_X, auto_y)
lr.score(auto_X, auto_y)

pd.Series(lr.coef_, auto_X.columns)

lr.intercept_

# ## Lab Data
Example #21
def fabrication_model_general(data, d_features, isRidge=False, isLasso=False):
    name = 'fabrication_model_general'

    tstamp1 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')

    # The relevant feature lists
    l_numerical = d_features['l_numerical']
    l_categoriel = d_features['l_categoriel']

    # Start
    log_info('-- DEBUT de {} : {} - {}'.format(name, 'Preparation', tstamp1))

    ## 1 - Fit the categorical encoder
    encoder = OneHotEncoder(sparse=True)
    encoder.fit(data[l_categoriel])

    ## I WANT TO KEEP ALL THE DATA IN THE REST OF THE PIPELINE
    #X_data = data[(l_numerical + l_categoriel)]
    X_data = data
    Y_data = data['ARR_DELAY']

    ## 2 - Split the data
    X_train, X_test, Y_train, Y_test = train_test_split(X_data,
                                                        Y_data,
                                                        test_size=0.2,
                                                        random_state=0)

    ## MODIFIED HERE
    X_train_bis = X_train[(l_numerical + l_categoriel)]

    ## 3 - General modelling preparation
    ## 3_1 - Standardize the numerical features
    scaler = StandardScaler()

    #### Fit
    #scaler.fit(X_train[l_numerical])
    scaler.fit(X_train_bis[l_numerical])
    #### Transform
    #X_train_numerical = sparse.csr_matrix(scaler.transform(X_train[l_numerical]))
    X_train_numerical = sparse.csr_matrix(
        scaler.transform(X_train_bis[l_numerical]))

    ## 3_2 - Encode the categorical features
    #X_train_categoriel = encoder.transform(X_train[l_categoriel])
    X_train_categoriel = encoder.transform(X_train_bis[l_categoriel])

    ## 3_3 - Build the optimized (sparse) data matrix
    #print('X_train_numerical.shape = ', X_train_numerical.shape)
    #print('X_train_categoriel.shape = ', X_train_categoriel.shape)

    Opt_X_train = sparse.hstack((X_train_numerical, X_train_categoriel))
    #Opt_X_test  = sparse.hstack(X_test_numerical, X_test_categoriel)

    tstamp2 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')
    log_info('-- FIN de {} : {} - {} -> {}\n'.format(name, 'Preparation',
                                                     tstamp2,
                                                     tstamp2 - tstamp1))

    ## 4 - Modelling
    ## 4_1 - Linear modelling
    log_info('-- DEBUT de {} : {} - {}'.format(name, 'Modélisation', tstamp2))

    ## LINEAR REGRESSION
    tstamp_lr1 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    lr = LinearRegression()
    lr.fit(Opt_X_train, Y_train)
    tstamp_lr2 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')

    ## NAIVE VERSION
    ## Naive (baseline) regression
    tstamp_dum1 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
    dum = dummy.DummyRegressor(strategy='mean')
    dum.fit(Opt_X_train, Y_train)
    tstamp_dum2 = datetime.strptime(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')

    tstamp3 = datetime.strptime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                '%Y-%m-%d %H:%M:%S')
    log_info('-- FIN de {} : {} - {} -> {}\n'.format(name, 'Modélisation',
                                                     tstamp3,
                                                     tstamp3 - tstamp2))

    ## 5 - Save the results
    N_Data = {
        'X_train': X_train,
        'Y_train': Y_train,
        'X_test': X_test,
        'Y_test': Y_test
    }
    N_Model_Optimisation = {'OneHotEncoder': encoder, 'StandardScaler': scaler}
    N_Model = {
        'LinearRegression': {
            'Model': lr,
            'Temps': tstamp_lr2 - tstamp_lr1
        }
    }
    N_Model['Naive'] = {'Model': dum, 'Temps': tstamp_dum2 - tstamp_dum1}

    if isRidge:
        ridge = RidgeCV(fit_intercept=False, cv=3)
        tstamp_ridge1 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        ridge.fit(Opt_X_train, Y_train)
        tstamp_ridge2 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        N_Model['RidgeCV'] = {
            'Model': ridge,
            'Temps': tstamp_ridge2 - tstamp_ridge1
        }

    if isLasso:
        tstamp_lasso1 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        lasso = LassoCV(fit_intercept=False, cv=3)
        lasso.fit(Opt_X_train, Y_train)
        tstamp_lasso2 = datetime.strptime(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        N_Model['LassoCV'] = {
            'Model': lasso,
            'Temps': tstamp_lasso2 - tstamp_lasso1
        }

    return N_Data, N_Model_Optimisation, N_Model
Example #22
std_scale = preprocessing.StandardScaler().fit(X_train)
X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)

## Gram matrix:
kmatrix = []
subtitles = []

## Training
score = 'neg_mean_squared_error'

## DummyRegressor (baseline)
if (dum):
    print("\n===== Dummy Regressor (Baseline 1) =====")
    rgs_dum = dummy.DummyRegressor(strategy='mean')

    print("Training...")
    rgs_dum.fit(X_train_std, y_train)

    print("Prediction:")
    y_test_pred = rgs_dum.predict(X_test_std)
    rmse_dum = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    print("\tRMSE = %0.3f" % rmse_dum)

## Linear Ridge Regressor
if (lRR):
    print("\n===== Linear Ridge Regressor =====")
    param_grid = {'alpha': np.logspace(-3, 3, 7)}
    rgs_lrr = model_selection.GridSearchCV(linear_model.Ridge(),
                                           param_grid=param_grid,