예제 #1
0
def plot_validation_curve(model,
                          x_train,
                          y_train,
                          h_param,
                          h_range,
                          k=10,
                          log_scale=False,
                          scorer=None):
    """
    Plot the mean cross-validated train and test scores of ``model`` for
    each value of one hyper-parameter.

    :param model: estimator handed to ``validation_curve``
    :param x_train: training features
    :param y_train: training target
    :param h_param: name of the hyper-parameter to vary (also used as the
        x-axis label)
    :param h_range: values of the hyper-parameter to evaluate
    :param k: value forwarded to ``validation_curve`` as ``cv``
    :param log_scale: when true, draw the curves with a logarithmic x-axis
    :param scorer: value forwarded to ``validation_curve`` as ``scoring``
    :return: None, the figure is displayed with ``plt.show()``
    """
    # param_name / param_range are keyword-only in recent scikit-learn
    # releases; passing them by keyword works on older releases as well.
    train_score, val_score = validation_curve(model,
                                              x_train,
                                              y_train,
                                              param_name=h_param,
                                              param_range=h_range,
                                              cv=k,
                                              scoring=scorer)
    draw = plt.semilogx if log_scale else plt.plot
    for legend, scores in (('train', train_score), ('test', val_score)):
        # Average over the folds (axis 1); the absolute value is plotted,
        # presumably so that negated ("neg_*") scorers read as positive.
        draw(h_range, np.abs(scores.mean(axis=1)), label=legend)
    plt.xlabel(h_param, labelpad=20)
    plt.ylabel('score', labelpad=20)
    plt.legend()
    plt.show()
예제 #2
0
def plot_learning_curve(model,
                        x_train,
                        y_train,
                        train_sizes_ratio=np.linspace(0.1, 1.0, 10),
                        k=10,
                        scorer=None):
    """
    Plot the mean cross-validated train and test scores of ``model``
    against the number of training samples used.

    :param model: estimator handed to ``learning_curve``
    :param x_train: training features
    :param y_train: training target
    :param train_sizes_ratio: value forwarded to ``learning_curve`` as
        ``train_sizes`` (defaults to ten values from 0.1 to 1.0)
    :param k: value forwarded to ``learning_curve`` as ``cv``
    :param scorer: value forwarded to ``learning_curve`` as ``scoring``
    :return: None, the figure is displayed with ``plt.show()``
    """
    N, train_score, val_score = learning_curve(model,
                                               x_train,
                                               y_train,
                                               train_sizes=train_sizes_ratio,
                                               cv=k,
                                               scoring=scorer)
    for legend, scores in (('train', train_score), ('test', val_score)):
        # Average over the folds (axis 1); the absolute value is plotted,
        # presumably so that negated ("neg_*") scorers read as positive.
        plt.plot(N, np.abs(scores.mean(axis=1)), label=legend)
    plt.xlabel('train_sizes', labelpad=20)
    # Label the y-axis the same way plot_validation_curve does.
    plt.ylabel('score', labelpad=20)
    plt.legend()
    plt.show()
예제 #3
0
def train_gridsearch(data, model, param_grid, metric, k=10, p=3, v=True):
    """
    Tune ``model`` with a cross-validated grid search and collect the
    evaluation results of the best estimator.

    :param data: mapping whose 'train' and 'test' entries each unpack into
        an ``(x, y)`` pair
    :param model: estimator to tune
    :param param_grid: parameter grid forwarded to ``GridSearchCV``
    :param metric: a single scoring value, or a list/tuple of scorer names;
        with several names the first one is used to refit the best model
    :param k: value forwarded to ``GridSearchCV`` as ``cv``
    :param p: number of decimals kept when rounding the scores
    :param v: when true, print the cross-validation scores (the flag is
        also forwarded to ``get_model_scores``)
    :return: dict holding 'gs' (fitted grid search), 'model' (best
        estimator), 'model_name', the entries returned by
        ``get_model_scores``, 'time', 'n_features' and 'learning_potential'
    """
    # Model name
    model_label = model_name(model)
    # Get training & testing data
    x_train, y_train = data['train']
    x_test, y_test = data['test']
    # With several metrics GridSearchCV needs to know which one selects
    # the model to refit: use the first one.
    multi_metric = isinstance(metric, (list, tuple))
    refit_cond = metric[0] if multi_metric else True
    # Build grid search
    gridsearch = GridSearchCV(model,
                              param_grid,
                              cv=k,
                              scoring=metric,
                              refit=refit_cond)
    # Time the model training
    start_training = datetime.datetime.now()
    gridsearch.fit(x_train, y_train)
    training_time = datetime.datetime.now() - start_training
    training_time_str = format_run_time(training_time)
    # Trained model
    trained_model = gridsearch.best_estimator_
    # Cross-validation scores must be read at best_index_, i.e. for the
    # parameter combination that produced best_estimator_; index 0 would
    # only be the first candidate of the grid.
    best = gridsearch.best_index_
    cv_results = gridsearch.cv_results_
    cv_scores = {}
    if multi_metric:
        for scorer_label in metric:
            mean_score = cv_results[f'mean_test_{scorer_label}'][best]
            if scorer_label.startswith('neg'):
                # 'neg_mean_squared_error' -> 'mse', reported as a
                # positive value
                formatted_label = "".join(
                    w[0] for w in scorer_label.replace('neg_', '').split('_'))
                cv_scores[formatted_label] = round(np.abs(mean_score), p)
            else:
                cv_scores[scorer_label] = round(mean_score, p)
    else:
        cv_scores[metric] = round(cv_results['mean_test_score'][best], p)
    # Get scores from testing set
    testing_set_scores = get_model_scores(trained_model, x_test, y_test,
                                          list(cv_scores.keys()), p, v)
    # Display cross validation mean scores
    if v:
        print_score_results(cv_scores, set_type='train')
    # GridSearchCV & model instances (with model name)
    model_data = {
        'gs': gridsearch,  # GridSearchCV trained instance
        'model': trained_model,  # Model trained instance
        'model_name': model_label,  # Model name
    }
    # Additional evaluation data
    additional_evaluation_data = {
        'time': training_time_str,  # Training time
        'n_features': x_train.shape[1],  # Number of feature columns
        'learning_potential': None,  # Placeholder, not computed here
    }
    # Merge the dictionaries into a single result
    results = dict(**model_data, **testing_set_scores,
                   **additional_evaluation_data)
    return results
예제 #4
0
 def plot_factorial_planes(self,
                           n_plan=None,
                           X_projected=None,
                           labels=None,
                           alpha=1,
                           illustrative_var=None,
                           illustrative_var_title=None,
                           save_as_img=False,
                           plot_size=(10, 8)):
     """
     Scatter the projected observations on successive factorial planes
     (F1, F2), (F3, F4), ... with one figure per plane.

     :param n_plan: upper bound (exclusive) on the index of the first axis
         of each plane; defaults to ``self.default_factorial_plan_nb``
     :param X_projected: projected data, indexed as ``[:, axis]``;
         defaults to ``self.X_projected``
     :param labels: optional per-point labels, looked up as ``labels[i]``
         and written next to each point
     :param alpha: opacity forwarded to ``plt.scatter``
     :param illustrative_var: optional per-point values; points are drawn
         as one scatter series per distinct value, with a legend
     :param illustrative_var_title: optional legend title
     :param save_as_img: when true, save each figure as
         ``factorial_plan_<plane number>.jpg``
     :param plot_size: figure size forwarded to ``plt.figure``
     :return: None, each figure is shown with ``plt.show(block=False)``
     """
     X_projected = self.X_projected if X_projected is None else X_projected
     factorial_plan_nb = self.default_factorial_plan_nb if n_plan is None else n_plan
     # Convert once instead of on every plane
     groups = None if illustrative_var is None else np.array(illustrative_var)
     for d1 in range(0, factorial_plan_nb, 2):
         d2 = d1 + 1
         # Skip planes whose second axis is not an available component
         if d2 >= self.n_comp:
             continue
         plt.figure(figsize=plot_size)
         # Display data points
         if groups is None:
             plt.scatter(X_projected[:, d1],
                         X_projected[:, d2],
                         alpha=alpha)
         else:
             for value in np.unique(groups):
                 # Row indices of the points carrying this value
                 selected = np.where(groups == value)[0]
                 plt.scatter(X_projected[selected, d1],
                             X_projected[selected, d2],
                             alpha=alpha,
                             label=value)
             plt.legend(title=illustrative_var_title)
         # Display data points labels
         if labels is not None:
             for i, (x, y) in enumerate(X_projected[:, [d1, d2]]):
                 plt.text(x,
                          y,
                          labels[i],
                          fontsize='12',
                          ha='center',
                          va='bottom')
         # Symmetric limits with a 10% margin around the largest coordinate
         boundary = np.max(np.abs(X_projected[:, [d1, d2]])) * 1.1
         plt.xlim([-boundary, boundary])
         plt.ylim([-boundary, boundary])
         # Zero lines: axhline/axvline span the whole axes whatever the
         # limits, unlike a fixed [-100, 100] segment
         plt.axhline(0, color='grey', ls='--')
         plt.axvline(0, color='grey', ls='--')
         # Axes labels with % explained variance
         plt.xlabel('F{} ({}%)'.format(d1 + 1,
                                       round(100 * self.evr[d1], 1)),
                    labelpad=20)
         plt.ylabel('F{} ({}%)'.format(d2 + 1,
                                       round(100 * self.evr[d2], 1)),
                    labelpad=20)
         plt.title("Projection des individus (sur F{} et F{})".format(
             d1 + 1, d2 + 1),
                   pad=20)
         if save_as_img:
             plt.tight_layout()
             # Planes are numbered 1, 2, 3, ... (d1 is 0, 2, 4, ...)
             plt.savefig('factorial_plan_{}.jpg'.format(d1 // 2 + 1))
         plt.show(block=False)