Example #1
def get_grid_scores(scores, parameters, n_samples, n_folds, iid):
    score_params_len = list(zip(scores, parameters, n_samples))
    n_fits = len(score_params_len)

    scores = []
    grid_scores = []
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, parameters, this_n_test_samples in \
                score_params_len[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))
    return grid_scores
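All of the examples on this page build `_CVScoreTuple` objects, a helper that older scikit-learn versions (before 0.20) kept in `sklearn.grid_search`. If you need a stand-in while experimenting with these snippets, a plain namedtuple with the same three fields is usually enough; this is a minimal sketch under that assumption, not the exact scikit-learn class:

# Minimal stand-in for sklearn.grid_search._CVScoreTuple (assumption: only
# the three classic fields are needed by the surrounding code).
from collections import namedtuple

import numpy as np

_CVScoreTuple = namedtuple(
    '_CVScoreTuple',
    ('parameters', 'mean_validation_score', 'cv_validation_scores'))

# One grid point with its per-fold scores:
fold_scores = np.array([0.81, 0.79, 0.84])
print(_CVScoreTuple({'C': 1.0}, fold_scores.mean(), fold_scores))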
Example #2
        def fit_func(**params):
            params = apply_transforms(params, transforms)
            base_id = len(cv_scores) * len(cv)

            scores = PSOSearch.cross_val_score(base_index=base_id,
                                               estimator=est,
                                               parameters=params,
                                               loader=loader,
                                               cv=cv,
                                               scorer=scorer,
                                               fit_callback=fit_callback,
                                               cacher=cacher,
                                               callback=callback,
                                               mapper=mapper)

            cv_score = _CVScoreTuple(params, np.mean(scores), scores)
            cv_scores[base_id] = cv_score
            best_score_params = cv_scores.values()[np.argmax(
                np.array(
                    map(lambda score: score.mean_validation_score,
                        cv_scores.itervalues())))]
            best_score_mean = best_score_params.mean_validation_score
            best_score_std = np.std(best_score_params.cv_validation_scores)
            if callback:
                callback(description='%.3f+-%.3f' %
                         (best_score_mean, best_score_std))
            return scores.mean()
Example #3
def pick_best_parameters(score_len_params, n_folds, iid):
    n_fits = len(score_len_params)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, parameters in \
                score_len_params[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores,
                  key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    return best
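A hypothetical call to `pick_best_parameters`, just to make the expected input shape explicit: `score_len_params` is a flat list of `(score, n_test_samples, parameters)` tuples, grouped so that each candidate's folds are consecutive. The numbers below are made up:

# Two candidate settings x 3 folds each, in fold-major order per candidate.
fold_results = [
    (0.80, 50, {'C': 1.0}), (0.82, 50, {'C': 1.0}), (0.78, 50, {'C': 1.0}),
    (0.85, 50, {'C': 10.0}), (0.83, 50, {'C': 10.0}), (0.86, 50, {'C': 10.0}),
]
best = pick_best_parameters(fold_results, n_folds=3, iid=True)
print(best.parameters, best.mean_validation_score)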
Example #4
        def func(x):
            parameters = self._list_to_grid_point(x, parameter_iterable)

            n_test_samples = 0
            score = 0
            all_scores = []

            for train, test in cv:
                this_score, this_n_test_samples, _, parameters = \
                        _fit_and_score(clone(base_estimator), X, y, self.scorer_,
                                       train, test, self.verbose, parameters,
                                       self.fit_params, return_parameters=True,
                                       error_score=self.error_score)
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score

            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)

            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))

            #print 'In func:', x, score
            return score
Example #5
def pick_best_parameters(score_len_params, n_folds, iid):
    n_fits = len(score_len_params)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, parameters in \
                score_len_params[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    return best
Example #6
    def grid_scores_(self):
        grid_scores = list()

        scores_per_config = defaultdict(list)
        config_list = list()

        for run_key in self.runhistory_.data:
            run_value = self.runhistory_.data[run_key]

            config_id = run_key.config_id
            cost = run_value.cost

            if config_id not in config_list:
                config_list.append(config_id)

            scores_per_config[config_id].append(cost)

        for config_id in config_list:
            scores = [1 - score for score in scores_per_config[config_id]]
            mean_score = np.mean(scores)
            config = self.runhistory_.ids_config[config_id]

            grid_score = _CVScoreTuple(config.get_dictionary(), mean_score,
                                       scores)
            grid_scores.append(grid_score)

        return grid_scores
Example #7
    def fit_ipp(self, X, y, grid):
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        if self.grid_parallel:
            scores, grid_scores = grid_cv_scores(self.estimator, X, y, grid, self.scoring, self.cv,
                    self.profile, self.n_jobs, self.verbose)
        else:
            scores = []
            # grid =  grid_search.ParameterGrid(self.param_grid)
            grid_scores = []
            for parameters in grid:
                self.estimator.set_params(**parameters)
                scores_cv = cross_val_score(self.estimator, X, y, self.scoring, self.cv, profile=self.profile)
                scores.append(np.array(scores_cv).mean())
                grid_scores.append(grid_search._CVScoreTuple(
                    parameters,
                    scores_cv.mean(),
                    scores_cv))

        max_idx = np.array(scores).argmax()
        self.best_estimator_ = self.estimator.set_params(**list(grid)[max_idx])
        self.best_params_ = list(grid)[max_idx]
        self.scores_ = scores
        self.best_score_ = np.array(scores).max()
        self.grid_scores_ = grid_scores

        if self.refit:
            self.best_estimator_.fit(X, y)

        return self
Example #8
def grid_cv_scores(estimator, X, y, grid, scoring=None, cv=10, profile='net', n_jobs=-1, verbose=None):
    input_sets = []
    if type(cv) == int:
        cv = cross_validation.KFold(len(X), n_folds=cv)
    for parameters in grid:
        estimator1 = clone(estimator)
        input_sets.append({
            'estimator':estimator1.set_params(**parameters),
            'X':X,
            'y':y,
            'scoring':scoring,
            'cv':cv
            })
    dview = random_rc(profile=profile, n_jobs=n_jobs, n_executed_jobs=len(input_sets))
    results = dview.map_sync(scores_out, input_sets)
    dview.client.close()
    scores = []
    grid_scores = []
    for ii in range(len(grid)):
        scores.append(results[ii].mean())
        grid_scores.append(grid_search._CVScoreTuple(
            list(grid)[ii],
            results[ii].mean(),
            results[ii]))
    return scores, grid_scores
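`random_rc` and `scores_out` are project-specific helpers that are not shown here. For `grid_cv_scores` to work, `scores_out` only has to unpack one input dict and return the per-fold scores; a rough sketch of what such a helper could look like (an assumption, not the original implementation):

# Assumed shape of the missing scores_out helper: run one cross-validation
# and hand back the per-fold scores as an array.
import numpy as np
from sklearn.cross_validation import cross_val_score  # old-style sklearn, as in the snippet

def scores_out(input_set):
    scores = cross_val_score(input_set['estimator'], input_set['X'],
                             input_set['y'], scoring=input_set['scoring'],
                             cv=input_set['cv'])
    return np.asarray(scores)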
Example #9
    def _fit(self, rdd, labeler, parameter_iterable):
        if self.n_duplicates == 1:
            self.partitioner = FlySplit(self.n_splits)
        else:
            self.partitioner = FlyDuplicate(self.n_duplicates)

        rdd = self.partitioner.prepare_rdd(rdd)
        base_estimator = clone(self.estimator)
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        rdd_partition_num = rdd.get_partitions()
        all_params = list(parameter_iterable)
        parameters = even_split(all_params, rdd_partition_num)

        out = rdd.mapPartitionsWithIndex(
            self._fit_partitions(labeler, parameters)).collect()
        # Out is a list of triplet: score, parameters, n_test_samples

        out = filter(None, out)
        out.sort(key=lambda x: all_params.index(x[1]))
        n_fits = len(out)
        n_folds = self.cv or 3

        scores = list()
        grid_scores = list()
        for grid_start in xrange(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, parameters, this_n_test_samples in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        best_estimator = clone(base_estimator).set_params(**best.parameters)
        self.best_estimator_ = best_estimator

        return self
Example #10
def cached_results_to_cvscores(cache_results):
    results_by_params = defaultdict(list)
    for score, test_set_size, time, params in cache_results:
        results_by_params[frozenset(map(map_param,
                                        params.iteritems()))].append(score)

    grid_results = []
    for set_params, all_scores in results_by_params.iteritems():
        grid_results.append(
            _CVScoreTuple(dict(set_params), np.mean(all_scores),
                          np.array(all_scores)))
    return grid_results
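`map_param` is another helper that is not shown. All `cached_results_to_cvscores` needs from it is a hashable `(name, value)` pair per parameter, so that the frozenset of pairs can act as a dictionary key and later be turned back into a dict. A minimal assumed version:

# Assumed helper: make each (name, value) parameter item hashable so a
# frozenset of items can be used as a key and rebuilt with dict(...).
def map_param(item):
    name, value = item
    if isinstance(value, list):
        value = tuple(value)  # lists are unhashable; tuples preserve the values
    return name, value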
Example #11
    def __call__(self, params):
        p = dict()
        invalid_parameters = False
        failed_power = 3.0
        for i, name in enumerate(self.parameterNames):
            p[name] = params[i]
            if self.parameters_restrictions and name in self.parameters_restrictions:
                if not self.parameters_restrictions[name](params[i]):
                    invalid_parameters = True
                    failed_power *= params[i]
        p.update(self.defaultParameters)
        print p
        if invalid_parameters:
            print 'invalid: ', -abs(failed_power)
            return -abs(failed_power)
        clf = self.classifierFactory(**p)
        scores = cross_validation.cross_val_score(clf, self.trainData, self.trainLabel, cv=self.cv, scoring='f1')
        print scores.mean()
        #todo: think about putting all scores to array/list
        self.grid_scores_.append(_CVScoreTuple(p, scores.mean(), scores))
        if self.best_score_ is None or self.best_score_ < scores.mean():
            self.best_score_ = scores.mean()
            self.best_params_ = p
        return scores.mean()
Example #12
    def fit(self, X, y=None, x_is_index=False):
        """
        fit creates a task for every pair of folds and combination of hyperparameters in the grid
        it then distributes the tasks to ipyparallel view and waits for completion
        :param X: ndarray of data
        :param y: ndarray of target variables
        :param x_is_index: boolean variable to indicate that X is not the data itself,
            but the index of the data to be used on remote machines.
            Useful when sending the data by network is unfeasible
        """

        if not self.loader:
            self.loader = lambda: (X, y)

        parameter_iterable = ParameterGrid(self.param_grid)
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)

        if x_is_index and self.loader is None:
            raise ValueError('no loader given')

        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        train_test_parameters = ((train, test, apply_transforms(parameters, self.transforms)) \
                                 for parameters in parameter_iterable
                                 for train, test in cv)

        length = len(parameter_iterable) * len(cv)
        if self.callback:
            self.callback(len(self.cacher), length)

        if x_is_index:
            X_to_pass = X
            y_to_pass = y if self.loader is None else None
        else:
            if self.loader is not None:
                X_to_pass = None
                y_to_pass = None
            else:
                X_to_pass = X
                y_to_pass = y

        # print('sequences')

        # sequences = [
        #     train_test_parameters,
        #     [clone(base_estimator)] * length,
        #     [X_to_pass] * length,
        #     [y_to_pass] * length,
        #     [self.verbose] * length,
        #     [self.fit_params] * length,
        #     [True] * length,
        #     [self.scorer_] * length,
        #     [x_is_index] * length,
        # ]

        f = partial(GridSearchCVParallel.my_fit_and_score,
                    estimator=base_estimator,
                    X=X_to_pass,
                    y=y_to_pass,
                    fit_params=self.fit_params,
                    scorer=self.scorer_,
                    x_is_index=x_is_index,
                    loader=self.loader,
                    fit_callback=self.fit_callback)

        iterable = itertools.ifilter(lambda (i, ttp): i not in self.cacher,
                                     enumerate(train_test_parameters))

        results_by_params = defaultdict(list)
        for id, result in self.cacher.iteritems():
            score, test_size, time, params = result
            results_by_params[frozenset(map(map_param,
                                            params.iteritems()))].append(score)

        try:
            for index, result in self.mapper(f, iterable):
                self.cacher[index] = result
                score, test_size, time, params = result
                results_by_params[frozenset(map(
                    map_param, params.iteritems()))].append(score)
                if self.callback:
                    best_scores = next(
                        iter(
                            sorted(itertools.ifilter(
                                lambda scores: len(scores) == len(cv),
                                results_by_params.values()),
                                   key=lambda scores: np.mean(scores),
                                   reverse=True)), [0])

                    self.callback(1,
                                  length,
                                  description='%.3f+-%.3f' %
                                  (np.mean(best_scores), np.std(best_scores)))
        except Exception as e:
            print(e)
            e_type, e_value, e_tb = sys.exc_info()
            traceback.print_tb(e_tb)

        # assert len(self.cacher) == length and (np.array(self.cacher.keys()) == np.arange(length)).all()

        # out = self.cacher.values()
        #
        # # Out is a list of triplet: score, estimator, n_test_samples
        # n_fits = len(out)
        # n_folds = len(cv)
        #
        # scores = list()
        # grid_scores = list()
        # for grid_start in range(0, n_fits, n_folds):
        #     n_test_samples = 0
        #     score = 0
        #     all_scores = []
        #     for this_score, this_n_test_samples, _, parameters in \
        #             out[grid_start:grid_start + n_folds]:
        #         all_scores.append(this_score)
        #         if self.iid:
        #             this_score *= this_n_test_samples
        #             n_test_samples += this_n_test_samples
        #         score += this_score
        #     if self.iid:
        #         score /= float(n_test_samples)
        #     else:
        #         score /= float(n_folds)
        #     scores.append((score, parameters))
        #     # TODO: shall we also store the test_fold_sizes?
        #     grid_scores.append(_CVScoreTuple(
        #         parameters,
        #         score,
        #         np.array(all_scores)))

        grid_scores = []
        for set_params, all_scores in results_by_params.iteritems():
            grid_scores.append(
                _CVScoreTuple(dict(set_params), np.mean(all_scores),
                              np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        print(len(grid_scores))

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #13
    def _fit(self, Z, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        cv = self.cv
        cv = _check_cv(cv, Z)

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch, backend="threading"
        )(
            delayed(_fit_and_score)(clone(base_estimator), Z, self.scorer_,
                                    train, test, self.verbose, parameters,
                                    self.fit_params, return_parameters=True,
                                    error_score=self.error_score)
            for parameters in parameter_iterable
            for train, test in cv)

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            best_estimator.fit(Z, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #14
            ''' prediction '''
            clf.fit(Xtr, ytr)
            yhat = clf.predict_proba(Xts)
            # print yhat.shape
            auc = compute_roc_auc_score_label_safe(yts, yhat[:, 1])
            auc_cv[i] = auc
            auc_cv_rk[i, r] = auc

            # print 'iter: ', i, ' auc= ', auc

        # print param, auc_cv.mean(), auc_cv.ravel()
        # print res_selected_cv
        # print [s for i, s in enumerate(aFeatNames) if res_selected_cv[i] > 0]
        # print 'selected features: ', sum(res_selected_cv > 0)
        grid_scores.append(
            _CVScoreTuple(param, auc_cv.mean(), np.array(auc_cv)))
        # print param, auc_cv.mean(), auc_cv.std()

    # print grid_scores
    best = sorted(grid_scores,
                  key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    best_params_ = best.parameters
    best_score_ = best.mean_validation_score

    # grid_search = GridSearchCV(pipe, parameters, verbose=1, n_jobs=2, cv=k_feat)
    # grid_search = GridSearchCV(clf, parameters, verbose=1, cv=cv, scoring='roc_auc')
    # grid_search.fit(Xt, yt)

    # print('Best features:', grid_search.best_estimator_.steps[0][1].k_feature_idx_)
    print("Best score: %0.3f" % best_score_)
Example #15
    def _fit(self, depthmaps, offset_points_projected, direction_vectors, true_joints, parameter_iterable):

        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(depthmaps)
        
        if _num_samples(offset_points_projected) != n_samples:
            raise ValueError('offset_points_projected has a different number '
                                'of samples ({0}) than data (depthmaps: {1} samples)'.format(_num_samples(offset_points_projected), n_samples))
        
        if _num_samples(direction_vectors) != n_samples:
            raise ValueError('direction_vectors has a different number '
                                'of samples ({0}) than data (depthmaps: {1} samples)'.format(_num_samples(direction_vectors), n_samples))
        
        cv = _check_cv(cv, n_samples)
            
        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv)))
                      
        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(_fit_and_score)(clone(base_estimator), depthmaps, offset_points_projected,
                                    direction_vectors, true_joints, self.scorer_,
                                    train, test, self.verbose, parameters,
                                    self.fit_params, return_parameters=True,
                                    error_score=self.error_score)
                for parameters in parameter_iterable
                for train, test in cv)
        
        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            best_estimator.fit(depthmaps, offset_points_projected, direction_vectors, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #16
def _fit(self, X, y, parameter_iterable, en_celery=False):
  """Actual fitting,  performing the search over parameters."""

  estimator = self.estimator
  cv = self.cv
  self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

  n_samples = _num_samples(X)
  X, y = indexable(X, y)

  if y is not None:
      if len(y) != n_samples:
          raise ValueError('Target variable (y) has a different number '
                           'of samples (%i) than data (X: %i samples)'
                           % (len(y), n_samples))
  cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

  if self.verbose > 0:
      if isinstance(parameter_iterable, Sized):
          n_candidates = len(parameter_iterable)
          print("Fitting {0} folds for each of {1} candidates, totalling"
                " {2} fits".format(len(cv), n_candidates,
                                   n_candidates * len(cv)))

  base_estimator = clone(self.estimator)

  pre_dispatch = self.pre_dispatch

  if en_celery:
    out = []
    timestamp = datetime.now().strftime("%Y%m%d%H%M%s")
    key = "sample_%s_%s" % (timestamp, int(round(random.random(), 8)*1e8))
    red.set(key, pickle.dumps({'X': X, 'y': y}))
    grp = group(cjobs.fas_mp.s(clone(base_estimator), key, self.scorer_,
                                train, test, self.verbose, parameters,
                                self.fit_params, return_parameters=True,
                                error_score=self.error_score)
            for parameters in parameter_iterable
            for train, test in cv)()
    out = grp.get()
    red.delete(key)
  else:
    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(
        delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                train, test, self.verbose, parameters,
                                self.fit_params, return_parameters=True,
                                error_score=self.error_score)
            for parameters in parameter_iterable
            for train, test in cv)

  # Out is a list of triplet: score, estimator, n_test_samples
  n_fits = len(out)
  n_folds = len(cv)

  scores = list()
  grid_scores = list()
  for grid_start in range(0, n_fits, n_folds):
      n_test_samples = 0
      score = 0
      all_scores = []
      for this_score, this_n_test_samples, _, parameters in \
              out[grid_start:grid_start + n_folds]:
          all_scores.append(this_score)
          if self.iid:
              this_score *= this_n_test_samples
              n_test_samples += this_n_test_samples
          score += this_score
      if self.iid:
          score /= float(n_test_samples)
      else:
          score /= float(n_folds)
      scores.append((score, parameters))
      # TODO: shall we also store the test_fold_sizes?
      grid_scores.append(_CVScoreTuple(
          parameters,
          score,
          np.array(all_scores)))
  # Store the computed scores
  self.grid_scores_ = grid_scores

  # Find the best parameters by comparing on the mean validation score:
  # note that `sorted` is deterministic in the way it breaks ties
  best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                reverse=True)[0]
  self.best_params_ = best.parameters
  self.best_score_ = best.mean_validation_score

  if self.refit:
      # fit the best estimator using the entire dataset
      # clone first to work around broken estimators
      best_estimator = clone(base_estimator).set_params(
          **best.parameters)
      if y is not None:
          best_estimator.fit(X, y, **self.fit_params)
      else:
          best_estimator.fit(X, **self.fit_params)
      self.best_estimator_ = best_estimator
  return self
Example #17
    def _extendedFit(self, X, y, parameter_iterable):
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print(
                    "Fitting {0} folds for each of {1} candidates, totalling"
                    " {2} fits".format(len(cv), n_candidates,
                                       n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(_extended_fit_and_score)(clone(base_estimator),
                                                 X,
                                                 y,
                                                 self.scorer_,
                                                 train,
                                                 test,
                                                 self.verbose,
                                                 parameters,
                                                 self.fit_params,
                                                 return_parameters=True,
                                                 error_score=self.error_score)
                for parameters in parameter_iterable for train, test in cv)

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        grid_extras = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            all_extras = []
            for this_score, this_n_test_samples, _, parameters, extra in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                all_extras.append(extra)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))
            grid_extras.append(all_extras)
        # Store the computed scores
        self.grid_scores_ = grid_scores
        self.extras_ = grid_extras

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            print "Refitting best estimator"
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #18
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        param_grid = [(parameters, train, test)
                      for parameters in parameter_iterable
                      for (train, test) in cv]
        # Because the original python code expects a certain order for the elements, we need to
        # respect it.
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        scorer = self.scorer_
        verbose = self.verbose
        fit_params = self.fit_params
        error_score = self.error_score
        fas = _fit_and_score

        def fun(tup):
            (index, (parameters, train, test)) = tup
            local_estimator = clone(base_estimator)
            local_X = X_bc.value
            local_y = y_bc.value
            res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose,
                                  parameters, fit_params,
                                  return_parameters=True, error_score=error_score)
            return (index, res)
        indexed_out0 = dict(par_param_grid.map(fun).collect())
        out = [indexed_out0[idx] for idx in range(len(param_grid))]

        X_bc.unpersist()
        y_bc.unpersist()

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #19
def _fit(self, X, y, parameter_iterable):
    """Actual fitting,  performing the search over parameters."""
    estimator = self.estimator
    foldsForEstimator = {}
    cv = self.cv

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)' %
                             (len(y), n_samples))

    # Splits the data based on provided cross-validation splitting strategy.
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling \
                {2} fits".format(len(cv), n_candidates,
                                 n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    # Change from original scikit code: adding a new argument,
    # foldsForEstimator, to the _fit_and_score function to track metadata
    # for each estimator, for each fold.
    # _fit_and_score fits the estimator and computes the score for a given
    # data-split, for given parameters.
    out = Parallel(n_jobs=self.n_jobs,
                   verbose=self.verbose,
                   pre_dispatch=pre_dispatch)(
                       delayed(_fit_and_score)(clone(base_estimator),
                                               X,
                                               y,
                                               self.scorer_,
                                               train,
                                               test,
                                               self.verbose,
                                               parameters,
                                               self.fit_params,
                                               foldsForEstimator,
                                               return_parameters=True,
                                               error_score=self.error_score)
                       for parameters in parameter_iterable
                       for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    # Computes the scores for each of the folds, for all the possible
    # parameters, and stores them in grid_scores.
    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in out[
                grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores,
                  key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    else:
        # If refit is false, best_estimator_ is unavailable and further
        # predictions can't be made on this instance.
        raise Warning(
            "Note: Refit has been set to false, which makes it impossible to "
            "make predictions using this GridSearchCV instance after fitting. "
            "Change refit to true to enable this")

    # Change from original scikit code:
    # Populate new field with necessary attributes for storing
    # cross-validation event
    self.grid_cv_event = [
        X, foldsForEstimator, 0,
        type_of_target(y), self.best_estimator_, self.best_estimator_, n_folds
    ]
    return self
Example #20
                    param=param,
                    key=key,
                    y_hat_valid_precomputed=d_yhat_valid_to_save_computation,
                    auc_valid_precomputed=d_auc_valid_to_save_computation)

                clf_param_key = create_unique_key_param_patient(key, param)
                d_auc_valid_to_save_computation[clf_param_key] = out[0]
                d_yhat_valid_to_save_computation[clf_param_key] = out[1]
                auc_single_cv[:, key - 1] = out[0]
            ''' 2) add together prediction from each patient and compute AUC across all patients '''
            auc_all_p = compute_auc_cv_for_all_p(
                d_data_train, d_yhat_valid_to_save_computation, Kinner, param,
                prob_calib_alg)

            gs_1.append(
                _CVScoreTuple(param, auc_single_cv[:, 0].mean(),
                              np.array(auc_single_cv[:, 0])))
            gs_2.append(
                _CVScoreTuple(param, auc_single_cv[:, 1].mean(),
                              np.array(auc_single_cv[:, 1])))
            gs_3.append(
                _CVScoreTuple(param, auc_single_cv[:, 2].mean(),
                              np.array(auc_single_cv[:, 2])))
            grid_scores_all.append(
                _CVScoreTuple(param, auc_all_p.mean(), np.array(auc_all_p)))

        best = sorted(grid_scores_all,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        best_params_ = best.parameters
        best_score_ = best.mean_validation_score
Example #21
    def fit(self, X, y=None, x_is_index=False, X_name='X', y_name='y'):

        parameter_iterable = ParameterGrid(self.param_grid)
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)

        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)


        # out = Parallel(
        #     n_jobs=self.n_jobs, verbose=self.verbose,
        #     pre_dispatch=pre_dispatch
        # )(
        #     delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
        #                             train, test, self.verbose, parameters,
        #                             self.fit_params, return_parameters=True,
        #                             error_score=self.error_score)
        #         for parameters in parameter_iterable
        #         for train, test in cv)

        train_test_parameters = ((train, test, parameters) \
                                 for parameters in parameter_iterable for train, test in cv)

        length = len(parameter_iterable) * len(cv)

        if x_is_index:
            X_to_pass = X
            y_to_pass = None
        else:
            X_to_pass = None
            y_to_pass = None

        self.view.block = False
        # print('sequences')

        # sequences = [
        #     train_test_parameters,
        #     [clone(base_estimator)] * length,
        #     [X_to_pass] * length,
        #     [y_to_pass] * length,
        #     [self.verbose] * length,
        #     [self.fit_params] * length,
        #     [True] * length,
        #     [self.scorer_] * length,
        #     [x_is_index] * length,
        # ]

        f = partial(my_fit_and_score, estimator=clone(base_estimator),
                    X=X_to_pass,
                    y=y_to_pass,
                    verbose=self.verbose,
                    fit_params=self.fit_params,
                    return_parameters=True,
                    scorer=None,
                    x_is_index=x_is_index,
                    names=(X_name, y_name))

        # print('before map')

        # import cProfile
        #
        # pr = cProfile.Profile()
        # pr.enable()
        chunksize = 10

        out = self.view.map(f, itertools.islice(train_test_parameters, 0, length),
                            ordered=False,
                            block=False,
                            chunksize=chunksize)  # length / len(self.view))
        # pr.disable()
        # pr.print_stats('cumulative')
        print('map called')
        if self.callback is not None:
            old_progress = out.progress
            while not out.ready():
                self.callback(out.progress * chunksize, length, out.elapsed)
                if old_progress == out.progress and out.progress > 0:
                    for id, info in self.view.queue_status(verbose=True).iteritems():
                        # print(id, info)
                        if isinstance(info, dict) and 'queue' in info and len(info['queue']) > 0:
                            print(id, info['queue'])

                    pass
                old_progress = out.progress
                sleep(10)
        print('map ready')
        out = out.get()


        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #22
                    Xtr = pipe.fit_transform(Xtr, ytr)
                    Xts = pipe.transform(Xts)
                    # print 'selected features: ', Xtr.shape

                    clf.set_params(**param)
                    clf.fit(Xtr, ytr, eval_metric='auc')

                    ''' prediction '''
                    yhat = clf.predict_proba(Xts)
                    auc = metrics.roc_auc_score(yts, yhat[:, 1])
                    auc_cv[jj] = auc
                    # auc_cv[jj] = auc if auc > 0.5 else 1 - auc

                # print param, auc_cv.mean(), auc_cv.ravel()
                grid_scores.append(_CVScoreTuple(param, auc_cv.mean(), np.array(auc_cv)))

            ''' Inner loop results '''
            print("Inner loop: best parameters set:")
            best = sorted(grid_scores, key=lambda x: x.mean_validation_score,reverse=True)[0]
            best_params_ = best.parameters
            best_score_ = best.mean_validation_score

            print("Best score: %0.3f" % best_score_)
            print("Best parameters set:")
            for param_name in sorted(parameters.keys()):
                print("\t%s: %r" % (param_name, best_params_[param_name]))

            # print 'Grid scores'
            for score in grid_scores:
                # print score
                pass
Example #23
            #     pdf_tr = gaussian_kde(d_yhat_for_dkl_computation[ip].ravel())
            #     pdf_ts = gaussian_kde(yts_unlabelled[:, 1])
            #     x = np.linspace(0, 1, 100)
            #
            #     en = entropy(pdf_tr(x), pdf_ts(x))
            #     kl[ip] = en

            # print auc_single_cv
            # print auc_all_cv
            if verbose:
                print kl
                print auc_single_cv.mean(axis=0)
                print auc_all_cv.mean()

            # print param, auc_cv.mean(), auc_cv.ravel()
            gs_1.append(_CVScoreTuple(param, auc_single_cv[:, 0].mean(), np.array(auc_single_cv[:, 0])))
            gs_2.append(_CVScoreTuple(param, auc_single_cv[:, 1].mean(), np.array(auc_single_cv[:, 1])))
            gs_3.append(_CVScoreTuple(param, auc_single_cv[:, 2].mean(), np.array(auc_single_cv[:, 2])))
            grid_scores_all.append(_CVScoreTuple(param, auc_all_cv.mean(), np.array(auc_all_cv)))
            # dkl.append(kl)

        # print grid_scores
        best = sorted(grid_scores_all, key=lambda x: x.mean_validation_score, reverse=True)[0]
        best_params_ = best.parameters
        best_score_ = best.mean_validation_score

        # grid_search = GridSearchCV(pipe, parameters, verbose=1, n_jobs=2, cv=k_feat)
        # grid_search = GridSearchCV(clf, parameters, verbose=1, cv=cv, scoring='roc_auc')
        # grid_search.fit(Xt, yt)

        # print('Best features:', grid_search.best_estimator_.steps[0][1].k_feature_idx_)
Example #24
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y = indexable(X, y)

        cv = check_cv(cv, X, y, classifier=is_classifier(self.estimator))

        base_estimator = clone(self.estimator)
        out = [_fit_and_score(clone(base_estimator), X, y, self.scorer_, train,
                              test, self.verbose, parameters, self.fit_params,
                              return_parameters=True,
                              error_score=self.error_score)
               for parameters in parameter_iterable
               for train, test in cv]
        self._dask_value = value(out)

        out, = compute(value(out))
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
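Example #24 builds the flat list of per-fold results lazily and materialises it with dask's old imperative API (value/compute). The rough sketch below shows the same deferral with the current dask.delayed API; fit_and_score is a hypothetical stand-in, not the _fit_and_score used above.

import dask
from dask import delayed

def fit_and_score(params, fold):
    # Hypothetical stand-in: pretend to fit on one fold and return its score.
    return {'params': params, 'fold': fold, 'score': 0.5}

candidates = [{'C': 1.0}, {'C': 10.0}]
folds = [0, 1, 2]

# One task per (candidate, fold) pair, in the same order the loops above use,
# so the flat result can be consumed in strides of n_folds.
tasks = [delayed(fit_and_score)(p, f) for p in candidates for f in folds]
out = dask.compute(*tasks)
print(len(out))  # n_candidates * n_folds == 6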
Example #25
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

        self.scorer_ = _deprecate_loss_and_score_funcs(self.loss_func,
                                                       self.score_func,
                                                       self.scoring)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(fit_grid_point)(
                    X, y, base_estimator, parameters, train, test,
                    self.scorer_, self.verbose,
                    sample_weight=balance_weights(y[train]))
                for parameters in parameter_iterable
                for train, test in cv)

        # Out is a list of triplets: (score, parameters, n_test_samples)
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, parameters, this_n_test_samples in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X,
                                   y,
                                   sample_weight=balance_weights(y),
                                   **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
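Example #25 passes balance_weights(y[train]) as a per-sample weight for each fold. Assuming the intent is inverse-class-frequency weighting (balance_weights itself is not defined in this snippet), scikit-learn's public compute_sample_weight produces weights of that kind, as in this short sketch.

import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

y_train = np.array([0, 0, 0, 1])

# 'balanced' weighting gives each sample n_samples / (n_classes * class_count),
# so the lone positive sample gets 2.0 and each negative gets 0.666...
weights = compute_sample_weight(class_weight='balanced', y=y_train)
print(weights)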
Example #26
File: build.py, Project: tgadf/pymva
def grid_search_early_stopping(estimator,
                               param_grid,
                               verbose,
                               scoring,
                               cv,
                               X,
                               y,
                               early_stopping_rounds,
                               eval_set_size,
                               n_jobs=1,
                               iid=True,
                               refit=True,
                               pre_dispatch='2*n_jobs',
                               error_score='raise'):
    '''Grid search with XGBoost-style early stopping on each fold;
    adapted from the scikit-learn package.
    '''

    parameter_iterable = ParameterGrid(param_grid)
    scorer_ = check_scoring(estimator, scoring=scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)' %
                             (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(estimator)

    out = Parallel(
        n_jobs=n_jobs,
        verbose=2 if verbose > 0 else 0,
        pre_dispatch=pre_dispatch)(
            delayed(_fit_and_score)(
                clone(base_estimator), X, y, scorer_, train, test,
                2 if verbose > 0 else 0, parameters,
                {
                    "early_stopping_rounds": early_stopping_rounds,
                    "eval_metric": get_xgboost_eval_metric(scoring),
                    "eval_set": [_safe_split(estimator, X, y, test, train)],
                    "verbose": verbose > 1,
                },
                return_parameters=True,
                error_score=error_score)
            for parameters in parameter_iterable
            for train, test in cv)

    # Out is a list of 4-tuples: (score, n_test_samples, time, parameters)
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores,
                  key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    best_score_ = best.mean_validation_score

    # with refit disabled, no estimator is fitted on the full data set
    best_estimator = None
    if refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)

        if y is not None:
            best_estimator, _, _ = fit_estimator_early_stopping(
                best_estimator, X, y, scoring, early_stopping_rounds,
                eval_set_size, verbose)
        else:
            raise ValueError('y is required.')

    return best_estimator, best.parameters, grid_scores
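The grid search above forwards early_stopping_rounds, an eval_metric and an eval_set to every per-fold fit. A hedged usage sketch of that fit-parameter shape with XGBoost's scikit-learn wrapper follows; the data, split and hyper-parameters are placeholders, and recent xgboost releases expect early_stopping_rounds/eval_metric on the constructor rather than on fit().

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X, y = make_classification(n_samples=500, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2,
                                            stratify=y, random_state=0)

clf = XGBClassifier(n_estimators=1000, max_depth=3, learning_rate=0.1)
# The same keyword names the grid search above packs into its fit_params dict.
clf.fit(X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        early_stopping_rounds=25,
        verbose=False)
print(clf.best_iteration)  # available once early stopping has triggered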
Example #27
    def _fit(self, Z, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        cv = self.cv
        cv = _check_cv(cv, Z)

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch, backend="threading"
        )(
            delayed(_fit_and_score)(clone(base_estimator), Z, self.scorer_,
                                    train, test, self.verbose, parameters,
                                    self.fit_params, return_parameters=True,
                                    error_score=self.error_score)
            for parameters in parameter_iterable
            for train, test in cv)

        # Out is a list of 4-tuples: (score, n_test_samples, time, parameters)
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            best_estimator.fit(Z, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
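Example #27 is notable for running joblib with backend="threading", which lets the workers share the dataset container Z instead of pickling it. A tiny sketch of that joblib pattern, with a placeholder scoring function, is shown here.

from joblib import Parallel, delayed

def score_fold(fold_id):
    # Placeholder for fitting and scoring one CV fold on shared data.
    return 0.1 * fold_id

# backend="threading" avoids pickling shared objects; it only pays off when
# the real work releases the GIL (e.g. native code inside the estimator).
scores = Parallel(n_jobs=4, backend="threading")(
    delayed(score_fold)(i) for i in range(5))
print(scores)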
Example #28
    def _fit(self, X, y, sample_weight, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y, sample_weight = check_arrays(X,
                                           y,
                                           sample_weight,
                                           allow_lists=True,
                                           sparse_format='csr')

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
            y = np.asarray(y)

        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print(
                    "Fitting {0} folds for each of {1} candidates, totalling"
                    " {2} fits".format(len(cv), n_candidates,
                                       n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        # first fit at each grid point using the maximum n_estimators
        param_grid = self.param_grid.copy()
        param_grid['n_estimators'] = [self.max_n_estimators]
        grid = ParameterGrid(param_grid)

        pre_dispatch = self.pre_dispatch

        clfs = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(fit_grid_point)(base_estimator, clf_params,
                                    X, y, sample_weight,
                                    train, test,
                                    self.verbose, **self.fit_params)
            for clf_params in grid
            for train, test in cv)

        # now reuse the already fitted ensembles, truncated to N estimators for
        # N from min_n_estimators to max_n_estimators (inclusive)
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(score_each_boost)(clf, clf_params,
                                      self.min_n_estimators,
                                      X, y, sample_weight,
                                      self.score_func,
                                      train, test,
                                      self.verbose)
            for clf, clf_params, train, test in clfs)

        out = reduce(operator.add, [zip(*stage) for stage in out])
        # out is now a list of triplets: (score, parameters, n_test_samples)

        n_estimators_points = self.max_n_estimators - self.min_n_estimators + 1
        n_fits = len(out)
        n_folds = len(cv)

        grid_scores = list()
        for block in range(0, n_fits, n_folds * n_estimators_points):
            for grid_start in range(block, block + n_estimators_points):
                n_test_samples = 0
                score = 0
                all_scores = list()
                for this_score, parameters, this_n_test_samples in \
                        out[grid_start:
                            grid_start + n_folds * n_estimators_points:
                            n_estimators_points]:
                    all_scores.append(this_score)
                    if self.iid:
                        this_score *= this_n_test_samples
                    score += this_score
                    n_test_samples += this_n_test_samples
                if self.iid:
                    score /= float(n_test_samples)
                else:
                    score /= float(n_folds)
                grid_scores.append(
                    _CVScoreTuple(parameters, score, np.array(all_scores)))

        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            fit_params = self.fit_params
            if sample_weight is not None:
                fit_params = fit_params.copy()
                fit_params['sample_weight'] = sample_weight
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.best_estimator_ = best_estimator
        return self
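Example #28 (and its sample-weighted twin, Example #30, below) fits each grid point once at max_n_estimators and then scores every truncation of the ensemble. A related, self-contained sketch of that one-fit-many-scores idea uses the public AdaBoostClassifier.staged_score API rather than the score_each_boost helper above.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

clf = AdaBoostClassifier(n_estimators=100, random_state=0).fit(X_tr, y_tr)

# One fitted ensemble yields a validation score per boosting stage.
stage_scores = list(clf.staged_score(X_te, y_te))
best_n = int(np.argmax(stage_scores)) + 1
print(best_n, stage_scores[best_n - 1])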
Example #29
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

        self.scorer_ = _deprecate_loss_and_score_funcs(
            self.loss_func, self.score_func, self.scoring)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(fit_grid_point_extended)(
                    X, y, base_estimator, parameters, train, test,
                    self.scorer_, self.verbose, **self.fit_params)
                for parameters in parameter_iterable
                for train, test in cv)
        
#         out = []
#         for parameters in parameter_iterable:
#             fold = 1
#             for train, test in cv:
#                 print "Processing fold", fold, self.fit_params
#                 out.append(fit_grid_point_extended(X, y, base_estimator, parameters, train, test, self.scorer_, self.verbose, **self.fit_params))
#                 fold += 1

        # Out is a list of 4-tuples: (score, parameters, n_test_samples, extras)
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_extras = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            all_extras = list()
            for this_score, parameters, this_n_test_samples, extra in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                all_extras.append(extra)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
            grid_extras.append(all_extras)
        # Store the computed scores
        self.grid_scores_ = grid_scores
        self.extras_ = grid_extras

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
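Every variant in this section walks the flat fit output in strides of n_folds so that one candidate's folds stay together; Example #29 additionally keeps a per-fold "extra" next to each score. The toy sketch below shows just that grouping, with made-up values.

n_folds = 3
# (parameters, score, extra) per fold, laid out candidate by candidate.
out = [('p1', 0.80, 'a'), ('p1', 0.82, 'b'), ('p1', 0.79, 'c'),
       ('p2', 0.85, 'd'), ('p2', 0.83, 'e'), ('p2', 0.86, 'f')]

for grid_start in range(0, len(out), n_folds):
    chunk = out[grid_start:grid_start + n_folds]
    params = chunk[0][0]
    mean_score = sum(score for _, score, _ in chunk) / n_folds
    extras = [extra for _, _, extra in chunk]
    print(params, round(mean_score, 3), extras)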
Example #30
    def _fit(self, X, y, sample_weight, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y, sample_weight = check_arrays(X, y, sample_weight,
                                           allow_lists=True,
                                           sparse_format='csr')

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)

        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        # first fit at each grid point using the maximum n_estimators
        param_grid = self.param_grid.copy()
        param_grid['n_estimators'] = [self.max_n_estimators]
        grid = ParameterGrid(param_grid)

        pre_dispatch = self.pre_dispatch

        clfs = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(fit_grid_point)(base_estimator, clf_params,
                                    X, y, sample_weight,
                                    train, test,
                                    self.verbose, **self.fit_params)
            for clf_params in grid
            for train, test in cv)

        # now reuse the already fitted ensembles, truncated to N estimators for
        # N from min_n_estimators to max_n_estimators (inclusive)
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(score_each_boost)(clf, clf_params,
                                      self.min_n_estimators,
                                      X, y, sample_weight,
                                      self.score_func,
                                      train, test,
                                      self.verbose)
            for clf, clf_params, train, test in clfs)

        out = reduce(operator.add, [zip(*stage) for stage in out])
        # out is now a list of triplets: (score, parameters, n_test_samples)

        n_estimators_points = self.max_n_estimators - self.min_n_estimators + 1
        n_fits = len(out)
        n_folds = len(cv)

        grid_scores = list()
        for block in range(0, n_fits, n_folds * n_estimators_points):
            for grid_start in range(block, block + n_estimators_points):
                n_test_samples = 0
                score = 0
                all_scores = list()
                for this_score, parameters, this_n_test_samples in \
                        out[grid_start:
                            grid_start + n_folds * n_estimators_points:
                            n_estimators_points]:
                    all_scores.append(this_score)
                    if self.iid:
                        this_score *= this_n_test_samples
                    score += this_score
                    n_test_samples += this_n_test_samples
                if self.iid:
                    score /= float(n_test_samples)
                else:
                    score /= float(n_folds)
                grid_scores.append(_CVScoreTuple(
                    parameters,
                    score,
                    np.array(all_scores)))

        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            fit_params = self.fit_params
            if sample_weight is not None:
                fit_params = fit_params.copy()
                fit_params['sample_weight'] = sample_weight
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.best_estimator_ = best_estimator
        return self
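All of these _fit variants share the same iid switch: with iid=True each fold's score is weighted by its number of test samples before averaging, otherwise a plain mean over the folds is taken. A short numeric sketch makes the difference concrete.

import numpy as np

fold_scores = np.array([0.80, 0.90, 0.85])
fold_sizes = np.array([100, 50, 50])

iid_score = np.sum(fold_scores * fold_sizes) / fold_sizes.sum()  # 0.8375
plain_score = fold_scores.mean()                                 # 0.85
print(iid_score, plain_score)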