Example #1
def cross_val_score_fn(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
                       verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
    """
    Evaluate a score by cross-validation.
    This overrides the cross_val_score method typically found in
    cross_validation.py. Changes are clearly marked in comments, but
    the main change is augmenting the function to store Fit and Metric Events
    for each fold.
    """
    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    scorer = check_scoring(estimator, scoring=scoring)

    # Default scoring scheme is 'accuracy' unless provided by user.
    if scoring is None:
        scoring = 'accuracy'
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)

    # Change from original scikit code: adding a new argument, scoring, to the
    # _fit_and_score function to track scoring function and create
    # MetricEvents.
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params, scoring)
                      for train, test in cv)
    return np.array(scores)[:, 0]
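
A minimal usage sketch (hypothetical: it assumes this module and its legacy scikit-learn < 0.18 imports such as check_cv, indexable and _fit_and_score are available):

from sklearn.datasets import load_iris
from sklearn.svm import SVC

iris = load_iris()
# scoring falls back to 'accuracy' inside cross_val_score_fn when omitted
fold_scores = cross_val_score_fn(SVC(), iris.data, iris.target, cv=5)
print(fold_scores.mean())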
Example #2
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        X, y = indexable(X, y)
        cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
        base_estimator = clone(self.estimator)

        best = best_parameters(base_estimator, cv, X, y, parameter_iterable,
                               self.scorer_, self.fit_params, self.iid)
        best = best.compute()

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score


        if isinstance(base_estimator, Pipeline):
            base_estimator = base_estimator.to_sklearn().compute()

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = base_estimator.set_params(**best.parameters)
            if y is not None:
                self.best_estimator_ = best_estimator.fit(X, y, **self.fit_params)
            else:
                self.best_estimator_ = best_estimator.fit(X, **self.fit_params)
        return self
Example #3
def evaluate_estimator(datafile, estimator, task, metric=None, logger=None):
    if metric and metric not in METRIC:
        raise ValueError("Invalid metric")

    def scorer(estimator, X, y):
        if task in REGRESSION_TASKS:
            y_pr = estimator.predict(X)
        elif task in CLASSIFICATION_TASKS:
            y_pr = estimator.predict_proba(X, batch_size=1000)
        else:
            raise NotImplementedError()
        score = _calculate_score(y, y_pr, task, metric)

        return score

    eval_s = time.time()

    data_pkl = joblib.load(datafile, 'r')
    resampling = data_pkl['resampling']
    if resampling == 'holdout':
        X_tr = data_pkl["X"]
        y_tr = data_pkl["y"]
        X_val = data_pkl["valid_X"]
        y_val = data_pkl["valid_y"]
        estimator.fit(X_tr, y_tr)
        score = scorer(estimator, X_val, y_val)
    elif resampling == 'cv':
        X, y = data_pkl["X"], data_pkl["y"]
        cv = cross_validation.check_cv(None,
                                       X,
                                       y,
                                       classifier=(task
                                                   in CLASSIFICATION_TASKS))

        score = defaultdict(list) if metric is None else []
        for train, test in cv:
            X_tr, X_val = X[train], X[test]
            y_tr, y_val = y[train], y[test]
            estimator.fit(X_tr, y_tr)
            score_ = scorer(estimator, X_val, y_val)
            if metric is None:
                for m in score_:
                    score[m].append(score_[m])
            else:
                score.append(score_)
        if metric is None:
            for m in score:
                score[m] = np.mean(score[m])
        else:
            score = np.mean(score)
        estimator.fit(X, y)
    else:
        raise NotImplementedError()

    eval_e = time.time()
    if logger:
        logger.debug("Evaluation done, score: %s | %s sec\n%s" %
                     (score, eval_e - eval_s, estimator))

    return score
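
For orientation, a hypothetical sketch of the pickle layout the 'holdout' branch above expects (key names taken from the code; the task/metric constants belong to the surrounding project):

import joblib
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=0)
# The function reads these exact keys back with joblib.load.
joblib.dump({'resampling': 'holdout',
             'X': X_tr, 'y': y_tr,
             'valid_X': X_val, 'valid_y': y_val}, 'data.pkl')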
Example #4
    def fit(self, X, y):
        if self.n_neighbors_try is None:
            n_neighbors_try = range(1, 6)
        else:
            n_neighbors_try = self.n_neighbors_try

        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)

        cv = check_cv(self.cv, X, y)
        knn = KNeighborsClassifier(metric='precomputed', algorithm='brute')
        scorer = check_scoring(knn, scoring=self.scoring)

        scores = []
        for train_ix, test_ix in cv:
            dist = self._pairwise_wmd(X[test_ix], X[train_ix])
            knn.fit(X[train_ix], y[train_ix])
            scores.append([
                scorer(knn.set_params(n_neighbors=k), dist, y[test_ix])
                for k in n_neighbors_try
            ])
        scores = np.array(scores)
        self.cv_scores_ = scores

        best_k_ix = np.argmax(np.mean(scores, axis=0))
        best_k = n_neighbors_try[best_k_ix]
        self.n_neighbors = self.n_neighbors_ = best_k

        return super(WordMoversKNNCV, self).fit(X, y)
Example #5
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        X, y = indexable(X, y)
        cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
        base_estimator = clone(self.estimator)

        best = best_parameters(base_estimator, cv, X, y, parameter_iterable,
                               self.scorer_, self.fit_params, self.iid)
        best = best.compute()

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if isinstance(base_estimator, Pipeline):
            base_estimator = base_estimator.to_sklearn().compute()

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = base_estimator.set_params(**best.parameters)
            if y is not None:
                self.best_estimator_ = best_estimator.fit(
                    X, y, **self.fit_params)
            else:
                self.best_estimator_ = best_estimator.fit(X, **self.fit_params)
        return self
Example #6
def Bootstrap_cv(estimator1, estimator2, X, y, score_func, cv=None, n_jobs=1,
                 verbose=0, ratio=.5):
    X, y = cross_validation.check_arrays(X, y, sparse_format='csr')
    cv = cross_validation.check_cv(cv, X, y,
                                   classifier=
                                   cross_validation.is_classifier(estimator1))
    if score_func is None:
        if not hasattr(estimator1, 'score') or \
                not hasattr(estimator2, 'score'):
            raise TypeError(
                "If no score_func is specified, the estimator passed "
                "should have a 'score' method. The estimator %s "
                "does not." % estimator1)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    scores = \
        cross_validation.Parallel(
            n_jobs=n_jobs, verbose=verbose)(
                cross_validation.delayed(
                    dual_cross_val_score)
                (cross_validation.clone(estimator1),
                 cross_validation.clone(estimator2),
                 X, y, score_func, train, test, verbose, ratio)
                for train, test in cv)
    return np.array(scores)
Example #7
File: base.py Project: kingjr/jr-tools
    def fit(self, epochs, y=None):
        from sklearn.cross_validation import check_cv, StratifiedKFold
        from mne.decoding.time_gen import _check_epochs_input
        X, y, self.gat.picks_ = _check_epochs_input(epochs, y, self.gat.picks)
        gat_list = list()

        cv = self.cv
        if isinstance(cv, (int, np.int)):
            cv = StratifiedKFold(y, cv)
        cv = check_cv(cv, X, y, classifier=True)
        # Construct meta epoch and fit gat with a single fold
        for ii, (train, test) in enumerate(cv):
            # meta trial
            epochs_ = make_meta_epochs(epochs[train], y[train], n_bin=self.n)
            # fit gat
            gat_ = deepcopy(self.gat)
            cv_one_fold = [(range(len(epochs_)), [])]
            gat_.cv = cv_one_fold
            gat_.fit(epochs_, epochs_.events[:, 2])
            gat_list.append(gat_)

        # gather
        self.gat = gat_
        self.gat.train_times_ = gat_.train_times_
        self.gat.estimators_ = np.squeeze(
            [gat.estimators_ for gat in gat_list]).T.tolist()
        self.gat.cv_ = cv
        self.gat.y_train_ = y
Example #8
    def fit(self, epochs, y=None):
        from sklearn.cross_validation import check_cv, StratifiedKFold
        from mne.decoding.time_gen import _check_epochs_input
        X, y, self.gat.picks_ = _check_epochs_input(epochs, y, self.gat.picks)
        gat_list = list()

        cv = self.cv
        if isinstance(cv, (int, np.int)):
            cv = StratifiedKFold(y, cv)
        cv = check_cv(cv, X, y, classifier=True)
        # Construct meta epoch and fit gat with a single fold
        for ii, (train, test) in enumerate(cv):
            # meta trial
            epochs_ = make_meta_epochs(epochs[train], y[train], n_bin=self.n)
            # fit gat
            gat_ = deepcopy(self.gat)
            cv_one_fold = [(range(len(epochs_)), [])]
            gat_.cv = cv_one_fold
            gat_.fit(epochs_, epochs_.events[:, 2])
            gat_list.append(gat_)

        # gather
        self.gat = gat_
        self.gat.train_times_ = gat_.train_times_
        self.gat.estimators_ = np.squeeze(
            [gat.estimators_ for gat in gat_list]).T.tolist()
        self.gat.cv_ = cv
        self.gat.y_train_ = y
Example #9
def benchmark(clf, X, y, cv=None):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(clf))
    
    # learning_curve_ = learning_curve(clf, X_all, y_all, cv=cv)
    
    train_times = []
    test_times = []
    confusion_matrices = []
    confusion_matrix_indices = []
    coefs = []
    for train, test in cv:
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        
        t0 = time()
        clf.fit(X_train, y_train)
        train_times.append(time()-t0)
        
        t0 = time()
        y_pred = clf.predict(X_test)
        test_times.append(time()-t0)
    
        confusion_matrices.append(confusion_matrix(y_test, y_pred))
        confusion_matrix_indices.append(np.array([[test[pred] for pred in true] for true in confusion_matrix_instances(y_test, y_pred)]))
    
        coefs.append(clf.coef_)
    
    return dict(
        train_times = np.array(train_times),
        test_times = np.array(test_times),
        confusion_matrices = np.array(confusion_matrices),
        confusion_matrix_indices = np.array(confusion_matrix_indices),
        coefs = np.array(coefs)
    )
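
A hedged usage sketch: the classifier must expose coef_, and the legacy check_arrays/check_cv helpers plus the project's confusion_matrix_instances are assumed importable as above.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
results = benchmark(LogisticRegression(), X, y, cv=5)
# Per-fold timings and confusion matrices come back as arrays.
print(results['train_times'].sum(), results['confusion_matrices'].shape)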
Example #10
def cross_val_score(estimator,
                    X,
                    y=None,
                    score_func=None,
                    cv=None,
                    n_jobs=-1,
                    verbose=0,
                    as_dvalues=False):
    """Evaluate a score by cross-validation.

  Replacement of :func:`sklearn.cross_validation.cross_val_score`, used to
  support computation of decision values.

  """
    X, y = check_arrays(X, y, sparse_format='csr')
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    if score_func is None:
        if not hasattr(estimator, 'score'):
            raise TypeError(
                "If no score_func is specified, the estimator passed "
                "should have a 'score' method. The estimator %s "
                "does not." % estimator)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_cross_val_score)(clone(estimator), X, y, score_func, train,
                                  test, verbose, as_dvalues)
        for train, test in cv)
    return np.array(scores)
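
A small sketch of calling this replacement with an explicit score_func (hypothetical: _cross_val_score and the legacy helpers above are assumed importable, and score_func is assumed to take (y_true, y_pred)):

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
scores = cross_val_score(SVC(), X, y, score_func=accuracy_score, cv=5,
                         as_dvalues=False)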
Example #11
    def fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        param_grid = ParameterSampler(self.param_distributions,
                                      self.n_iter,
                                      random_state=self.random_state)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(param_grid, Sized):
                n_candidates = len(param_grid)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid,
                                             len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        scorer = self.scorer_

        indexed_output = dict(
            par_param_grid.map(
                lambda i: local_fit(i[0], i[1], base_estimator, X_bc.value,
                                    y_bc.value, scorer, cv)).collect())
        out = [indexed_output[idx] for idx in range(len(param_grid))]

        X_bc.unpersist()
        y_bc.unpersist()

        best = sorted(out, key=lambda x: x[0], reverse=True)[0]

        self.best_params_ = best[1]
        self.best_score_ = best[0]

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(**best[1])
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #12
    def _grid_search(self, train_X, train_y):
        if callable(self.inner_cv):
            inner_cv = self.inner_cv(train_X, train_y)
        else:
            inner_cv = check_cv(self.inner_cv, train_X, train_y, classifier=is_classifier(self.estimator))

        master = MPIGridSearchCVMaster(self.param_grid, inner_cv, self.estimator, self.scorer_, self.fit_params)
        return master.run(train_X, train_y)
Example #13
def add_del_cv(df, predictors, target, model, scoring='roc_auc', cv1=None,
               n_folds=8, n_jobs=-1, start=[], selmax=None, selmin=1,
               min_ratio=1e-7, max_steps=10, verbosity=0):
    """ Forward-Backward (ADD-DEL) selection using model.

    Parameters
    ----------

    Returns
    -------
    selected: list
        selected predictors

    Example
    -------
    References
    ----------
    """
    def test_to_break(selected, selected_curr, to_break):
        if set(selected) == set(selected_curr):
            to_break += 1
        else:
            to_break = 0
        return to_break

    X, y, _ = df_xyf(df, predictors=predictors, target=target)
    cv1 = cross_validation.check_cv(
            cv1, X=X, y=y,
            classifier=is_classifier(model))

    selected_curr = start
    to_break = 0

    for i_step in xrange(max_steps):
        selected = forward_cv(
                        df, predictors, target, model, scoring=scoring,
                        cv1=cv1, n_folds=n_folds, n_jobs=n_jobs,
                        start=selected_curr, selmax=selmax,
                        min_ratio=min_ratio, verbosity=verbosity-1)
        to_break = test_to_break(selected, selected_curr, to_break)
        selected_curr = selected
        if verbosity > 0:
            print('forward:', ' '.join(selected_curr))
        if to_break > 1:
            break
        selected = backward_cv(
                        df, selected_curr, target, model, scoring=scoring,
                        cv1=cv1, n_folds=n_folds, n_jobs=n_jobs, selmin=selmin,
                        min_ratio=min_ratio, verbosity=verbosity-1)
        to_break = test_to_break(selected, selected_curr, to_break)
        selected_curr = selected
        if verbosity > 0:
            print('backward:', ' '.join(selected_curr))
        if to_break > 0:
            break

    return selected_curr
Example #14
    def score(self, test_parameter):
        """
		The score function to call in order to evaluate the quality 
		of the parameter test_parameter

		Parameters
		----------
		tested_parameter : dict, the parameter to test

		Returns
		-------
		score : the CV score, either the list of all cv results or
			the mean (depending on score_format)
		"""

        if not self._callable_estimator:
            cv = check_cv(self.cv,
                          self.X,
                          self.y,
                          classifier=is_classifier(self.estimator))
            cv_score = [
                _fit_and_score(clone(self.estimator),
                               self.X,
                               self.y,
                               self.scorer_,
                               train,
                               test,
                               False,
                               test_parameter,
                               self.fit_params,
                               return_parameters=True) for train, test in cv
            ]

            n_test_samples = 0
            mean_score = 0
            detailed_score = []
            for tmp_score, tmp_n_test_samples, _, _ in cv_score:
                detailed_score.append(tmp_score)
                tmp_score *= tmp_n_test_samples
                n_test_samples += tmp_n_test_samples
                mean_score += tmp_score
            mean_score /= float(n_test_samples)

            if (self.score_format == 'avg'):
                score = mean_score
            else:  # format == 'cv'
                score = detailed_score

        else:
            if (self.score_format == 'avg'):
                score = [self.estimator(test_parameter)]
            else:  # format == 'cv'
                score = self.estimator(test_parameter)

        return score
Example #15
def dynamic_cross_val_predict(estimator, fv, esa_feature_list, unigram_feature_list, dynamic_X, y=None, cv=None,
                              verbose=0, fit_params=None):


    print("dynamic predict cross val with %s" % (esa_feature_list + unigram_feature_list))


    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X = tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)
    cv = cross_validation.check_cv(cv, X, y, classifier=cross_validation.is_classifier(estimator))

    preds_blocks = []

    cross_val_step = 0
    for train, test in cv:

        fv_copy = copy.deepcopy(fv)

        # rebuild X from scratch at every step
        for i in range(0, len(fv)):  # each i corresponds to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points at esa_vec
            for feature in esa_feature_list:
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])  # update the i-th feature dict with the ESA feature
            for feature in unigram_feature_list:
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])  # update the i-th feature dict with the unigram feature


        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        preds_blocks.append(cross_validation._fit_and_predict(cross_validation.clone(estimator), X, y,
                                                      train, test, verbose,
                                                      fit_params))

        cross_val_step+=1

    preds = [p for p, _ in preds_blocks]
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not cross_validation._check_is_partition(locs, cross_validation._num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')
    inv_locs = np.empty(len(locs), dtype=int)
    inv_locs[locs] = np.arange(len(locs))

    # Check for sparse predictions
    if sp.issparse(preds[0]):
        preds = sp.vstack(preds, format=preds[0].format)
    else:
        preds = np.concatenate(preds)
    return preds[inv_locs]
Example #16
    def fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""

        parameter_iterable = ParameterSampler(self.param_distributions,
                                              self.n_iter,
                                              random_state=self.random_state)
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring,
                                      parameters, cv=cv)
            for parameters in parameter_iterable)

        best = sorted(out, reverse=True)[0]
        self.best_params_ = best[1]
        self.best_score_ = best[0]

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best[1])
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
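
The sampling step in isolation, shown with the modern sklearn.model_selection import path (the distributions here are made up for illustration):

from scipy.stats import uniform
from sklearn.model_selection import ParameterSampler

sampler = ParameterSampler({'C': uniform(0.1, 10.0), 'kernel': ['linear', 'rbf']},
                           n_iter=4, random_state=0)
for params in sampler:
    print(params)  # one candidate setting per dispatched fit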
Example #17
    def fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""

        parameter_iterable = ParameterSampler(self.param_distributions,
                                              self.n_iter,
                                              random_state=self.random_state)
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring,
                                      parameters, cv=cv)
            for parameters in parameter_iterable)

        best = sorted(out, reverse=True)[0]
        self.best_params_ = best[1]
        self.best_score_ = best[0]

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best[1])
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
Example #18
    def fit(self, X, y=None, sample_weight=None, exposure=None):
        # For later
        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        pre_dispatch=self.pre_dispatch,
                        max_nbytes=None)
        
        # Extract arguments
        fit_args = self._process_args(X=X, y=y, sample_weight=sample_weight,
                                      exposure=exposure)
        
        # Sort out cv parameters
        if self.cv == 1:
            cv = no_cv(X=X, y=y)
        else:
            if hasattr(self.cv, 'split'):
                cv_args = dict(X=X)
                if y is not None:
                    cv_args['y'] = np.ravel(y)
                cv = self.cv.split(**cv_args)
            else:
                cv_args = dict(X=X)
                if y is not None:
                    cv_args['y'] = shrinkd(1,np.asarray(y))
                cv = check_cv(self.cv, classifier=is_classifier(self.estimator), **cv_args)
                
        # Do the cross validation fits
#         print(valmap(lambda x: x.shape, fit_args))
#         print('num_folds = %d' % self.cv.get_n_splits(X=X))
        cv_fits = parallel(delayed(_fit_and_predict)(clone(self.estimator), fit_args, train, test, self.verbose) for train, test in cv)
        
        # Combine predictions from cv fits
        prediction = np.empty_like(y) if y is not None else np.empty(shape=X.shape[0])
        for fit in cv_fits:
            safe_assign_subset(prediction, fit[2], fit[1])
        
        # Store cross validation models
        self.cv_estimators_ = [fit[0] for fit in cv_fits]
        self.cv_indices_ = [fit[2] for fit in cv_fits]
        self.cv_predictions_ = prediction
        
        # If a metric was provided, compute the score
        if self.metric is not None:
            metric_args = {}
            if 'sample_weight' in fit_args:
                metric_args['sample_weight'] = fit_args['sample_weight']
            if 'exposure' in fit_args:
                metric_args['exposure'] = fit_args['exposure']
            self.score_ = safer_call(self.metric, y, self.cv_predictions_, **metric_args)
        
        # Fit on entire data set
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(**fit_args)
        return self
Example #19
def test_searchlight():
    # Create a toy dataset to run searchlight on

    # Initialize with 4x4x4 scans of random values on 30 frames
    rand = np.random.RandomState(0)
    frames = 30
    data = rand.rand(5, 5, 5, frames)
    mask = np.ones((5, 5, 5), np.bool)
    mask_img = nibabel.Nifti1Image(mask.astype(np.int), np.eye(4))
    # Create a condition array
    cond = np.arange(frames, dtype=int) > frames // 2

    # Create an activation pixel.
    data[2, 2, 2, :] = 0
    data[2, 2, 2][cond.astype(np.bool)] = 2
    data_img = nibabel.Nifti1Image(data, np.eye(4))


    # Define cross validation
    from sklearn.cross_validation import check_cv
    # avoid using KFold for compatibility with sklearn 0.10-0.13
    cv = check_cv(4, cond)
    n_jobs = 1

    # Run Searchlight with different radii
    # Small radius : only one pixel is selected
    sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img,
                                 radius=0.5, n_jobs=n_jobs,
                                 scoring='accuracy', cv=cv)
    sl.fit(data_img, cond)
    assert_equal(np.where(sl.scores_ == 1)[0].size, 1)
    assert_equal(sl.scores_[2, 2, 2], 1.)

    # Medium radius : little ball selected

    sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img, radius=1,
                                 n_jobs=n_jobs, scoring='accuracy', cv=cv)
    sl.fit(data_img, cond)
    assert_equal(np.where(sl.scores_ == 1)[0].size, 7)
    assert_equal(sl.scores_[2, 2, 2], 1.)
    assert_equal(sl.scores_[1, 2, 2], 1.)
    assert_equal(sl.scores_[2, 1, 2], 1.)
    assert_equal(sl.scores_[2, 2, 1], 1.)
    assert_equal(sl.scores_[3, 2, 2], 1.)
    assert_equal(sl.scores_[2, 3, 2], 1.)
    assert_equal(sl.scores_[2, 2, 3], 1.)

    # Big radius : big ball selected
    sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img, radius=2,
                                 n_jobs=n_jobs, scoring='accuracy', cv=cv)
    sl.fit(data_img, cond)
    assert_equal(np.where(sl.scores_ == 1)[0].size, 33)
    assert_equal(sl.scores_[2, 2, 2], 1.)
Example #20
def test_check_cv_return_types():
    X = np.ones((9, 2))
    cv = cval.check_cv(3, X, classifier=False)
    assert_true(isinstance(cv, cval.KFold))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = cval.check_cv(3, X, y_binary, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = cval.check_cv(3, X, y_multiclass, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    X = np.ones((5, 2))
    y_multilabel = [[1, 0, 1], [1, 1, 0], [0, 0, 0], [0, 1, 1], [1, 0, 0]]
    cv = cval.check_cv(3, X, y_multilabel, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = cval.check_cv(3, X, y_multioutput, classifier=True)
    assert_true(isinstance(cv, cval.KFold))
Example #21
def test_searchlight():
    # Create a toy dataset to run searchlight on

    # Initialize with 4x4x4 scans of random values on 30 frames
    rand = np.random.RandomState(0)
    frames = 30
    data = rand.rand(5, 5, 5, frames)
    mask = np.ones((5, 5, 5), np.bool)
    mask_img = nibabel.Nifti1Image(mask.astype(np.int), np.eye(4))
    # Create a condition array
    cond = np.arange(frames, dtype=int) > frames / 2

    # Create an activation pixel.
    data[2, 2, 2, :] = 0
    data[2, 2, 2][cond.astype(np.bool)] = 2
    data_img = nibabel.Nifti1Image(data, np.eye(4))


    # Define cross validation
    from sklearn.cross_validation import check_cv
    # avoid using KFold for compatibility with sklearn 0.10-0.13
    cv = check_cv(4, cond)
    n_jobs = 1

    # Run Searchlight with different radii
    # Small radius : only one pixel is selected
    sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img,
                                 radius=0.5, n_jobs=n_jobs,
                                 scoring='accuracy', cv=cv)
    sl.fit(data_img, cond)
    assert_equal(np.where(sl.scores_ == 1)[0].size, 1)
    assert_equal(sl.scores_[2, 2, 2], 1.)

    # Medium radius : little ball selected

    sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img, radius=1,
                                 n_jobs=n_jobs, scoring='accuracy', cv=cv)
    sl.fit(data_img, cond)
    assert_equal(np.where(sl.scores_ == 1)[0].size, 7)
    assert_equal(sl.scores_[2, 2, 2], 1.)
    assert_equal(sl.scores_[1, 2, 2], 1.)
    assert_equal(sl.scores_[2, 1, 2], 1.)
    assert_equal(sl.scores_[2, 2, 1], 1.)
    assert_equal(sl.scores_[3, 2, 2], 1.)
    assert_equal(sl.scores_[2, 3, 2], 1.)
    assert_equal(sl.scores_[2, 2, 3], 1.)

    # Big radius : big ball selected
    sl = searchlight.SearchLight(mask_img, process_mask_img=mask_img, radius=2,
                                 n_jobs=n_jobs, scoring='accuracy', cv=cv)
    sl.fit(data_img, cond)
    assert_equal(np.where(sl.scores_ == 1)[0].size, 33)
    assert_equal(sl.scores_[2, 2, 2], 1.)
Example #22
def test_check_cv_return_types():
    X = np.ones((9, 2))
    cv = cval.check_cv(3, X, classifier=False)
    assert_true(isinstance(cv, cval.KFold))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = cval.check_cv(3, X, y_binary, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = cval.check_cv(3, X, y_multiclass, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    X = np.ones((5, 2))
    y_multilabel = [[1, 0, 1], [1, 1, 0], [0, 0, 0], [0, 1, 1], [1, 0, 0]]
    cv = cval.check_cv(3, X, y_multilabel, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = cval.check_cv(3, X, y_multioutput, classifier=True)
    assert_true(isinstance(cv, cval.KFold))
Example #23
def _score_lambda_path(est, X, y, sample_weight, cv, scoring, classifier,
                       n_jobs, verbose):
    """Score each model found by glmnet using cross validation.

    Parameters
    ----------
    est : estimator
        The previously fitted estimator.

    X : array, shape (n_samples, n_features)
        Input features

    y : array, shape (n_samples,)
        Target values.

    sample_weight : array, shape (n_samples,)
        Weight of each row in X.

    n_folds : int
        Number of folds for cross validation, must be at least 3.

    scoring : string, callable or None
        Scoring method to apply to each model.

    n_jobs: int
        Maximum number of threads to use for scoring models.

    verbose : bool
        Emit logging data and warnings when True.

    Returns
    -------
    scores : array, shape (n_lambda,)
        Scores for each value of lambda over all cv folds.
    """
    scorer = check_scoring(est, scoring)
    cv = check_cv(cv, X, y, classifier)

    # We score the model for every value of lambda, for classification
    # models, this will be an intercept-only model, meaning it predicts
    # the same class regardless of the input. Obviously, this makes some of
    # the scikit-learn metrics unhappy, so we are silencing these warnings.
    # Also note, catch_warnings is not thread safe.
    with warnings.catch_warnings():
        action = 'always' if verbose else 'ignore'
        warnings.simplefilter(action, UndefinedMetricWarning)

        scores = Parallel(n_jobs=n_jobs, verbose=verbose, backend='threading')(
            delayed(_fit_and_score)(est, scorer, X, y, sample_weight,
                                    est.lambda_path_, train_idx, test_idx)
            for (train_idx, test_idx) in cv)

    return scores
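
The warning-silencing pattern above in isolation (modern import path for UndefinedMetricWarning; labels and metric are arbitrary):

import warnings
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import f1_score

with warnings.catch_warnings():
    warnings.simplefilter('ignore', UndefinedMetricWarning)
    # An intercept-only model predicts one class everywhere; F1 would
    # normally warn about the absent positive predictions.
    print(f1_score([0, 1, 1], [0, 0, 0]))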
Example #24
def test_check_cv_return_types():
    X = np.ones((9, 2))
    cv = cval.check_cv(3, X, classifier=False)
    assert_true(isinstance(cv, cval.KFold))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = cval.check_cv(3, X, y_binary, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = cval.check_cv(3, X, y_multiclass, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    X = np.ones((5, 2))
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]

    with warnings.catch_warnings(record=True):
        # deprecated sequence of sequence format
        cv = cval.check_cv(3, X, y_seq_of_seqs, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_indicator_matrix = LabelBinarizer().fit_transform(y_seq_of_seqs)
    cv = cval.check_cv(3, X, y_indicator_matrix, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = cval.check_cv(3, X, y_multioutput, classifier=True)
    assert_true(isinstance(cv, cval.KFold))
Example #25
def test_check_cv_return_types():
    X = np.ones((9, 2))
    cv = cval.check_cv(3, X, classifier=False)
    assert_true(isinstance(cv, cval.KFold))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = cval.check_cv(3, X, y_binary, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = cval.check_cv(3, X, y_multiclass, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    X = np.ones((5, 2))
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]

    with warnings.catch_warnings(record=True):
        # deprecated sequence of sequence format
        cv = cval.check_cv(3, X, y_seq_of_seqs, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_indicator_matrix = LabelBinarizer().fit_transform(y_seq_of_seqs)
    cv = cval.check_cv(3, X, y_indicator_matrix, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = cval.check_cv(3, X, y_multioutput, classifier=True)
    assert_true(isinstance(cv, cval.KFold))
Example #26
def my_cross_val_predict(estimator,
                         X,
                         y=None,
                         groups=None,
                         cv=None,
                         n_jobs=1,
                         verbose=0,
                         fit_params=None,
                         pre_dispatch='2*n_jobs',
                         method='predict'):
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # Ensure the estimator has implemented the passed decision function
    if not callable(getattr(estimator, method)):
        raise AttributeError('{} not implemented in estimator'.format(method))

    if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
        le = LabelEncoder()
        y = le.fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(
        delayed(_my_fit_and_predict)(clone(estimator), X, y, train, test,
                                     verbose, fit_params, method)
        for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks]
    test_indices = np.concatenate(
        [indices_i for _, indices_i, _ in prediction_blocks])
    scores = np.concatenate([score_i for _, _, score_i in prediction_blocks])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)
    out_predictions = predictions[inv_test_indices]
    return out_predictions.reshape(y.shape), scores
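
A hedged usage sketch, assuming new-style CV splitters (scikit-learn >= 0.18, since cv.split(X, y, groups) is called) and that the project's _my_fit_and_predict helper is available:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X, y = load_iris(return_X_y=True)
preds, scores = my_cross_val_predict(LogisticRegression(), X, y,
                                     cv=StratifiedKFold(5), method='predict')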
Example #27
    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Args:
            k: The number of folds for cross validation.
        """
        self.scores = []

        X, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(k,
                                       self.feature_vector,
                                       self.classification_vector,
                                       classifier=True)

        for train, test in cv:
            self.classifier1.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            self.classifier2.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            self.classifier3.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            classification1 = self.classifier1.predict(
                self.feature_vector[test])
            classification2 = self.classifier2.predict(
                self.feature_vector[test])
            classification3 = self.classifier3.predict(
                self.feature_vector[test])

            classification = []
            for predictions in zip(classification1, classification2,
                                   classification3):
                neutral_count = predictions.count(0)
                positive_count = predictions.count(1)
                negative_count = predictions.count(-1)
                if (neutral_count == negative_count
                        and negative_count == positive_count):
                    classification.append(predictions[0])
                elif (neutral_count > positive_count
                      and neutral_count > negative_count):
                    classification.append(0)
                elif (positive_count > neutral_count
                      and positive_count > negative_count):
                    classification.append(1)
                elif (negative_count > neutral_count
                      and negative_count > positive_count):
                    classification.append(-1)
            classification = numpy.array(classification)

            self.scores.append(self.score_func(y[test], classification))
Example #28
    def fit(self, X, y=None, sample_weight=None, exposure=None):
        if self.cv == 1:
            cv = no_cv(X=X, y=y)
        else:
            if hasattr(self.cv, 'split'):
                cv = self.cv.split(X, y)
            else:
                cv = check_cv(self.cv, X=X, y=y, classifier=is_classifier(self.calibrator))
        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        pre_dispatch=self.pre_dispatch)
        
        # Fit the estimator on each train set and predict with it on each test set
        fit_args = {'X': X}
        if y is not None:
            fit_args['y'] = y
        if self.est_weight and sample_weight is not None:
            fit_args['sample_weight'] = sample_weight
        if self.est_exposure and exposure is not None:
            fit_args['exposure'] = exposure
        
        # Do the cross validation fits
        cv_fits = parallel(delayed(_fit_and_predict)(clone(self.estimator), fit_args, train, test) for train, test in cv)
        
        # Combine predictions from cv fits
        prediction = np.empty_like(y)
        for fit in cv_fits:
            safe_assign_subset(prediction, fit[2], fit[1])
        
#         fit_predict_results = parallel(delayed(_fit_and_predict)(estimator=clone(self.estimator),
#                                        train=train, test=test, **fit_args) for train, test in cv)
#         
#         # Combine the predictions
#         prediction = np.empty_like(y)
#         for _, pred, _, test in zip(fit_predict_results, cv):
            
#         prediction = np.concatenate([pred for _, pred in cv_fits], axis=0)
        
        # Fit the calibrator on the predictions
        cal_args = {'X': prediction[:, None] if len(prediction.shape) == 1 else prediction, 
                    'y': y}
        if self.cal_weight and sample_weight is not None:
            cal_args['sample_weight'] = sample_weight
        if self.cal_exposure and exposure is not None:
            cal_args['exposure'] = exposure
        self.calibrator_ = clone(self.calibrator).fit(**cal_args)
        
        # Fit the estimator on the entire data set
        self.estimator_ = clone(self.estimator).fit(**fit_args)
        
        return self
Example #29
    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Args:
            k: The number of folds for cross validation.
        """
        self.scores = []

        X, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(
            k, self.feature_vector, self.classification_vector,
            classifier=True)

        for train, test in cv:
            self.classifier1.fit(self.feature_vector[train],
                          self.classification_vector[train])
            self.classifier2.fit(self.feature_vector[train],
                          self.classification_vector[train])
            self.classifier3.fit(self.feature_vector[train],
                          self.classification_vector[train])
            classification1 = self.classifier1.predict(
                self.feature_vector[test])
            classification2 = self.classifier2.predict(
                self.feature_vector[test])
            classification3 = self.classifier3.predict(
                self.feature_vector[test])

            classification = []
            for predictions in zip(classification1, classification2,
                                   classification3):
                neutral_count = predictions.count(0)
                positive_count = predictions.count(1)
                negative_count = predictions.count(-1)
                if (neutral_count == negative_count and
                    negative_count == positive_count):
                    classification.append(predictions[0])
                elif (neutral_count > positive_count and
                    neutral_count > negative_count):
                    classification.append(0)
                elif (positive_count > neutral_count and
                    positive_count > negative_count):
                    classification.append(1)
                elif (negative_count > neutral_count and
                    negative_count > positive_count):
                    classification.append(-1)
            classification = numpy.array(classification)

            self.scores.append(self.score_func(y[test], classification))
Example #30
    def fit(self, X, y):
        if master:
            LOG.info("comm_size:" + str(comm_size))
        X, y = check_X_y(X, y, force_all_finite=False, multi_output=self.multi_output, accept_sparse='csr')
        _check_param_grid(self.param_grid)

        cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
        if master:
            LOG.info("cv length:" + str(len(cv)))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if master:
            self._fit_master(X, y, cv)
        else:
            self._fit_slave()
        return self
Example #31
	def score(self,test_parameter):
		"""
		The score function to call in order to evaluate the quality 
		of the parameter test_parameter

		Parameters
		----------
		`tested_parameter` : dict, the parameter to test

		Returns
		-------
		`score` : the CV score, either the list of all cv results or
			the mean (depending on score_format)
		"""

		if not self._callable_estimator:
	 		cv = check_cv(self.cv, self.X, self.y, classifier=is_classifier(self.estimator))
	 		cv_score = [ _fit_and_score(clone(self.estimator), self.X, self.y, self.scorer_,
							train, test, False, test_parameter,
							self.fit_params, return_parameters=True)
						for train, test in cv ]

			n_test_samples = 0
			mean_score = 0
			detailed_score = []
			for tmp_score, tmp_n_test_samples, _, _ in cv_score:
				detailed_score.append(tmp_score)
				tmp_score *= tmp_n_test_samples
				n_test_samples += tmp_n_test_samples
				mean_score += tmp_score
			mean_score /= float(n_test_samples)

			if(self.score_format == 'avg'):
				score = mean_score
			else: # format == 'cv'
				score = detailed_score


		else:
			if(self.score_format == 'avg'):
				score = [self.estimator(test_parameter)]
			else: # format == 'cv'
				score = self.estimator(test_parameter)

		return score
Example #32
    def fit(self, X, y):
        """Fit KNN model by choosing the best `n_neighbors`.

        Parameters
        -----------
        X : scipy.sparse matrix, (n_samples, vocab_size)
            Data
        y : ndarray, shape (n_samples,) or (n_samples, n_targets)
            Target
        """
        if self.n_neighbors is None:
            n_neighbors = 2
        else:
            n_neighbors = self.n_neighbors

        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)

        cv = check_cv(self.cv, X, y)
        # knn = KNeighborsClassifier(metric='precomputed', algorithm='brute')
        knn = KNeighborsClassifier(n_jobs=self.n_jobs)
        scorer = check_scoring(knn, scoring=self.scoring)

        scores = []
        cycle = 1
        for train_ix, test_ix in cv:
            dist = self._pairwise_wmd(X[test_ix], X[train_ix])
            knn.fit(X[train_ix], y[train_ix])
            # scores.append([
            #                   scorer(knn.set_params(n_neighbors=k), dist, y[test_ix])
            #                   for k in n_neighbors_try
            #                   ])
            scores.append(
                scorer(knn.set_params(n_neighbors=n_neighbors), dist,
                       y[test_ix]))
            logger.info("%i/%i folds done!", cycle, self.cv)
            cycle += 1
        scores = np.array(scores)
        self.cv_scores_ = scores

        # best_k_ix = np.argmax(np.mean(scores, axis=0))
        # best_k = n_neighbors_try[best_k_ix]
        # self.n_neighbors = self.n_neighbors_ = best_k

        return super(WordMoversKNNCV, self).fit(X, y)
Example #33
File: stacking.py Project: luoq/datatrek
def cross_val_predict(estimator, X, y, cv=5, n_jobs=1, refit=False, predict_fun="predict"):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    pred = Parallel(n_jobs=n_jobs)(
        delayed(_cross_val_predict)(
            clone(estimator), X, y, train, test, predict_fun)
        for train, test in cv)
    pred = np.concatenate(pred)
    if cv.indices:
        index = np.concatenate([test for _, test in cv])
    else:
        index = np.concatenate([np.where(test)[0] for _, test in cv])
    ## pred[index] = pred doesn't work as expected
    pred[index] = pred.copy()
    if refit:
        return pred, clone(estimator).fit(X,y)
    else:
        return pred
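
A small sketch of the intended stacking use: out-of-fold predictions become an extra feature for a second-level model (assumes the legacy helpers imported above; the estimators are placeholders):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
oof = cross_val_predict(DecisionTreeClassifier(), X, y, cv=5)
# Stack the out-of-fold predictions next to the raw features.
stacked = np.column_stack([X, oof])
meta = LogisticRegression().fit(stacked, y)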
Example #34
def dynamic_cross_val_score(estimator, fv, esa_feature_list, unigram_feature_list, dynamic_X, y=None, scoring=None, cv=None,
                verbose=0, fit_params=None):

    print("dynamic cross val with %s" % (esa_feature_list + unigram_feature_list))
    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X= tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)

    cv = cross_validation.check_cv(cv, X, y, classifier=cross_validation.is_classifier(estimator))
    scorer = cross_validation.check_scoring(estimator, scoring=scoring)
    scores = []

    cross_val_step = 0
    for train, test in cv:

        fv_copy = copy.deepcopy(fv)

        # rebuild X from scratch at every step
        for i in range(0, len(fv)):  # each i corresponds to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points at esa_vec
            for feature in esa_feature_list:
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])  # update the i-th feature dict with the ESA feature
            for feature in unigram_feature_list:
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])  # update the i-th feature dict with the unigram feature



        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        scores.append(cross_validation._fit_and_score(cross_validation.clone(estimator), X, y, scorer,
                        train, test, verbose, None, fit_params))

        cross_val_step += 1


    return np.array(scores)[:, 0]
Example #35
File: ifs.py Project: teopir/ifqi
def my_cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
                         verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
                         method='predict'):
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # Ensure the estimator has implemented the passed decision function
    if not callable(getattr(estimator, method)):
        raise AttributeError('{} not implemented in estimator'
                             .format(method))

    if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
        le = LabelEncoder()
        y = le.fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_my_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method)
                                 for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks]
    test_indices = np.concatenate([indices_i
                                   for _, indices_i, _ in prediction_blocks])
    scores = np.concatenate([score_i for _, _, score_i in prediction_blocks])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)
    return predictions[inv_test_indices], scores
Example #36
	def fit(self,X,Y):
		if not self.best_subset:
			self.fshape = np.shape(X)[1]
			self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

			self.cv = check_cv(self.cv, X, Y, classifier=is_classifier(self.estimator))

			self.best_subset = tuple()
			self.best_subset_score = 0
			self.scores_ = {self.best_subset:self.best_subset_score}
			X = np.array(X)
			Y = np.array(Y)


			try:
				self.get_best_subset(X,Y)
			except KeyboardInterrupt:
				pass
		self.estimator = self.estimator.fit(X[:,self.best_subset],Y)
		return self
Example #37
def cross_val_score_filter_feature_selection(model, filter_function, filter_criteria, X, y, scoring=None, cv=None, n_jobs=1,
                    verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):

    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(model))
    scorer = check_scoring(model, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)

    #
    scores = parallel(delayed(_fit_and_score)(clone(model), filter_function(X,y,train,filter_criteria), y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in cv)

    return np.array(scores)[:, 0]
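
A hypothetical filter_function matching the call signature used above: it receives (X, y, train, filter_criteria), should learn its filter from the training rows only, and returns X restricted to the selected columns.

import numpy as np
from sklearn.feature_selection import VarianceThreshold

def variance_filter(X, y, train, threshold):
    # Fit the column filter on the training fold only, then apply it to every row.
    selector = VarianceThreshold(threshold).fit(X[train])
    return X[:, selector.get_support()]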
Example #38
def check_cv(cv, X=None, y=None, classifier=False):
    """Input checker utility for building a CV in a user friendly way.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        For integer/None inputs, if classifier is True, ``X`` and ``y`` are not
        dask collections, and ``y`` is binary or multiclass,
        ``StratifiedKFold`` is used. In all other cases, ``KFold`` is used.

    X : array-like
        The data the cross-val object will be applied on.

    y : array-like
        The target variable for a supervised learning problem.

    classifier : boolean, optional
        Whether the task is a classification task.
    """
    if is_dask_collection(X) or is_dask_collection(y):
        if cv is None:
            return KFold(n_folds=3)
        elif isinstance(cv, Integral):
            return KFold(n_folds=cv)
        elif not isinstance(cv, DaskBaseCV):
            raise TypeError("Unexpected cv type {0}".format(type(cv).__name__))
        else:
            return cv
    if isinstance(cv, DaskBaseCV) and not isinstance(cv, _DaskCVWrapper):
        raise ValueError("Can't use dask cv object with non-dask X and y")
    cv = cross_validation.check_cv(cv, X=X, y=y, classifier=classifier)
    return _DaskCVWrapper(cv)
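
For the non-dask branch the wrapper simply delegates to scikit-learn's own checker. A small sketch of the behaviour the docstring describes, using the modern sklearn.model_selection.check_cv (which follows the same integer/None rules):

import numpy as np
from sklearn.model_selection import check_cv as sk_check_cv

y_class = np.array([0, 1, 0, 1, 0, 1])            # binary target
y_reg = np.array([0.1, 0.4, 0.2, 0.9, 0.5, 0.7])  # continuous target

print(type(sk_check_cv(3, y_class, classifier=True)).__name__)   # StratifiedKFold
print(type(sk_check_cv(3, y_reg, classifier=False)).__name__)    # KFold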
Example #39
0
def cross_val_predict(estimator,
                      X,
                      y,
                      cv=5,
                      n_jobs=1,
                      refit=False,
                      predict_fun="predict"):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    pred = Parallel(n_jobs=n_jobs)(delayed(_cross_val_predict)(clone(
        estimator), X, y, train, test, predict_fun) for train, test in cv)
    pred = np.concatenate(pred)
    if cv.indices:
        index = np.concatenate([test for _, test in cv])
    else:
        index = np.concatenate([np.where(test)[0] for _, test in cv])
    ## pred[index] = pred doesn't work as expected
    pred[index] = pred.copy()
    if refit:
        return pred, clone(estimator).fit(X, y)
    else:
        return pred
Example #40
0
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)

        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
        if not self.dataset_filenames:
            self.save_dataset_filename(X, y, cv)

        dataset_filenames = self.dataset_filenames

        client = Client()
        lb_view = client.load_balanced_view()

        if self.verbose > 0:
            print("Number of CPU core %d" % len(client.ids()))

        self.tasks = [([lb_view.apply(evaluate, estimator, dataset_filename, params)
                        for dataset_filename in dataset_filenames], params)
                            for params in parameter_iterable]
        if self.sync:
            self.wait()
            self.set_grid_scores()
            self.set_best_score_params()

            if self.refit:
                self.set_best_estimator(estimator)
        return self
Example #41
0
    def fit(self, X, y):
        """Fit KNN model by choosing the best `n_neighbors`.

        Parameters
        ----------
        X : scipy.sparse matrix, (n_samples, vocab_size)
            Data
        y : ndarray, shape (n_samples,) or (n_samples, n_targets)
            Target
        """
        if self.n_neighbors_try is None:
            n_neighbors_try = range(1, 6)
        else:
            n_neighbors_try = self.n_neighbors_try

        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)

        cv = check_cv(self.cv, X, y)
        knn = KNeighborsClassifier(metric='precomputed', algorithm='brute')
        scorer = check_scoring(knn, scoring=self.scoring)

        scores = []
        for train_ix, test_ix in cv:
            dist = self._pairwise_wmd(X[test_ix], X[train_ix])
            knn.fit(X[train_ix], y[train_ix])
            scores.append([
                              scorer(knn.set_params(n_neighbors=k), dist, y[test_ix])
                              for k in n_neighbors_try
                              ])
        scores = np.array(scores)
        self.cv_scores_ = scores

        best_k_ix = np.argmax(np.mean(scores, axis=0))
        best_k = n_neighbors_try[best_k_ix]
        self.n_neighbors = self.n_neighbors_ = best_k

        return super(WordMoversKNNCV, self).fit(X, y)
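
The selection step above reduces to an argmax over fold-averaged scores. A toy sketch with hypothetical numbers:

import numpy as np

n_neighbors_try = range(1, 6)
scores = np.array([[0.70, 0.74, 0.73, 0.71, 0.69],   # fold 1, one score per candidate k
                   [0.68, 0.75, 0.72, 0.70, 0.66]])  # fold 2

best_k_ix = np.argmax(scores.mean(axis=0))
best_k = list(n_neighbors_try)[best_k_ix]
print(best_k)   # -> 2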
Example #42
0
def cross_val_score(estimator, X, y=None, score_func=None, cv=None, n_jobs=-1,
    verbose=0, as_dvalues=False):
  """Evaluate a score by cross-validation.

  Replacement of :func:`sklearn.cross_validation.cross_val_score`, used to
  support computation of decision values.

  """
  X, y = check_arrays(X, y, sparse_format='csr')
  cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
  if score_func is None:
      if not hasattr(estimator, 'score'):
          raise TypeError(
              "If no score_func is specified, the estimator passed "
              "should have a 'score' method. The estimator %s "
              "does not." % estimator)
  # We clone the estimator to make sure that all the folds are
  # independent, and that it is pickle-able.
  scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
      delayed(_cross_val_score)(clone(estimator), X, y, score_func, train, test,
          verbose, as_dvalues)
      for train, test in cv)
  return np.array(scores)
Example #43
0
    def fit(self, X, y=None, sample_weight=None):
        """Run fit with all sets of parameters

        Returns the best classifier

        Parameters
        ----------

        X: array, [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y: array-like, shape = [n_samples], optional
            Target vector relative to X for classification;
            None for unsupervised learning.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights
        """
        estimator = self.estimator
        cv = self.cv

        if hasattr(X, 'shape'):
            n_samples = X.shape[0]
        else:
            # support list of unstructured objects on which feature
            # extraction will be applied later in the transformer chain
            n_samples = len(X)
        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
        base_clf = clone(self.estimator)

        # first fit at each grid point using the maximum n_estimators
        param_grid = self.param_grid
        param_grid['n_estimators'] = [self.max_n_estimators]
        grid = ParameterGrid(param_grid)

        pre_dispatch = self.pre_dispatch
        clfs = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                pre_dispatch=pre_dispatch)(
            delayed(fit_grid_point)(
                X, y, sample_weight, base_clf, clf_params, train, test,
                self.verbose, **self.fit_params)
                    for clf_params in grid
                    for train, test in cv)

        # now use the already fitted ensembles but truncate to N estimators for
        # N from 1 to n_estimators_max - 1 (inclusive)
        out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                pre_dispatch=pre_dispatch)(
            delayed(score_each_boost)(
                X, y, sample_weight,
                clf, clf_params,
                self.min_n_estimators,
                train, test,
                self.loss_func, self.score_func, self.verbose)
                    for clf, clf_params, train, test in clfs)

        out = reduce(operator.add, [zip(*stage) for stage in out])
        # out is now a list of triplet: score, estimator_params, n_test_samples

        n_estimators_points = self.max_n_estimators - self.min_n_estimators + 1
        n_grid_points = len(list(grid)) * n_estimators_points
        n_fits = len(out)
        n_folds = n_fits // n_grid_points

        scores = list()
        cv_scores = list()
        for block in range(0, n_fits, n_folds * n_estimators_points):
            for grid_start in range(block, block + n_estimators_points):
                n_test_samples = 0
                score = 0
                these_points = list()
                for this_score, clf_params, this_n_test_samples in \
                        out[grid_start:
                            grid_start + n_folds * n_estimators_points:
                            n_estimators_points]:
                    these_points.append(this_score)
                    if self.iid:
                        this_score *= this_n_test_samples
                    score += this_score
                    n_test_samples += this_n_test_samples
                if self.iid:
                    score /= float(n_test_samples)
                scores.append((score, clf_params))
                cv_scores.append(these_points)

        cv_scores = np.asarray(cv_scores)

        # Note: we do not use max(out) to make ties deterministic even if
        # comparison on estimator instances is not deterministic
        best_score = -np.inf
        for score, params in scores:
            if score > best_score:
                best_score = score
                best_params = params

        if best_score == -np.inf:
            raise ValueError('Best score could not be found')
        self.best_score_ = best_score
        self.best_params_ = best_params

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_clf).set_params(**best_params)
            if sample_weight is not None:
                best_estimator.fit(X, y, sample_weight, **self.fit_params)
            else:
                best_estimator.fit(X, y, **self.fit_params)
            self.best_estimator_ = best_estimator

        # Store the computed scores
        # XXX: the name is too specific, it shouldn't have
        # 'grid' in it. Also, we should be retrieving/storing variance
        self.grid_scores_ = [
            (clf_params, score, all_scores)
                    for (score, clf_params), all_scores
                    in zip(scores, cv_scores)]
        return self
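
When iid=True, the per-fold scores above are combined as a test-size-weighted average rather than a plain mean. A sketch with made-up fold scores and sizes:

fold_scores = [0.80, 0.90, 0.60]   # hypothetical per-fold scores
fold_sizes = [100, 50, 50]         # hypothetical test-set sizes

weighted = sum(s * n for s, n in zip(fold_scores, fold_sizes)) / float(sum(fold_sizes))
plain = sum(fold_scores) / float(len(fold_scores))
print(weighted, plain)   # 0.775 vs 0.766..., larger folds count for more when iid=True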
Example #44
0
    def _fit(self, X, y, parameter_iterable=None):
        if parameter_iterable is not None:
            raise NotImplementedError('The parameter_iterable argument is not supported.')

        # Actual fitting,  performing the search over parameters.
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # setup SigOpt experiment and run optimization
        n_folds = len(cv)
        self._create_sigopt_exp(self.sigopt_connection, n_folds)

        # start tracking time to optimize estimator
        opt_start_time = time.time()
        for jk in range(0, self.n_iter, self.n_sug):
            # check for opt timeout, ensuring at least 1 observation
            # TODO : handling failure observations
            if (
                self.opt_timeout is not None and
                time.time() - opt_start_time > self.opt_timeout and
                jk >= 1
            ):
                # break out of loop and refit model with best params so far
                break

            suggestions = []
            jobs = []
            for _ in range(self.n_sug):
                for train, test in cv:
                    suggestion = self.sigopt_connection.experiments(self.experiment.id).suggestions().create()
                    parameters = self._convert_sigopt_api_to_sklearn_assignments(suggestion.assignments.to_json())
                    suggestions.append(suggestion)
                    jobs.append([parameters, train, test])

            if self.verbose > 0:
                print('Evaluating params : ', [job[0] for job in jobs])


            # do CV folds in parallel using joblib
            # returns scores on test set
            obs_timed_out = False
            try:
                par_kwargs = {'n_jobs': self.n_jobs, 'verbose': self.verbose,
                              'pre_dispatch': pre_dispatch}
                # add timeout kwarg if version of joblib supports it
                if 'timeout' in getfullargspec(Parallel.__init__).args:
                    par_kwargs['timeout'] = self.cv_timeout
                out = Parallel(
                    **par_kwargs
                )(
                    delayed(_fit_and_score)(clone(base_estimator), X, y,
                                            self.scorer_, train, test,
                                            self.verbose, parameters,
                                            self.fit_params,
                                            return_parameters=True,
                                            error_score=self.error_score)
                        for parameters, train, test in jobs)
            except TimeoutError:
                obs_timed_out = True

            if not obs_timed_out:
                # grab scores from results
                for sidx, suggestion in enumerate(suggestions):
                    score = out[sidx][0]
                    self.sigopt_connection.experiments(self.experiment.id).observations().create(
                        suggestion=suggestion.id,
                        value=score)
            else:
                # observation timed out so report a failure
                self.sigopt_connection.experiments(self.experiment.id).observations().create(
                    suggestion=suggestion.id,
                    failed=True)

        # return best SigOpt assignments so far
        best_assignments = self.sigopt_connection.experiments(self.experiment.id).best_assignments().fetch().data

        if not best_assignments:
            raise RuntimeError(
                'No valid observations found. '
                'Make sure opt_timeout and cv_timeout provide sufficient time for observations to be reported.')

        self.best_params_ = self._convert_sigopt_api_to_sklearn_assignments(best_assignments[0].assignments.to_json())
        self.best_score_ = best_assignments[0].value

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(**self.best_params_)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
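
The opt_timeout guard above is worth isolating: the loop stops suggesting new parameters once the wall-clock budget is spent, but only after at least one round has run. A standalone sketch with hypothetical timings:

import time

opt_timeout = 0.1                 # hypothetical budget in seconds
opt_start_time = time.time()

completed = 0
for jk in range(0, 10):
    # stop once the budget is exceeded, but never before the first round
    if (opt_timeout is not None
            and time.time() - opt_start_time > opt_timeout
            and jk >= 1):
        break
    time.sleep(0.06)              # stand-in for one suggest/evaluate round
    completed += 1

print('rounds completed:', completed)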
Example #45
0
File: time_gen.py  Project: jaeilepp/eggie
def time_generalization(epochs_list,
                        clf=None,
                        cv=5,
                        scoring="roc_auc",
                        shuffle=True,
                        random_state=None,
                        n_jobs=1,
                        verbose=None):
    """Fit decoder at each time instant and test at all others

    The function returns the cross-validation scores when the train set
    is from one time instant and the test from all others.

    The decoding will be done using all available data channels, but
    will only work if one type of channel is available. For example,
    epochs should contain only gradiometers.

    Parameters
    ----------
    epochs_list : list of Epochs
        The epochs in all the conditions.
    clf : object | None
        A object following scikit-learn estimator API (fit & predict).
        If None the classifier will be a linear SVM (C=1.) after
        feature standardization.
    cv : integer or cross-validation generator, optional
        If an integer is passed, it is the number of folds (default 5).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects.
    scoring : {string, callable, None}, optional, default: "roc_auc"
        A string (see model evaluation documentation in scikit-learn) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    shuffle : bool
        If True, shuffle the epochs before splitting them in folds.
    random_state : None | int
        The random state used to shuffle the epochs. Ignored if
        shuffle is False.
    n_jobs : int
        Number of jobs to run in parallel. Each fold is fit
        in parallel.

    Returns
    -------
    scores : array, shape (n_times, n_times)
        The scores averaged across folds. scores[i, j] contains
        the generalization score when learning at time j and testing
        at time i. The diagonal is the cross-validation score
        at each time-independent instant.

    Notes
    -----
    The function implements the method used in:

    Jean-Remi King, Alexandre Gramfort, Aaron Schurger, Lionel Naccache
    and Stanislas Dehaene, "Two distinct dynamic modes subtend the detection of
    unexpected sounds", PLOS ONE, 2013
    """
    from sklearn.base import clone
    from sklearn.utils import check_random_state
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import check_cv

    if clf is None:
        scaler = StandardScaler()
        svc = SVC(C=1, kernel='linear')
        clf = Pipeline([('scaler', scaler), ('svc', svc)])

    info = epochs_list[0].info
    data_picks = pick_types(info, meg=True, eeg=True, exclude='bads')

    # Make arrays X and y such that :
    # X is 3d with X.shape[0] is the total number of epochs to classify
    # y is filled with integers coding for the class to predict
    # We must have X.shape[0] equal to y.shape[0]
    X = [e.get_data()[:, data_picks, :] for e in epochs_list]
    y = [k * np.ones(len(this_X)) for k, this_X in enumerate(X)]
    X = np.concatenate(X)
    y = np.concatenate(y)

    cv = check_cv(cv, X, y, classifier=True)

    ch_types = set([channel_type(info, idx) for idx in data_picks])
    logger.info('Running time generalization on %s epochs using %s.' %
                (len(X), ch_types.pop()))

    if shuffle:
        rng = check_random_state(random_state)
        order = np.argsort(rng.randn(len(X)))
        X = X[order]
        y = y[order]

    parallel, p_time_gen, _ = parallel_func(_time_gen_one_fold, n_jobs)
    scores = parallel(
        p_time_gen(clone(clf), X, y, train, test, scoring)
        for train, test in cv)
    scores = np.mean(scores, axis=0)
    return scores
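
The X/y construction above is a common MNE idiom: each condition contributes its epochs, and y is filled with the condition index. A toy sketch with random data:

import numpy as np

epochs_per_condition = [np.random.randn(3, 2, 5),   # hypothetical: 3 epochs, 2 channels, 5 times
                        np.random.randn(4, 2, 5)]   # hypothetical: 4 epochs

X = [e for e in epochs_per_condition]
y = [k * np.ones(len(this_X)) for k, this_X in enumerate(X)]
X = np.concatenate(X)
y = np.concatenate(y)
print(X.shape, y)   # (7, 2, 5) [0. 0. 0. 1. 1. 1. 1.]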
Example #46
0
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

        self.scorer_ = _deprecate_loss_and_score_funcs(self.loss_func,
                                                       self.score_func,
                                                       self.scoring)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(n_jobs=self.n_jobs,
                       verbose=self.verbose,
                       pre_dispatch=pre_dispatch)(delayed(fit_grid_point)(
                           X, y, base_estimator, parameters, train, test,
                           self.scorer_, self.verbose, **{
                               'sample_weight': balance_weights(y[train])
                           }) for parameters in parameter_iterable
                                                  for train, test in cv)

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, parameters, this_n_test_samples in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X,
                                   y,
                                   sample_weight=balance_weights(y),
                                   **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #47
0
    def fit(self, X, y=None, x_is_index=False):
        """
        fit creates a task for every pair of folds and combination of hyperparameters in the grid
        it then distributes the tasks to ipyparallel view and waits for completion
        :param X: ndarray of data
        :param y: ndarray of target variables
        :param x_is_index: boolean flag indicating that X is not the data itself,
            but the index of the data to be used on remote machines.
            Useful when sending the data over the network is infeasible
        """

        if not self.loader:
            self.loader = lambda: (X, y)

        parameter_iterable = ParameterGrid(self.param_grid)
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)

        if x_is_index and self.loader is None:
            raise ValueError('no loader given')

        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        train_test_parameters = ((train, test, apply_transforms(parameters, self.transforms)) \
                                 for parameters in parameter_iterable
                                 for train, test in cv)

        length = len(parameter_iterable) * len(cv)
        if self.callback:
            self.callback(len(self.cacher), length)

        if x_is_index:
            X_to_pass = X
            y_to_pass = y if self.loader is None else None
        else:
            if self.loader is not None:
                X_to_pass = None
                y_to_pass = None
            else:
                X_to_pass = X
                y_to_pass = y

        # print('sequences')

        # sequences = [
        #     train_test_parameters,
        #     [clone(base_estimator)] * length,
        #     [X_to_pass] * length,
        #     [y_to_pass] * length,
        #     [self.verbose] * length,
        #     [self.fit_params] * length,
        #     [True] * length,
        #     [self.scorer_] * length,
        #     [x_is_index] * length,
        # ]

        f = partial(GridSearchCVParallel.my_fit_and_score,
                    estimator=base_estimator,
                    X=X_to_pass,
                    y=y_to_pass,
                    fit_params=self.fit_params,
                    scorer=self.scorer_,
                    x_is_index=x_is_index,
                    loader=self.loader,
                    fit_callback=self.fit_callback)

        iterable = itertools.ifilter(lambda (i, ttp): i not in self.cacher,
                                     enumerate(train_test_parameters))

        results_by_params = defaultdict(list)
        for id, result in self.cacher.iteritems():
            score, test_size, time, params = result
            results_by_params[frozenset(map(map_param,
                                            params.iteritems()))].append(score)

        try:
            for index, result in self.mapper(f, iterable):
                self.cacher[index] = result
                score, test_size, time, params = result
                results_by_params[frozenset(map(
                    map_param, params.iteritems()))].append(score)
                if self.callback:
                    best_scores = next(
                        iter(
                            sorted(itertools.ifilter(
                                lambda scores: len(scores) == len(cv),
                                results_by_params.values()),
                                   key=lambda scores: np.mean(scores),
                                   reverse=True)), [0])

                    self.callback(1,
                                  length,
                                  description='%.3f+-%.3f' %
                                  (np.mean(best_scores), np.std(best_scores)))
        except Exception as e:
            print(e)
            e_type, e_value, e_tb = sys.exc_info()
            traceback.print_tb(e_tb)

        # assert len(self.cacher) == length and (np.array(self.cacher.keys()) == np.arange(length)).all()

        # out = self.cacher.values()
        #
        # # Out is a list of triplet: score, estimator, n_test_samples
        # n_fits = len(out)
        # n_folds = len(cv)
        #
        # scores = list()
        # grid_scores = list()
        # for grid_start in range(0, n_fits, n_folds):
        #     n_test_samples = 0
        #     score = 0
        #     all_scores = []
        #     for this_score, this_n_test_samples, _, parameters in \
        #             out[grid_start:grid_start + n_folds]:
        #         all_scores.append(this_score)
        #         if self.iid:
        #             this_score *= this_n_test_samples
        #             n_test_samples += this_n_test_samples
        #         score += this_score
        #     if self.iid:
        #         score /= float(n_test_samples)
        #     else:
        #         score /= float(n_folds)
        #     scores.append((score, parameters))
        #     # TODO: shall we also store the test_fold_sizes?
        #     grid_scores.append(_CVScoreTuple(
        #         parameters,
        #         score,
        #         np.array(all_scores)))

        grid_scores = []
        for set_params, all_scores in results_by_params.iteritems():
            grid_scores.append(
                _CVScoreTuple(dict(set_params), np.mean(all_scores),
                              np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        print(len(grid_scores))

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
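
The grouping trick above keys per-fold results by a frozenset of parameter items so scores from the same grid point land in one bucket (the map_param normalisation is omitted here). A minimal sketch with made-up results:

from collections import defaultdict

import numpy as np

results = [({'C': 1}, 0.8), ({'C': 10}, 0.7), ({'C': 1}, 0.9)]   # hypothetical (params, score)

results_by_params = defaultdict(list)
for params, score in results:
    results_by_params[frozenset(params.items())].append(score)

for key, scores in results_by_params.items():
    print(dict(key), np.mean(scores))   # {'C': 1} 0.85 / {'C': 10} 0.7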
Example #48
0
    def fit(self, epochs, y=None):
        """ Train a classifier on each specified time slice.

        Note. This function sets the ``picks_``, ``ch_names``, ``cv_``,
        ``y_train``, ``train_times_`` and ``estimators_`` attributes.

        Parameters
        ----------
        epochs : instance of Epochs
            The epochs.
        y : list or ndarray of int, shape (n_samples,) or None, optional
            To-be-fitted model values. If None, y = epochs.events[:, 2].

        Returns
        -------
        self : GeneralizationAcrossTime
            Returns fitted GeneralizationAcrossTime object.

        Notes
        -----
        If X and y are not C-ordered and contiguous arrays of np.float64 and
        X is not a scipy.sparse.csr_matrix, X and/or y may be copied.

        If X is a dense array, then the other methods will not support sparse
        matrices as input.
        """
        from sklearn.base import clone
        from sklearn.cross_validation import check_cv, StratifiedKFold

        # clean attributes
        for att in ['picks_', 'ch_names', 'y_train_', 'cv_', 'train_times_',
                    'estimators_', 'test_times_', 'y_pred_', 'y_true_',
                    'scores_', 'scorer_']:
            if hasattr(self, att):
                delattr(self, att)

        n_jobs = self.n_jobs
        # Extract data from MNE structure
        X, y, self.picks_ = _check_epochs_input(epochs, y, self.picks)
        self.ch_names = [epochs.ch_names[p] for p in self.picks_]

        cv = self.cv
        if isinstance(cv, (int, np.int)):
            cv = StratifiedKFold(y, cv)
        cv = check_cv(cv, X, y, classifier=True)
        self.cv_ = cv  # update CV

        self.y_train_ = y

        # Cross validation scheme
        # XXX Cross validation should later be transformed into a make_cv, and
        # defined in __init__
        self.train_times_ = copy.deepcopy(self.train_times)
        if 'slices' not in self.train_times_:
            self.train_times_ = _sliding_window(epochs.times, self.train_times)

        # Parallel across training time
        parallel, p_time_gen, n_jobs = parallel_func(_fit_slices, n_jobs)
        n_chunks = min(X.shape[2], n_jobs)
        splits = np.array_split(self.train_times_['slices'], n_chunks)

        def f(x):
            return np.unique(np.concatenate(x))

        out = parallel(p_time_gen(clone(self.clf),
                                  X[..., f(train_slices_chunk)],
                                  y, train_slices_chunk, cv)
                       for train_slices_chunk in splits)
        # Unpack estimators into time slices X folds list of lists.
        self.estimators_ = sum(out, list())
        return self
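
The chunking above splits the training-time slices across jobs and ships each chunk only the time samples it needs (via the f helper). A small sketch with hypothetical slices:

import numpy as np

slices = [np.array([0, 1]), np.array([1, 2]), np.array([2, 3]), np.array([3, 4])]
n_chunks = 2
splits = np.array_split(slices, n_chunks)

def f(x):
    return np.unique(np.concatenate(x))

for train_slices_chunk in splits:
    print(f(train_slices_chunk))   # unique time indices needed by this chunk: [0 1 2], then [2 3 4]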
Example #49
0
    def fit(self, X, y=None):
        parameter_iterable = ParameterGrid(self.param_grid)
        param_list, first = self._grid_to_simple_list(parameter_iterable)
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        base_estimator = clone(self.estimator)

        n_folds = len(cv)
        grid_scores = list()

        def func(x):
            parameters = self._list_to_grid_point(x, parameter_iterable)

            n_test_samples = 0
            score = 0
            all_scores = []

            for train, test in cv:
                this_score, this_n_test_samples, _, parameters = \
                        _fit_and_score(clone(base_estimator), X, y, self.scorer_,
                                       train, test, self.verbose, parameters,
                                       self.fit_params, return_parameters=True,
                                       error_score=self.error_score)
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score

            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)

            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))

            #print 'In func:', x, score
            return score

        max_evals = (17 if getattr(self, 'max_evals', None) is None
                     else self.max_evals)
        l = COGP(func,
                 maxEvaluations=max_evals,
                 grid=param_list,
                 minimize=False)
        out = l.learn()
        #print 'Out:', out

        self.grid_scores_ = grid_scores

        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #50
0
File: ifs.py  Project: teopir/ifqi
    def _fit(self, X, y, features_names=None, preload_features=None,
             ranking_th=0.005):
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True)
        # Initialization
        n_features = X.shape[1]
        features = np.arange(n_features)

        cv = self.cv
        cv = check_cv(cv, y, classifier=is_classifier(self.estimator))
        if sklearn.__version__ == '0.17':
            n_splits = cv.n_folds
        else:
            n_splits = cv.get_n_splits(X, y)

        if self.verbose > 0:
            print("Fitting {0} folds for each of iteration".format(n_splits))

        if 0.0 < self.n_features_step < 1.0:
            step = int(max(1, self.n_features_step * n_features))
        else:
            step = int(self.n_features_step)
        if step <= 0:
            raise ValueError("Step must be >0")

        if features_names is not None:
            features_names = np.array(features_names)
        else:
            if self.features_names is not None:
                features_names = self.features_names
            else:
                features_names = np.arange(n_features)  # use indices

        tentative_support_ = np.zeros(n_features, dtype=np.bool)
        current_support_ = np.zeros(n_features, dtype=np.bool)

        self.scores_ = []
        self.scores_confidences_ = []
        self.features_per_it_ = []

        if preload_features is not None:
            preload_features = np.unique(preload_features).astype('int')
            current_support_[preload_features] = True

            X_selected = X[:, features[current_support_]]
            y_hat, cv_scores = my_cross_val_predict(clone(self.estimator),
                                                    X_selected, y, cv=cv)
            target = y - y_hat
        else:
            target = y.copy()

        score, confidence_interval = -np.inf, 0
        proceed = np.sum(current_support_) < X.shape[1]
        while proceed:
            if self.verbose > 0:
                print('\nN-times variance of target: {}'.format(
                    target.var() * target.shape[0]))
            # update values
            old_confidence_interval = confidence_interval
            old_score = score

            if self.scale:
                target = StandardScaler().fit_transform(target.reshape(
                    -1, 1)).ravel()
                # target = MinMaxScaler().fit_transform(target.reshape(
                #     -1,1)).ravel()

            if self.verbose > 0:
                print()
                print('Feature ranking')
                print()
                print("target shape: {}".format(target.shape))
                print()

            # Rank the remaining features
            start_t = time.time()
            rank_estimator = clone(self.estimator)
            rank_estimator.fit(X, target)
            end_fit = time.time() - start_t

            # Get coefs
            start_t = time.time()
            if hasattr(rank_estimator, 'coef_'):
                coefs = rank_estimator.coef_
            elif hasattr(rank_estimator, 'feature_importances_'):
                coefs = rank_estimator.feature_importances_
            else:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')
            end_rank = time.time() - start_t

            # Get ranks by ordering in ascending way
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
                coefs = coefs.sum(axis=0)
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            if self.verbose > 0:
                ranked_f = features[ranks]
                if features_names is not None:
                    ranked_n = features_names[ranks]
                else:
                    ranked_n = ['-'] * n_features
                print('{:6}\t{:6}\t{:8}\t{}'.format('Rank', 'Index', 'Score',
                                                    'Feature Name'))
                for i in range(n_features):
                    idx = n_features - i - 1
                    if (coefs[ranks[idx]] < ranking_th) and (i > 2):
                        print(' ...')
                        break
                    print('#{:6}\t{:6}\t{:6f}\t{}'.format(str(i),
                                                          str(ranked_f[idx]),
                                                          coefs[ranks[idx]],
                                                          ranked_n[idx]))
                print(
                    "\n Fit done in {} s and rank done in {} s".format(end_fit,
                                                                       end_rank))

            # if coefs[ranks][-1] < 1e-5:
            #     if self.verbose > 0:
            #         import warnings
            #         warnings.warn('scores are too small to be used, please standardize inputs')
            #     break

            # get the best features (ie, the latest one)
            # if the most ranked features is selected go on a select
            # other features accordingly to the ranking

            # threshold = step
            # step_features = features[ranks][-threshold:]

            ii = len(features_names) - 1
            step_features = features[ranks][ii]
            while np.all(current_support_[step_features]) and ii > 0:
                ii -= 1
                step_features = features[ranks][ii]

            if np.all(current_support_[step_features]):
                if self.verbose > 0:
                    print("Selected features: {} {}".format(
                        features_names[step_features], step_features))
                    # if features_names is not None:
                    #     print("Selected features: {} {}".format(features_names[ranks][-threshold:], step_features))
                    # else:
                    #     print("Selected features: {}".format(step_features))
                    print('Ended because selected features already selected')
                step_features = None
                break

            # update selected features
            tentative_support_[step_features] = True

            # get the selected features
            X_selected = X[:, features[tentative_support_]]

            start_t = time.time()
            # cross validates to obtain the scores
            y_hat, cv_scores = my_cross_val_predict(clone(self.estimator),
                                                    X_selected, y, cv=cv)
            # y_hat = cross_val_predict(clone(self.estimator), X_selected, y, cv=cv)

            # compute new target
            target = y - y_hat

            # compute score and confidence interval
            # score = r2_score(y_true=y, y_pred=y_hat, multioutput='uniform_average')  # np.mean(cv_scores)
            if self.verbose > 0:
                print('r2: {}'.format(np.mean(cv_scores, axis=0)))

            score = np.mean(cv_scores)
            if len(cv_scores.shape) > 1:
                cv_scores = np.mean(cv_scores, axis=1)
            m2 = np.mean(cv_scores * cv_scores)
            confidence_interval_or = np.sqrt(
                (m2 - score * score) / (n_splits - 1))

            end_t = time.time() - start_t

            if self.verbose > 0:
                # if features_names is not None:
                print("Selected features: {} {}".format(
                    features_names[step_features], step_features))
                print("Total features: {} {}".format(
                    features_names[tentative_support_],
                    features[tentative_support_]))
                # else:
                #     print("Selected features: {}".format(step_features))
                #     print("Total features: {}".format(features[tentative_support_]))
                print("R2= {} +- {}".format(score, confidence_interval_or))
                print("\nCrossvalidation done in {} s".format(end_t))

            confidence_interval = confidence_interval_or * self.significance  # do not trust confidence interval completely

            # check terminal condition
            proceed = score - old_score > old_confidence_interval + confidence_interval
            if self.verbose > 0:
                print("PROCEED: {}\n\t{} - {} > {} + {}\n\t{} > {} )".format(
                    proceed, score, old_score,
                    old_confidence_interval,
                    confidence_interval,
                    score - old_score,
                    old_confidence_interval + confidence_interval))

            if proceed or np.sum(current_support_) == 0:
                # last feature set proved to be informative
                # we need to take into account of the new features (update current support)
                current_support_[step_features] = True
                self.features_per_it_.append(features_names[step_features])
                self.scores_.append(score)
                self.scores_confidences_.append(confidence_interval)

                # all the features are selected, stop
                if np.sum(current_support_) == n_features:
                    if self.verbose > 0:
                        print("All the features has been selected.")
                    proceed = False
            else:
                # last feature set proved to be not informative
                # keep old support and delete the current one (it is no more necessary)
                del tentative_support_
                if self.verbose > 0:
                    print('Last feature {} not added to the set'.format(
                        features_names[step_features]))

        # Set final attributes
        self.estimator_ = clone(self.estimator)
        # self.estimator_.fit(Xns[:, current_support_], yns)
        self.estimator_.fit(X[:, current_support_], y)

        self.n_features_ = current_support_.sum()
        self.support_ = current_support_
        # self.ranking_ = ranking_

        return self
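
The stopping rule above keeps adding features only while the CV-score improvement beats the combined confidence intervals. Reduced to made-up numbers:

old_score, old_confidence_interval = 0.60, 0.02   # hypothetical previous iteration
score, confidence_interval = 0.65, 0.02           # hypothetical current iteration

proceed = score - old_score > old_confidence_interval + confidence_interval
print(proceed)   # True: 0.05 > 0.04, so the new feature is kept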
Example #51
0
    def _fit(self, X, y, parameter_dict):

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

        creator.create("FitnessMax", base.Fitness, weights=(1.0, ))
        creator.create("Individual",
                       list,
                       est=clone(self.estimator),
                       fitness=creator.FitnessMax)

        toolbox = base.Toolbox()

        name_values, gene_type, maxints = _get_param_types_maxint(
            parameter_dict)
        if self.gene_type is None:
            self.gene_type = gene_type

        if self.verbose:
            print("Types %s and maxint %s detected" %
                  (self.gene_type, maxints))

        toolbox.register("individual",
                         _initIndividual,
                         creator.Individual,
                         maxints=maxints)
        toolbox.register("population", tools.initRepeat, list,
                         toolbox.individual)

        toolbox.register("evaluate",
                         _evalFunction,
                         name_values=name_values,
                         X=X,
                         y=y,
                         scorer=self.scorer_,
                         cv=cv,
                         iid=self.iid,
                         verbose=self.verbose,
                         error_score=self.error_score,
                         fit_params=self.fit_params)

        toolbox.register("mate",
                         _cxIndividual,
                         indpb=self.gene_crossover_prob,
                         gene_type=self.gene_type)

        toolbox.register("mutate",
                         _mutIndividual,
                         indpb=self.gene_mutation_prob,
                         up=maxints)
        toolbox.register("select",
                         tools.selTournament,
                         tournsize=self.tournament_size)

        if self.n_jobs > 1:
            pool = Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        pop = toolbox.population(n=self.population_size)
        hof = tools.HallOfFame(1)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean)
        stats.register("min", np.min)
        stats.register("max", np.max)

        if self.verbose:
            print('--- Evolve in {0} possible combinations ---'.format(
                np.prod(np.array(maxints) + 1)))

        pop, logbook = algorithms.eaSimple(pop,
                                           toolbox,
                                           cxpb=0.5,
                                           mutpb=0.2,
                                           ngen=self.generations_number,
                                           stats=stats,
                                           halloffame=hof,
                                           verbose=self.verbose)

        current_best_score_ = hof[0].fitness.values[0]
        current_best_params_ = _individual_to_params(hof[0], name_values)

        if self.verbose:
            print("Best individual is: %s\nwith fitness: %s" %
                  (current_best_params_, current_best_score_))

        if current_best_score_ > self.best_score_:
            self.best_score_ = current_best_score_
            self.best_params_ = current_best_params_
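
The evolutionary search above encodes each candidate as a list of integers bounded by maxints, one index per parameter. A toy sketch of that encoding with a hypothetical grid:

param_dict = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}   # hypothetical grid

name_values = sorted(param_dict.items())
maxints = [len(values) - 1 for _, values in name_values]

individual = [2, 0]   # one possible individual within the bounds
params = {name: values[idx] for (name, values), idx in zip(name_values, individual)}
print(maxints, params)   # [2, 1] {'C': 10, 'kernel': 'linear'}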
Example #52
0
def time_generalization(epochs_list, clf=None, cv=5, scoring="roc_auc",
                        shuffle=True, random_state=None, n_jobs=1,
                        verbose=None):
    """Fit decoder at each time instant and test at all others

    The function returns the cross-validation scores when the train set
    is from one time instant and the test from all others.

    The decoding will be done using all available data channels, but
    will only work if one type of channel is available. For example,
    epochs should contain only gradiometers.

    Parameters
    ----------
    epochs_list : list of Epochs
        The epochs in all the conditions.
    clf : object | None
        A object following scikit-learn estimator API (fit & predict).
        If None the classifier will be a linear SVM (C=1.) after
        feature standardization.
    cv : integer or cross-validation generator, optional
        If an integer is passed, it is the number of folds (default 5).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects.
    scoring : {string, callable, None}, optional, default: "roc_auc"
        A string (see model evaluation documentation in scikit-learn) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    shuffle : bool
        If True, shuffle the epochs before splitting them in folds.
    random_state : None | int
        The random state used to shuffle the epochs. Ignored if
        shuffle is False.
    n_jobs : int
        Number of jobs to run in parallel. Each fold is fit
        in parallel.

    Returns
    -------
    scores : array, shape (n_times, n_times)
        The scores averaged across folds. scores[i, j] contains
        the generalization score when learning at time j and testing
        at time i. The diagonal is the cross-validation score
        at each time-independent instant.

    Notes
    -----
    The function implements the method used in:

    Jean-Remi King, Alexandre Gramfort, Aaron Schurger, Lionel Naccache
    and Stanislas Dehaene, "Two distinct dynamic modes subtend the detection of
    unexpected sounds", PLOS ONE, 2013
    """
    from sklearn.base import clone
    from sklearn.utils import check_random_state
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import check_cv

    if clf is None:
        scaler = StandardScaler()
        svc = SVC(C=1, kernel='linear')
        clf = Pipeline([('scaler', scaler), ('svc', svc)])

    info = epochs_list[0].info
    data_picks = pick_types(info, meg=True, eeg=True, exclude='bads')

    # Make arrays X and y such that :
    # X is 3d with X.shape[0] is the total number of epochs to classify
    # y is filled with integers coding for the class to predict
    # We must have X.shape[0] equal to y.shape[0]
    X = [e.get_data()[:, data_picks, :] for e in epochs_list]
    y = [k * np.ones(len(this_X)) for k, this_X in enumerate(X)]
    X = np.concatenate(X)
    y = np.concatenate(y)

    cv = check_cv(cv, X, y, classifier=True)

    ch_types = set([channel_type(info, idx) for idx in data_picks])
    logger.info('Running time generalization on %s epochs using %s.' %
                (len(X), ch_types.pop()))

    if shuffle:
        rng = check_random_state(random_state)
        order = np.argsort(rng.randn(len(X)))
        X = X[order]
        y = y[order]

    parallel, p_time_gen, _ = parallel_func(_time_gen_one_fold, n_jobs)
    scores = parallel(p_time_gen(clone(clf), X, y, train, test, scoring)
                      for train, test in cv)
    scores = np.mean(scores, axis=0)
    return scores
Example #53
0
    def fit(self, X, y=None, x_is_index=False, X_name='X', y_name='y'):

        parameter_iterable = ParameterGrid(self.param_grid)
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)

        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)


        # out = Parallel(
        #     n_jobs=self.n_jobs, verbose=self.verbose,
        #     pre_dispatch=pre_dispatch
        # )(
        #     delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
        #                             train, test, self.verbose, parameters,
        #                             self.fit_params, return_parameters=True,
        #                             error_score=self.error_score)
        #         for parameters in parameter_iterable
        #         for train, test in cv)

        train_test_parameters = ((train, test, parameters) \
                                 for parameters in parameter_iterable for train, test in cv)

        length = len(parameter_iterable) * len(cv)

        if x_is_index:
            X_to_pass = X
            y_to_pass = None
        else:
            X_to_pass = None
            y_to_pass = None

        self.view.block = False
        # print('sequences')

        # sequences = [
        #     train_test_parameters,
        #     [clone(base_estimator)] * length,
        #     [X_to_pass] * length,
        #     [y_to_pass] * length,
        #     [self.verbose] * length,
        #     [self.fit_params] * length,
        #     [True] * length,
        #     [self.scorer_] * length,
        #     [x_is_index] * length,
        # ]

        f = partial(my_fit_and_score, estimator=clone(base_estimator),
                    X=X_to_pass,
                    y=y_to_pass,
                    verbose=self.verbose,
                    fit_params=self.fit_params,
                    return_parameters=True,
                    scorer=None,
                    x_is_index=x_is_index,
                    names=(X_name, y_name))

        # print('before map')

        # import cProfile
        #
        # pr = cProfile.Profile()
        # pr.enable()
        chunksize = 10

        out = self.view.map(f, itertools.islice(train_test_parameters, 0, length),
                            ordered=False,
                            block=False,
                            chunksize=chunksize)  # length / len(self.view))
        # pr.disable()
        # pr.print_stats('cumulative')
        print('map called')
        if self.callback is not None:
            old_progress = out.progress
            while not out.ready():
                self.callback(out.progress * chunksize, length, out.elapsed)
                if old_progress == out.progress and out.progress > 0:
                    for id, info in self.view.queue_status(verbose=True).items():
                        # print(id, info)
                        if isinstance(info, dict) and 'queue' in info and len(info['queue']) > 0:
                            print(id, info['queue'])

                    pass
                old_progress = out.progress
                sleep(10)
        print('map ready')
        out = out.get()


        # Out is a list of tuples: (score, n_test_samples, scoring_time, parameters)
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
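
The aggregation loop above (and the near-identical loops in the later examples) turns the per-fold results into one score per parameter candidate: a test-size-weighted mean when iid is true, a plain mean over folds otherwise. A small self-contained sketch of that arithmetic; the helper name is ours, not part of the original code:

import numpy as np

def aggregate_fold_scores(fold_scores, fold_sizes, iid=True):
    """Weighted (iid) or plain mean of per-fold scores for one candidate."""
    if iid:
        return np.average(fold_scores, weights=fold_sizes)
    return np.mean(fold_scores)

print(aggregate_fold_scores([0.80, 0.90, 0.70], [10, 20, 10], iid=True))   # -> 0.825
print(aggregate_fold_scores([0.80, 0.90, 0.70], [10, 20, 10], iid=False))  # -> 0.8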
예제 #54
0
    def fit(self, X, y):
        """Fit the learner

        Parameters
        ----------
        X : list of Niimg-like objects
            See http://nilearn.github.io/manipulating_visualizing/manipulating_images.html#niimg
            Data on which model is to be fitted. If this is a list,
            the affine is considered the same for all.

        y : array or list of length n_samples
            The dependent variable (age, sex, QI, etc.).

        Returns
        -------
        self : `SpaceNet` object
            Model selection is via cross-validation with bagging.
        """
        # misc
        self.check_params()
        if self.memory is None or isinstance(self.memory, _basestring):
            self.memory_ = Memory(self.memory,
                                  verbose=max(0, self.verbose - 1))
        else:
            self.memory_ = self.memory
        if self.verbose:
            tic = time.time()

        # nifti masking
        if isinstance(self.mask, NiftiMasker):
            self.masker_ = clone(self.mask)
        else:
            self.masker_ = NiftiMasker(mask_img=self.mask,
                                       target_affine=self.target_affine,
                                       target_shape=self.target_shape,
                                       standardize=self.standardize,
                                       low_pass=self.low_pass,
                                       high_pass=self.high_pass,
                                       mask_strategy='epi', t_r=self.t_r,
                                       memory=self.memory_)
        X = self.masker_.fit_transform(X)

        # misc
        self.Xmean_ = X.mean(axis=0)
        self.Xstd_ = X.std(axis=0)
        self.Xstd_[self.Xstd_ < 1e-8] = 1
        self.mask_img_ = self.masker_.mask_img_
        self.mask_ = self.mask_img_.get_data().astype(np.bool)
        n_samples, _ = X.shape
        y = np.array(y).copy()
        l1_ratios = self.l1_ratios
        if isinstance(l1_ratios, numbers.Number):
            l1_ratios = [l1_ratios]
        alphas = self.alphas
        if isinstance(alphas, numbers.Number):
            alphas = [alphas]
        if self.loss is not None:
            loss = self.loss
        elif self.is_classif:
            loss = "logistic"
        else:
            loss = "mse"

        # set backend solver
        if self.penalty.lower() == "graph-net":
            if not self.is_classif or loss == "mse":
                solver = _graph_net_squared_loss
            else:
                solver = _graph_net_logistic
        else:
            if not self.is_classif or loss == "mse":
                solver = partial(tvl1_solver, loss="mse")
            else:
                solver = partial(tvl1_solver, loss="logistic")

        # generate fold indices
        case1 = (None in [alphas, l1_ratios]) and self.n_alphas > 1
        case2 = (alphas is not None) and min(len(l1_ratios), len(alphas)) > 1
        if case1 or case2:
            self.cv_ = list(check_cv(self.cv, X=X, y=y,
                                     classifier=self.is_classif))
        else:
            # no cross-validation needed, user supplied all params
            self.cv_ = [(np.arange(n_samples), [])]
        n_folds = len(self.cv_)

        # number of problems to solve
        if self.is_classif:
            y = self._binarize_y(y)
        else:
            y = y[:, np.newaxis]
        if self.is_classif and self.n_classes_ > 2:
            n_problems = self.n_classes_
        else:
            n_problems = 1

        # standardize y
        self.ymean_ = np.zeros(y.shape[0])
        if n_problems == 1:
            y = y[:, 0]

        # scores & mean weights map over all folds
        self.cv_scores_ = [[] for i in range(n_problems)]
        w = np.zeros((n_problems, X.shape[1] + 1))
        self.all_coef_ = np.ndarray((n_problems, n_folds, X.shape[1]))

        self.screening_percentile_ = _adjust_screening_percentile(
                self.screening_percentile, self.mask_img_,
                verbose=self.verbose)

        # main loop: loop on classes and folds
        solver_params = dict(tol=self.tol, max_iter=self.max_iter)
        self.best_model_params_ = []
        self.alpha_grids_ = []
        for (test_scores, best_w, best_alpha, best_l1_ratio, alphas,
             y_train_mean, (cls, fold)) in Parallel(
            n_jobs=self.n_jobs, verbose=2 * self.verbose)(
                delayed(self._cache(path_scores, func_memory_level=2))(
                solver, X, y[:, cls] if n_problems > 1 else y, self.mask_,
                alphas, l1_ratios, self.cv_[fold][0], self.cv_[fold][1],
                solver_params, n_alphas=self.n_alphas, eps=self.eps,
                is_classif=self.loss == "logistic", key=(cls, fold),
                debias=self.debias, verbose=self.verbose,
                screening_percentile=self.screening_percentile_,
                ) for cls in range(n_problems) for fold in range(n_folds)):
            self.best_model_params_.append((best_alpha, best_l1_ratio))
            self.alpha_grids_.append(alphas)
            self.ymean_[cls] += y_train_mean
            self.all_coef_[cls, fold] = best_w[:-1]
            if len(np.atleast_1d(l1_ratios)) == 1:
                test_scores = test_scores[0]
            self.cv_scores_[cls].append(test_scores)
            w[cls] += best_w

        # misc
        self.cv_scores_ = np.array(self.cv_scores_)
        self.alpha_grids_ = np.array(self.alpha_grids_)
        self.ymean_ /= n_folds
        if not self.is_classif:
            self.all_coef_ = np.array(self.all_coef_)
            w = w[0]
            self.ymean_ = self.ymean_[0]

        # bagging: average best weights maps over folds
        w /= n_folds

        # set coefs and intercepts
        self._set_coef_and_intercept(w)

        # unmask weights map as a niimg
        self.coef_img_ = self.masker_.inverse_transform(self.coef_)

        # report time elapsed
        if self.verbose:
            duration = time.time() - tic
            print("Time Elapsed: %g seconds, %i minutes." % (
                duration, duration / 60.))

        return self
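
The fold bookkeeping above either builds real CV splits or, when the user has pinned every hyperparameter, falls back to a single pseudo-fold that trains on all samples and holds nothing out. A rough sketch of the same idea using scikit-learn's current model_selection API (the original relies on the older check_cv signature):

import numpy as np
from sklearn.model_selection import KFold

n_samples = 20
need_cv = False   # e.g. a single alpha and a single l1_ratio were supplied

if need_cv:
    cv_ = list(KFold(n_splits=5).split(np.arange(n_samples)))
else:
    # single pseudo-fold: train on everything, hold nothing out
    cv_ = [(np.arange(n_samples), np.array([], dtype=int))]

print(len(cv_), [len(test) for _, test in cv_])   # 1 [0]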
예제 #55
0
    def _fit(self, X, y):

        """Actual fitting,  performing the search over parameters."""
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # setup SigOpt experiment and run optimization
        self._create_sigopt_exp()
        for jk in range(self.n_iter):
            suggestion = self.conn.experiments(self.experiment.id).suggestions().create()
            parameters = suggestion.assignments.to_json()
     
            # convert all unicode names and values to plain strings
            non_unicode_parameters = self._convert_unicode_dict(parameters)

            if self.verbose > 0:
                print "Evaluating params : ",non_unicode_parameters

            # do CV folds in parallel using joblib
            # returns scores on test set
            out = Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose,
                pre_dispatch=pre_dispatch
            )(
                delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                        train, test, self.verbose, non_unicode_parameters,
                                        self.fit_params, return_parameters=True,
                                        error_score=self.error_score)
                    for train, test in cv)

            # grab scores from results
            scores = [o[0] for o in out]
            self.conn.experiments(self.experiment.id).observations().create(
                suggestion=suggestion.id,
                value=numpy.mean(scores),
                value_stddev=numpy.std(scores)
            )
              
        # return best SigOpt observation so far
        best_obs = self.conn.experiments(self.experiment.id).fetch().progress.best_observation
        self.best_params_ = best_obs.assignments.to_json()
         # convert all unicode names and values to plain strings
        self.best_params_ = self._convert_unicode_dict(self.best_params_)
        self.best_score_ = best_obs.value

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
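
For one suggested parameter setting, the Parallel block above simply evaluates that setting on every CV fold and reports the mean and standard deviation of the fold scores back as an observation. A minimal sketch of the equivalent evaluation with cross_val_score, using a stand-in dictionary where a SigOpt suggestion would normally supply the values:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
suggested = {'C': 1.0, 'kernel': 'linear'}   # stand-in for suggestion.assignments

scores = cross_val_score(SVC(**suggested), X, y, cv=5)
value, value_stddev = np.mean(scores), np.std(scores)
print(value, value_stddev)   # what would be reported back as one observation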
예제 #56
0
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

        self.scorer_ = _deprecate_loss_and_score_funcs(
            self.loss_func, self.score_func, self.scoring)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(fit_grid_point_extended)(
                    X, y, base_estimator, parameters, train, test,
                    self.scorer_, self.verbose, **self.fit_params)
                for parameters in parameter_iterable
                for train, test in cv)
        
#         out = []
#         for parameters in parameter_iterable:
#             fold = 1
#             for train, test in cv:
#                 print "Processing fold", fold, self.fit_params
#                 out.append(fit_grid_point_extended(X, y, base_estimator, parameters, train, test, self.scorer_, self.verbose, **self.fit_params))
#                 fold += 1

        # Out is a list of tuples: (score, parameters, n_test_samples, extra)
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_extras = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            all_extras = list()
            for this_score, parameters, this_n_test_samples, extra in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                all_extras.append(extra)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
            grid_extras.append(all_extras)
        # Store the computed scores
        self.grid_scores_ = grid_scores
        self.extras_ = grid_extras

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
예제 #57
0
    def fit(self, X, y=None, **params):
        """Run fit with all sets of parameters

        Returns the best classifier

        Parameters
        ----------

        X: array, [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y: array-like, shape = [n_samples], optional
            Target vector relative to X for classification;
            None for unsupervised learning.

        """
        
        import os
        import binascii
        import itertools
        
        self._set_params(**params)
        estimator = self.estimator
        cv = self.cv
        if hasattr(X, 'shape'):
            n_samples = X.shape[0]
        else:
            # support list of unstructured objects on which feature
            # extraction will be applied later in the transformer chain
            n_samples = len(X)
        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        grid = list(IterGrid(self.param_grid))
        random.shuffle(grid)  # randomize the order in which grid points are dispatched
        base_clf = clone(self.estimator)
        pre_dispatch = self.pre_dispatch
        
        suffix = binascii.hexlify(os.urandom(10))
        
        @parallel.util.interactive
        def push_data(X, y, suffix):
            data = dict(X=X, y=y)
            g = globals()
            g.update({'data_'+suffix : data})
        
        push_ars = []
        ids = self.view.targets or self.view.client.ids
        for id in ids:
            with self.view.temp_flags(targets=[id]):
                push_ars.append(self.view.apply_async(push_data, X, y, suffix))
        
        self._push_results = push_ars
        
        self.view.follow = parallel.Dependency(push_ars, all=False)
        
        ars = []
        rX = parallel.Reference('data_%s["X"]' % suffix)
        ry = parallel.Reference('data_%s["y"]' % suffix)
        for param_id, clf_params in enumerate(grid):
            for train,test in cv:
                ars.append(self.view.apply_async(fit_grid_point,
                        rX, ry, base_clf, clf_params, train, test,
                        self.loss_func, self.score_func, self.verbose,
                        param_id=param_id, **self.fit_params)
                )
        
        # clear follow dependency
        self.view.follow = None
        
        self._fit_results = ars
        return self
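
The push_data/Reference pattern above ships X and y to each engine once and lets every task refer to them by name, so the data is not re-serialized for every grid point. A rough sketch of that pattern with ipyparallel, assuming a cluster is already running (e.g. via `ipcluster start`); names and shapes are invented:

import numpy as np
import ipyparallel as ipp

rc = ipp.Client()
view = rc[:]  # a DirectView over all engines

# Push the data once, under a unique name, into each engine's namespace.
view.push({'data_demo': {'X': np.arange(12).reshape(6, 2), 'y': np.arange(6)}},
          block=True)

# Tasks refer to the engine-local objects by name instead of shipping them again.
rX = ipp.Reference("data_demo['X']")
ry = ipp.Reference("data_demo['y']")

def shapes(a, b):
    return a.shape, b.shape

print(view.apply_sync(shapes, rX, ry))  # one ((6, 2), (6,)) per engine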
예제 #58
0
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        param_grid = [(parameters, train, test)
                      for parameters in parameter_iterable
                      for (train, test) in cv]
        # Because the original python code expects a certain order for the elements, we need to
        # respect it.
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        scorer = self.scorer_
        verbose = self.verbose
        fit_params = self.fit_params
        error_score = self.error_score
        fas = _fit_and_score

        def fun(tup):
            (index, (parameters, train, test)) = tup
            local_estimator = clone(base_estimator)
            local_X = X_bc.value
            local_y = y_bc.value
            res = fas(local_estimator, local_X, local_y, scorer, train, test,
                      verbose, parameters, fit_params,
                      return_parameters=True, error_score=error_score)
            return (index, res)
        indexed_out0 = dict(par_param_grid.map(fun).collect())
        out = [indexed_out0[idx] for idx in range(len(param_grid))]

        X_bc.unpersist()
        y_bc.unpersist()

        # Out is a list of tuples: (score, n_test_samples, scoring_time, parameters)
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
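
The index bookkeeping above matters because Spark may hand results back in any order, while the scoring loop assumes the folds for each candidate are contiguous and in grid order. A tiny Spark-free sketch of that reordering trick:

import random

param_grid = [(p, f) for p in ('params_a', 'params_b') for f in ('fold0', 'fold1')]
indexed = list(enumerate(param_grid))

# Simulate a distributed map that returns results in arbitrary order.
random.shuffle(indexed)
indexed_out = {idx: 'score(%s, %s)' % pf for idx, pf in indexed}

# Rebuild the candidate-major order that the scoring loop relies on.
out = [indexed_out[idx] for idx in range(len(param_grid))]
print(out)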