예제 #1
0
def cross_val_score(estimator,
                    X,
                    y=None,
                    score_func=None,
                    cv=None,
                    n_jobs=-1,
                    verbose=0,
                    as_dvalues=False):
    """Score an estimator on every cross-validation fold.

    Stand-in for :func:`sklearn.cross_validation.cross_val_score` that
    additionally forwards ``as_dvalues`` to the per-fold worker
    ``_cross_val_score``, so that decision values can be computed.

    The inputs are validated with ``check_arrays``/``check_cv``; one job
    per ``(train, test)`` pair of the resulting splitter is dispatched
    through ``Parallel`` and the collected results are returned as a NumPy
    array.

    Raises ``TypeError`` when ``score_func`` is None and ``estimator`` has
    no ``score`` attribute.
    """
    X, y = check_arrays(X, y, sparse_format='csr')
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if score_func is None and not hasattr(estimator, 'score'):
        raise TypeError(
            "If no score_func is specified, the estimator passed "
            "should have a 'score' method. The estimator %s "
            "does not." % estimator)

    run_parallel = Parallel(n_jobs=n_jobs, verbose=verbose)
    # Every fold works on its own clone of the estimator: this keeps the
    # folds independent of each other and the job arguments pickle-able.
    fold_jobs = (
        delayed(_cross_val_score)(clone(estimator), X, y, score_func,
                                  train_idx, test_idx, verbose, as_dvalues)
        for train_idx, test_idx in cv)
    fold_results = run_parallel(fold_jobs)
    return np.array(fold_results)
예제 #2
0
def Bootstrap_cv(estimator1, estimator2, X, y, score_func, cv=None, n_jobs=1,
                 verbose=0, ratio=.5):
    """Evaluate a pair of estimators on every cross-validation fold.

    The inputs are validated with ``cross_validation.check_arrays`` and
    ``cross_validation.check_cv`` (the classifier flag is derived from
    ``estimator1``). For each ``(train, test)`` pair, ``dual_cross_val_score``
    is run through ``Parallel`` on fresh clones of both estimators, with
    ``ratio`` passed along unchanged.

    Returns a NumPy array built from the per-fold results.

    Raises ``TypeError`` when ``score_func`` is None and one of the two
    estimators has no ``score`` attribute; the message names the estimator
    that is actually missing it.
    """
    X, y = cross_validation.check_arrays(X, y, sparse_format='csr')
    cv = cross_validation.check_cv(
        cv, X, y, classifier=cross_validation.is_classifier(estimator1))

    if score_func is None:
        # Report the estimator that really lacks ``score``. Previously the
        # message always blamed estimator1, even when only estimator2 was
        # at fault.
        for candidate in (estimator1, estimator2):
            if not hasattr(candidate, 'score'):
                raise TypeError(
                    "If no score_func is specified, the estimator passed "
                    "should have a 'score' method. The estimator %s "
                    "does not." % (candidate,))

    # We clone the estimators to make sure that all the folds are
    # independent, and that they are pickle-able.
    run_parallel = cross_validation.Parallel(n_jobs=n_jobs, verbose=verbose)
    scores = run_parallel(
        cross_validation.delayed(dual_cross_val_score)(
            cross_validation.clone(estimator1),
            cross_validation.clone(estimator2),
            X, y, score_func, train, test, verbose, ratio)
        for train, test in cv)
    return np.array(scores)
예제 #3
0
    def _grid_search(self, train_X, train_y):
        """Run the grid-search master on one training split.

        ``self.inner_cv`` is either a callable that builds the inner
        splitter from ``(train_X, train_y)``, or a value that is handed to
        ``check_cv``. Returns whatever ``MPIGridSearchCVMaster.run`` returns.
        """
        splitter = self.inner_cv
        if not callable(splitter):
            folds = check_cv(splitter, train_X, train_y,
                             classifier=is_classifier(self.estimator))
        else:
            folds = splitter(train_X, train_y)

        search_master = MPIGridSearchCVMaster(
            self.param_grid, folds, self.estimator, self.scorer_,
            self.fit_params)
        return search_master.run(train_X, train_y)
예제 #4
0
def dynamic_cross_val_predict(estimator, fv, esa_feature_list, unigram_feature_list, dynamic_X, y=None, cv=None,
                              verbose=0, fit_params=None):
    """Cross-validated prediction with features that are rebuilt per fold.

    ``fv`` is a sequence of feature dicts (one per sample). For fold number
    ``k`` every dict of a deep copy of ``fv`` is updated with the entries
    ``dynamic_X[k][find_index_for_dynamic_feature(feature)][i]`` for each
    feature named in ``esa_feature_list`` and ``unigram_feature_list``; the
    copy is then vectorised with a ``DictVectorizer`` and passed to
    ``cross_validation._fit_and_predict`` on a clone of ``estimator``.

    Returns the predictions of all folds, reordered so that they line up
    with the original sample order.

    Raises ``ValueError`` if the test indices of ``cv`` do not form a
    partition of the samples.
    """
    # Both lists are handled identically below, so combine them once.
    dynamic_features = list(esa_feature_list) + list(unigram_feature_list)

    # The argument is wrapped in a tuple on purpose: the former
    # ``"..." % a + b`` was parsed as ``("..." % a) + b`` and tried to add a
    # list to a string.
    print("dynamic predict cross val mit %s" % (dynamic_features,))

    vec = DictVectorizer()

    # This first vectorisation only feeds ``indexable``/``check_cv``; the
    # matrix actually used for fitting is rebuilt inside the fold loop.
    X = vec.fit_transform(fv).toarray()
    X, y = cross_validation.indexable(X, y)
    cv = cross_validation.check_cv(cv, X, y, classifier=cross_validation.is_classifier(estimator))

    preds_blocks = []
    for cross_val_step, (train, test) in enumerate(cv):
        fold_vec = dynamic_X[cross_val_step]  # dynamic feature data of this fold
        fv_copy = copy.deepcopy(fv)

        # Rebuild X in every step: merge the fold-specific features into the
        # i-th feature dict. The per-feature lookup is done once per fold;
        # each dict still receives its updates in list order.
        for feature in dynamic_features:
            per_sample = fold_vec[find_index_for_dynamic_feature(feature)]
            for i, feature_dict in enumerate(fv_copy):
                feature_dict.update(per_sample[i])

        X = vec.fit_transform(fv_copy).toarray()

        preds_blocks.append(cross_validation._fit_and_predict(cross_validation.clone(estimator), X, y,
                                                              train, test, verbose,
                                                              fit_params))

    preds = [p for p, _ in preds_blocks]
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not cross_validation._check_is_partition(locs, cross_validation._num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')
    # inv_locs[j] is the position, within the concatenated fold output, of
    # the prediction for sample j.
    inv_locs = np.empty(len(locs), dtype=int)
    inv_locs[locs] = np.arange(len(locs))

    # Check for sparse predictions
    if sp.issparse(preds[0]):
        preds = sp.vstack(preds, format=preds[0].format)
    else:
        preds = np.concatenate(preds)
    return preds[inv_locs]
    def _grid_search_params_iter(self, train_X, train_y):
        """Yield one work item per (inner fold, parameter setting) pair.

        ``self.inner_cv`` is either called with ``(train_X, train_y)`` to
        build the inner splitter, or passed through ``_check_cv``. Each item
        is a tuple ``(fold_number, train_index, test_index, parameters)``
        where ``fold_number`` starts at 1.
        """
        splitter = self.inner_cv
        if not callable(splitter):
            folds = _check_cv(splitter, train_X, train_y,
                              classifier=is_classifier(self.estimator))
        else:
            folds = splitter(train_X, train_y)

        grid = ParameterGrid(self.param_grid)
        LOG.info("Performing grid search over %d configurations" % len(grid))

        for fold_number, (train_index, test_index) in enumerate(folds, 1):
            for parameters in grid:
                yield fold_number, train_index, test_index, parameters
예제 #6
0
    def fit(self, X, y):
        """Validate the inputs and run this process's part of the search.

        ``X``/``y`` are checked with ``check_X_y`` (non-finite values are
        allowed), the parameter grid is checked, and the splitter and scorer
        are resolved. The process with ``comm_rank == 0`` then runs
        ``_fit_master``; every other process runs ``_fit_slave``.

        Returns ``self``.
        """
        X, y = check_X_y(X, y, force_all_finite=False,
                         multi_output=self.multi_output)
        _check_param_grid(self.param_grid)

        folds = _check_cv(self.cv, X, y,
                          classifier=is_classifier(self.estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if comm_rank != 0:
            self._fit_slave()
        else:
            self._fit_master(X, y, folds)
        return self
예제 #7
0
    def fit(self, X, y):
        """Validate the inputs and run this process's part of the search.

        ``X``/``y`` are checked with ``check_X_y`` (CSR sparse input and
        non-finite values are accepted), the parameter grid is checked, and
        the splitter and scorer are resolved. When the module-level
        ``master`` flag is set, the communicator size and the number of
        folds are logged and ``_fit_master`` is run; otherwise
        ``_fit_slave`` is run.

        Returns ``self``.
        """
        if master:
            LOG.info("comm_size:" + str(comm_size))

        X, y = check_X_y(
            X, y,
            force_all_finite=False,
            multi_output=self.multi_output,
            accept_sparse='csr')
        _check_param_grid(self.param_grid)

        folds = check_cv(self.cv, X, y,
                         classifier=is_classifier(self.estimator))
        if master:
            LOG.info("cv length:" + str(len(folds)))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if not master:
            self._fit_slave()
        else:
            self._fit_master(X, y, folds)
        return self
예제 #8
0
def dynamic_cross_val_score(estimator, fv, esa_feature_list, unigram_feature_list, dynamic_X, y=None, scoring=None, cv=None,
                verbose=0, fit_params=None):
    """Cross-validated scoring with features that are rebuilt per fold.

    ``fv`` is a sequence of feature dicts (one per sample). For fold number
    ``k`` every dict of a deep copy of ``fv`` is updated with the entries
    ``dynamic_X[k][find_index_for_dynamic_feature(feature)][i]`` for each
    feature named in ``esa_feature_list`` and ``unigram_feature_list``; the
    copy is then vectorised with a ``DictVectorizer`` and passed to
    ``cross_validation._fit_and_score`` on a clone of ``estimator``.

    Returns a NumPy array with the first element of each fold's
    ``_fit_and_score`` result.
    """
    # Both lists are handled identically below, so combine them once.
    dynamic_features = list(esa_feature_list) + list(unigram_feature_list)

    # The argument is wrapped in a tuple on purpose: the former
    # ``"..." % a + b`` was parsed as ``("..." % a) + b`` and tried to add a
    # list to a string.
    print("dynamic cross val mit %s" % (dynamic_features,))

    vec = DictVectorizer()

    # This first vectorisation only feeds ``indexable``/``check_cv``; the
    # matrix actually used for fitting is rebuilt inside the fold loop.
    X = vec.fit_transform(fv).toarray()
    X, y = cross_validation.indexable(X, y)

    cv = cross_validation.check_cv(cv, X, y, classifier=cross_validation.is_classifier(estimator))
    scorer = cross_validation.check_scoring(estimator, scoring=scoring)

    scores = []
    for cross_val_step, (train, test) in enumerate(cv):
        fold_vec = dynamic_X[cross_val_step]  # dynamic feature data of this fold
        fv_copy = copy.deepcopy(fv)

        # Rebuild X in every step: merge the fold-specific features into the
        # i-th feature dict. The per-feature lookup is done once per fold;
        # each dict still receives its updates in list order.
        for feature in dynamic_features:
            per_sample = fold_vec[find_index_for_dynamic_feature(feature)]
            for i, feature_dict in enumerate(fv_copy):
                feature_dict.update(per_sample[i])

        X = vec.fit_transform(fv_copy).toarray()

        scores.append(cross_validation._fit_and_score(cross_validation.clone(estimator), X, y, scorer,
                                                      train, test, verbose, None, fit_params))

    # Keep only the first column of the per-fold results.
    return np.array(scores)[:, 0]
예제 #9
0
def cross_val_score(estimator, X, y=None, score_func=None, cv=None, n_jobs=-1,
                    verbose=0, as_dvalues=False):
    """Score an estimator on every cross-validation fold.

    Stand-in for :func:`sklearn.cross_validation.cross_val_score` that
    additionally forwards ``as_dvalues`` to the per-fold worker
    ``_cross_val_score``, so that decision values can be computed.

    The inputs are validated with ``check_arrays``/``check_cv``; one job
    per ``(train, test)`` pair of the resulting splitter is dispatched
    through ``Parallel`` and the collected results are returned as a NumPy
    array.

    Raises ``TypeError`` when ``score_func`` is None and ``estimator`` has
    no ``score`` attribute.
    """
    X, y = check_arrays(X, y, sparse_format='csr')
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if score_func is None and not hasattr(estimator, 'score'):
        raise TypeError(
            "If no score_func is specified, the estimator passed "
            "should have a 'score' method. The estimator %s "
            "does not." % estimator)

    run_parallel = Parallel(n_jobs=n_jobs, verbose=verbose)
    # Every fold works on its own clone of the estimator: this keeps the
    # folds independent of each other and the job arguments pickle-able.
    fold_jobs = (
        delayed(_cross_val_score)(clone(estimator), X, y, score_func,
                                  train_idx, test_idx, verbose, as_dvalues)
        for train_idx, test_idx in cv)
    fold_results = run_parallel(fold_jobs)
    return np.array(fold_results)
예제 #10
0
print "RF+Logit Accuracy: %0.2f (+/- %0.2f)" % (bs_scores.mean(), bs_scores.std() / 2)
bse_scores = Bootstrap_cv(extra_forest, logit, train_data[0::,1::], train_data[0::,0], score_func=precision_score, cv=10, ratio=.8)
print "EF+Logit Accuracy: %0.2f (+/- %0.2f)" % (bse_scores.mean(), bse_scores.std() / 2)
"""
# Score an extra_forest -> logit pipeline on bootstrap resamples of train_data.
# Column 0 of train_data is used as the target, the remaining columns as
# the features.
print 'Predicting'
score = []  # one est.score(...) value per bootstrap iteration
ratio = .2  # only referenced by the commented-out blended prediction below
estimators = 20  # passed as n_bootstraps
train_size = .7  # passed as train_size of the Bootstrap iterator
#output = ratio*forest.predict(test_data) + (1-ratio)*logit.predict(test_data)
#output = extra_forest.predict(test_data)


# Get bootstrapped data: a seeded Bootstrap iterator over the row count,
# passed through check_cv together with the feature/target slices.
bs = cross_validation.Bootstrap(train_data.shape[0], n_bootstraps=estimators, train_size=train_size, random_state=0)
cv = cross_validation.check_cv(bs, train_data[0::,1::], train_data[0::,0], classifier=cross_validation.is_classifier(extra_forest))
for train, test in cv:
  # Create training data (the same slices on every iteration; only the
  # train/test index arrays change)
  X = train_data[0::,1::]
  y = train_data[0::,0]
  # Create estimator: fresh clones, so fitting does not touch the
  # module-level extra_forest and logit objects
  ef = cross_validation.clone(extra_forest)
  lgi = cross_validation.clone(logit)
  est = Pipeline([('ef', ef), ('logit', lgi)])
  est.fit(X[train], y[train])
  #print est.feature_importances_
  # Evaluate on the rows selected by this iteration's test indices
  score.append(est.score(X[test], y[test]))

# Format output: turn the list of per-iteration scores into a NumPy array
score = np.array(score)
    def fit(self, X, y, fit_params=None, predict_params=None, X_test=None, y_test=None):
        """Do nested cross-validation.

        If ``X_test`` and ``y_test`` are not provided, nested cross-validation using
        ``X`` and ``y`` is performed, i.e., data is first split into *K* folds, where
        *K-1* folds are used for training and hyper-parameter selection and the
        remaining fold for testing. The training portion is again split into *T* folds
        to perform a grid-search over hyper-parameters. The parameters that achieved the
        best average performance across the *T* inner cross-validation folds are selected.
        Using these parameters, a model is trained on the entire training data and applied
        to the *K*-th testing fold.

        If ``X_test`` and ``y_test`` are provided, a regular cross-validation is performed on
        ``X`` and ``y`` to determine hyper-parameters as for the inner cross-validation above.
        Using the best performing parameters, a model is trained on all of ``X`` and ``y`` and
        applied to ``X_test`` and ``y_test`` for testing.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Feature matrix.

        y : structured array, shape = [n_samples]
            A structured array containing the binary event indicator
            as first field, and time of event or time of censoring as
            second field. An array without named fields is validated
            with ``check_X_y`` instead.

        fit_params : dict
            Additional arguments passed to the fit method.

        predict_params : dict
            Additional arguments passed to the predict method.

        X_test : array-like, shape = [n_test_samples, n_features]
            Hold-out data to perform testing on.

        y_test : array-like or sequence, shape = [n_test_samples]
            Target values of hold-out test data.

        Returns
        -------
        self
        """
        # Builtin ``bool`` replaces the ``numpy.bool`` alias, which NumPy
        # deprecated in 1.20 and removed in 1.24; the resulting dtype is the same.
        survival_dtype = [('event', bool), ('time', numpy.float64)]

        if y.dtype.names is None:
            X, y = check_X_y(X, y)
        else:
            X, event, time = check_arrays_survival(X, y, force_all_finite=False)
            y = numpy.fromiter(zip(event, time), dtype=survival_dtype)

        if X_test is not None:
            X_test, event_test, time_test = check_arrays_survival(X_test, y_test, force_all_finite=False)
            y_test = numpy.fromiter(zip(event_test, time_test), dtype=survival_dtype)

        cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        self._dview, self._lview = self._init_cluster()
        try:
            if X_test is None:
                self._fit(X, y, cv, fit_params, predict_params)
            else:
                self._fit_holdout(X, y, fit_params, predict_params, X_test, y_test)
        finally:
            # Drop the handles obtained from _init_cluster even when fitting
            # raises, so a failed run does not leave them on the instance.
            del self._dview
            del self._lview

        return self