def cross_val_score(estimator, X, y=None, score_func=None, cv=None, n_jobs=-1,
                    verbose=0, as_dvalues=False):
    """Evaluate a score by cross-validation.

    Replacement of :func:`sklearn.cross_validation.cross_val_score`,
    used to support computation of decision values.
    """
    X, y = check_arrays(X, y, sparse_format='csr')
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    if score_func is None:
        if not hasattr(estimator, 'score'):
            raise TypeError(
                "If no score_func is specified, the estimator passed "
                "should have a 'score' method. The estimator %s "
                "does not." % estimator)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_cross_val_score)(clone(estimator), X, y, score_func,
                                  train, test, verbose, as_dvalues)
        for train, test in cv)
    return np.array(scores)
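# Usage sketch (added for illustration, not part of the original module):
# exercising the replacement cross_val_score above on toy data. It assumes
# the module-level helpers it relies on (check_arrays, check_cv,
# is_classifier, Parallel, delayed, clone, _cross_val_score) are in scope,
# as they are in the original module.
if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from sklearn.svm import SVC

    iris = load_iris()
    fold_scores = cross_val_score(SVC(kernel='linear'), iris.data,
                                  iris.target, cv=5)
    print "mean accuracy: %0.2f" % fold_scores.mean()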
def Bootstrap_cv(estimator1, estimator2, X, y, score_func, cv=None, n_jobs=1,
                 verbose=0, ratio=.5):
    X, y = cross_validation.check_arrays(X, y, sparse_format='csr')
    cv = cross_validation.check_cv(
        cv, X, y, classifier=cross_validation.is_classifier(estimator1))
    if score_func is None:
        if not hasattr(estimator1, 'score') or \
                not hasattr(estimator2, 'score'):
            raise TypeError(
                "If no score_func is specified, both estimators passed "
                "should have a 'score' method. At least one of %s and %s "
                "does not." % (estimator1, estimator2))
    # We clone the estimators to make sure that all the folds are
    # independent, and that they are pickle-able.
    scores = cross_validation.Parallel(n_jobs=n_jobs, verbose=verbose)(
        cross_validation.delayed(dual_cross_val_score)(
            cross_validation.clone(estimator1),
            cross_validation.clone(estimator2),
            X, y, score_func, train, test, verbose, ratio)
        for train, test in cv)
    return np.array(scores)
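# Usage sketch (added): scoring a two-stage forest -> logistic-regression
# combination with Bootstrap_cv, mirroring the calls in the evaluation
# script further below. Toy data stands in for train_data; the function's
# module-level dependencies (cross_validation, dual_cross_val_score, np)
# are assumed importable as in the original module.
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score

X_toy, y_toy = make_classification(n_samples=200, random_state=0)
bs_scores = Bootstrap_cv(ExtraTreesClassifier(n_estimators=20),
                         LogisticRegression(), X_toy, y_toy,
                         score_func=precision_score, cv=10, ratio=.8)
print "precision: %0.2f (+/- %0.2f)" % (bs_scores.mean(),
                                        bs_scores.std() / 2)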
def _grid_search(self, train_X, train_y):
    if callable(self.inner_cv):
        inner_cv = self.inner_cv(train_X, train_y)
    else:
        inner_cv = check_cv(self.inner_cv, train_X, train_y,
                            classifier=is_classifier(self.estimator))

    master = MPIGridSearchCVMaster(self.param_grid, inner_cv,
                                   self.estimator, self.scorer_,
                                   self.fit_params)
    return master.run(train_X, train_y)
def dynamic_cross_val_predict(estimator, fv, esa_feature_list,
                              unigram_feature_list, dynamic_X, y=None,
                              cv=None, verbose=0, fit_params=None):
    print "dynamic cross-val predict with %s" % (esa_feature_list +
                                                 unigram_feature_list)

    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X = tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)

    cv = cross_validation.check_cv(
        cv, X, y, classifier=cross_validation.is_classifier(estimator))

    preds_blocks = []

    cross_val_step = 0
    for train, test in cv:
        fv_copy = copy.deepcopy(fv)

        # Rebuild X from scratch in every step.
        for i in range(0, len(fv)):  # each i refers to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points at esa_vec

            for feature in esa_feature_list:
                # Update the i-th feature dict with the ESA feature.
                feature_dict.update(
                    dynamic_vec[find_index_for_dynamic_feature(feature)][i])

            for feature in unigram_feature_list:
                # Update the i-th feature dict with the unigram feature.
                feature_dict.update(
                    dynamic_vec[find_index_for_dynamic_feature(feature)][i])

        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        preds_blocks.append(cross_validation._fit_and_predict(
            cross_validation.clone(estimator), X, y, train, test,
            verbose, fit_params))
        cross_val_step += 1

    preds = [p for p, _ in preds_blocks]
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not cross_validation._check_is_partition(
            locs, cross_validation._num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')
    inv_locs = np.empty(len(locs), dtype=int)
    inv_locs[locs] = np.arange(len(locs))

    # Check for sparse predictions.
    if sp.issparse(preds[0]):
        preds = sp.vstack(preds, format=preds[0].format)
    else:
        preds = np.concatenate(preds)
    return preds[inv_locs]
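# Illustration (added): the loop above mutates the per-sample feature dicts
# and then re-vectorizes them, so the column layout may change from fold to
# fold. DictVectorizer turns a list of {feature: value} dicts into a matrix
# with one column per distinct feature name, e.g.:
#
# >>> from sklearn.feature_extraction import DictVectorizer
# >>> DictVectorizer().fit_transform([{'a': 1.0},
# ...                                 {'a': 2.0, 'b': 1.0}]).toarray()
# array([[ 1.,  0.],
#        [ 2.,  1.]])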
def _grid_search_params_iter(self, train_X, train_y):
    if callable(self.inner_cv):
        inner_cv = self.inner_cv(train_X, train_y)
    else:
        inner_cv = _check_cv(self.inner_cv, train_X, train_y,
                             classifier=is_classifier(self.estimator))

    param_iter = ParameterGrid(self.param_grid)
    LOG.info("Performing grid search over %d configurations"
             % len(param_iter))

    for fold_id, (train_index, test_index) in enumerate(inner_cv):
        for parameters in param_iter:
            yield fold_id + 1, train_index, test_index, parameters
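# Illustration (added): ParameterGrid expands a dict of parameter lists
# into the Cartesian product of settings that the generator above yields
# once per inner fold, e.g.:
#
# >>> from sklearn.grid_search import ParameterGrid
# >>> list(ParameterGrid({'C': [1, 10], 'kernel': ['linear', 'rbf']}))
# [{'C': 1, 'kernel': 'linear'}, {'C': 1, 'kernel': 'rbf'},
#  {'C': 10, 'kernel': 'linear'}, {'C': 10, 'kernel': 'rbf'}]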
def fit(self, X, y):
    X, y = check_X_y(X, y, force_all_finite=False,
                     multi_output=self.multi_output)
    _check_param_grid(self.param_grid)

    cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    if comm_rank == 0:
        self._fit_master(X, y, cv)
    else:
        self._fit_slave()

    return self
def fit(self, X, y):
    if master:
        LOG.info("comm_size:" + str(comm_size))

    X, y = check_X_y(X, y, force_all_finite=False,
                     multi_output=self.multi_output, accept_sparse='csr')
    _check_param_grid(self.param_grid)

    cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
    if master:
        LOG.info("cv length:" + str(len(cv)))

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    if master:
        self._fit_master(X, y, cv)
    else:
        self._fit_slave()

    return self
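# Usage sketch (added, with assumed names): the two fit() methods above
# implement an MPI master/worker split, so every rank must call fit() on
# identical data; rank 0 coordinates the search (_fit_master) while the
# remaining ranks evaluate configurations (_fit_slave). Assuming the
# surrounding class is called MPIGridSearchCV (hypothetical name), a run
# looks like:
#
#     # launched as: mpirun -n 8 python search.py
#     search = MPIGridSearchCV(SVC(), param_grid={'C': [1, 10, 100]})
#     search.fit(X, y)   # collective call: executed on all ranks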
def dynamic_cross_val_score(estimator, fv, esa_feature_list,
                            unigram_feature_list, dynamic_X, y=None,
                            scoring=None, cv=None, verbose=0,
                            fit_params=None):
    print "dynamic cross-val scoring with %s" % (esa_feature_list +
                                                 unigram_feature_list)

    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X = tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)

    cv = cross_validation.check_cv(
        cv, X, y, classifier=cross_validation.is_classifier(estimator))
    scorer = cross_validation.check_scoring(estimator, scoring=scoring)

    scores = []
    cross_val_step = 0
    for train, test in cv:
        fv_copy = copy.deepcopy(fv)

        # Rebuild X from scratch in every step.
        for i in range(0, len(fv)):  # each i refers to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points at esa_vec

            for feature in esa_feature_list:
                # Update the i-th feature dict with the ESA feature.
                feature_dict.update(
                    dynamic_vec[find_index_for_dynamic_feature(feature)][i])

            for feature in unigram_feature_list:
                # Update the i-th feature dict with the unigram feature.
                feature_dict.update(
                    dynamic_vec[find_index_for_dynamic_feature(feature)][i])

        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        scores.append(cross_validation._fit_and_score(
            cross_validation.clone(estimator), X, y, scorer, train, test,
            verbose, None, fit_params))
        cross_val_step += 1

    return np.array(scores)[:, 0]
print "RF+Logit Accuracy: %0.2f (+/- %0.2f)" % (bs_scores.mean(), bs_scores.std() / 2) bse_scores = Bootstrap_cv(extra_forest, logit, train_data[0::,1::], train_data[0::,0], score_func=precision_score, cv=10, ratio=.8) print "EF+Logit Accuracy: %0.2f (+/- %0.2f)" % (bse_scores.mean(), bse_scores.std() / 2) """ print 'Predicting' score = [] ratio = .2 estimators = 20 train_size = .7 #output = ratio*forest.predict(test_data) + (1-ratio)*logit.predict(test_data) #output = extra_forest.predict(test_data) #Get bootstrapped data bs = cross_validation.Bootstrap(train_data.shape[0], n_bootstraps=estimators, train_size=train_size, random_state=0) cv = cross_validation.check_cv(bs, train_data[0::,1::], train_data[0::,0], classifier=cross_validation.is_classifier(extra_forest)) for train, test in cv: #Create training data X = train_data[0::,1::] y = train_data[0::,0] #Create estimator ef = cross_validation.clone(extra_forest) lgi = cross_validation.clone(logit) est = Pipeline([('ef', ef), ('logit', lgi)]) est.fit(X[train], y[train]) #print est.feature_importances_ score.append(est.score(X[test], y[test])) #Format output score = np.array(score)
def fit(self, X, y, fit_params=None, predict_params=None,
        X_test=None, y_test=None):
    """Do nested cross-validation.

    If ``X_test`` and ``y_test`` are not provided, nested
    cross-validation using ``X`` and ``y`` is performed, i.e., data is
    first split into *K* folds, where *K-1* folds are used for training
    and hyper-parameter selection and the remaining fold for testing.
    The training portion is again split into *T* folds to perform a
    grid-search over hyper-parameters. The parameters that achieved the
    best average performance across the *T* inner cross-validation folds
    are selected. Using these parameters, a model is trained on the
    entire training data and applied to the *K*-th testing fold.

    If ``X_test`` and ``y_test`` are provided, a regular
    cross-validation is performed on ``X`` and ``y`` to determine
    hyper-parameters as for the inner cross-validation above. Using the
    best performing parameters, a model is trained on all of ``X`` and
    ``y`` and applied to ``X_test`` and ``y_test`` for testing.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Feature matrix.

    y : structured array, shape = [n_samples]
        A structured array containing the binary event indicator as
        first field, and time of event or time of censoring as second
        field.

    fit_params : dict
        Additional arguments passed to the fit method.

    predict_params : dict
        Additional arguments passed to the predict method.

    X_test : array-like, shape = [n_test_samples, n_features]
        Hold-out data to perform testing on.

    y_test : array-like or sequence, shape = [n_test_samples]
        Target values of hold-out test data.

    Returns
    -------
    self
    """
    if y.dtype.names is None:
        X, y = check_X_y(X, y)
    else:
        X, event, time = check_arrays_survival(X, y,
                                               force_all_finite=False)
        y = numpy.fromiter(zip(event, time),
                           dtype=[('event', numpy.bool),
                                  ('time', numpy.float64)])

    if X_test is not None:
        X_test, event_test, time_test = check_arrays_survival(
            X_test, y_test, force_all_finite=False)
        y_test = numpy.fromiter(zip(event_test, time_test),
                                dtype=[('event', numpy.bool),
                                       ('time', numpy.float64)])

    cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    self._dview, self._lview = self._init_cluster()
    if X_test is None:
        self._fit(X, y, cv, fit_params, predict_params)
    else:
        self._fit_holdout(X, y, fit_params, predict_params, X_test, y_test)

    del self._dview
    del self._lview

    return self
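# Usage sketch (added, names assumed): assuming the surrounding class is a
# nested grid-search estimator, here called NestedGridSearchCV
# (hypothetical name), the two modes of fit() described in the docstring
# look like:
#
#     nested = NestedGridSearchCV(my_survival_model,
#                                 param_grid={'alpha': [0.1, 1.0]},
#                                 cv=5)
#     nested.fit(X, y)                            # full nested CV
#     nested.fit(X, y, X_test=X_h, y_test=y_h)    # grid-search on X/y,
#                                                 # final test on hold-out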