def _fit_X_y(self, X_clf, y_clf, X_cal, y_cal):
    clf = clone(self.base_estimator)

    if isinstance(clf, RegressorMixin):
        clf = as_classifier(clf)

    clf.fit(X_clf, y_clf)

    if self.calibration is None:
        return clf, None, None

    else:
        if self.calibration == "kde":
            cal_num = KernelDensity()
            cal_den = KernelDensity()

        elif self.calibration == "histogram":
            cal_num = Histogram(bins=100, range=[(0.0, 1.0)])
            cal_den = Histogram(bins=100, range=[(0.0, 1.0)])

        else:
            cal_num = clone(self.calibration)
            cal_den = clone(self.calibration)

        X_num = clf.predict_proba(X_cal[y_cal == 0])[:, 0]
        X_den = clf.predict_proba(X_cal[y_cal == 1])[:, 0]

        cal_num.fit(X_num.reshape(-1, 1))
        cal_den.fit(X_den.reshape(-1, 1))

        return clf, cal_num, cal_den
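# A minimal, self-contained sketch of how two calibrators like the ones fitted
# above are typically combined: a density ratio p(s | y=0) / p(s | y=1) over
# the classifier score s. The Histogram class above comes from an external
# package, so sklearn's KernelDensity is used here instead; score_samples
# returns log densities, hence the exp of the difference. The beta-sampled
# scores are purely illustrative.
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
s0 = rng.beta(2, 5, size=500).reshape(-1, 1)  # hypothetical scores, class 0
s1 = rng.beta(5, 2, size=500).reshape(-1, 1)  # hypothetical scores, class 1

cal_num = KernelDensity().fit(s0)
cal_den = KernelDensity().fit(s1)

s = np.linspace(0.05, 0.95, 5).reshape(-1, 1)
ratio = np.exp(cal_num.score_samples(s) - cal_den.score_samples(s))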
def pool_entropy_h(X, y, candidate_mask, train_mask, classifier, n_candidates,
                   pool_n, n_jobs=-1, **kwargs):
    """ Return the candidates that will minimise the expected entropy of the
        predictions.

        Parameters
        ----------
        X : array
            The feature matrix of all the data points.

        pool_n : int
            The size of the sample pool used in estimating the entropy.

        n_jobs : int
            The number of parallel jobs (-1 to use all cores).

        Returns
        -------
        best_candidates : array
            The indices of the best candidates.
    """
    classes = classifier.classes_  # sorted lexicographically
    n_classes = len(classes)
    candidate_size = np.sum(train_mask)
    n_features = X.shape[1]
    entropy = np.empty(len(candidate_mask))
    entropy[:] = np.inf

    # the probabilities used to calculate the expected value of the pool
    probs = classifier.predict_proba(X[candidate_mask])

    # copy the classifier (avoid modifying the original classifier)
    classifier_plus = clone(classifier)

    # construct the sample pool (used to estimate the entropy)
    unlabelled_indices = np.where(~train_mask)[0]
    pool_indices = permutation(unlabelled_indices)[:pool_n]
    pool_mask = np.zeros(len(candidate_mask), dtype=bool)
    pool_mask[pool_indices] = True

    # let's look at each candidate
    candidate_indices = np.where(candidate_mask)[0]
    results = Parallel(n_jobs=n_jobs)(delayed(_parallel_entropy_estimate)(
        X, y.copy(), train_mask.copy(), pool_mask,
        clone(classifier_plus), classes, n_classes, probs, i, index)
        for i, index in enumerate(candidate_indices))

    indices, expected = zip(*results)
    indices, expected = np.asarray(indices), np.asarray(expected)
    assert not np.isnan(expected).any(), 'Some expected values are undefined.'
    entropy[indices] = expected

    # pick the candidates with the smallest expected entropy
    best_candidates = np.argsort(entropy)[:n_candidates]
    return best_candidates
def nn_embedding_translate(words=en_2_es.keys(), embedding1=en_embedding,
                           embedding2=es_embedding, constraint=es_2_en.keys(),
                           k=5, pre_transform=None, log=False):
    if pre_transform is not None:
        pre_transform_1 = clone(pre_transform)
        pre_transform_2 = clone(pre_transform)
        embedding1 = transform(embedding1, pre_transform_1)
        embedding2 = transform(embedding2, pre_transform_2)

    if constraint is not None:
        embedding2 = sub_embedding(embedding2, constraint)

    in_vocab_words = [word for word in words
                      if embedding1.normalize(word) is not None]
    if log:
        print "{} of {} words in vocab".format(len(in_vocab_words), len(words))

    output = {}
    for i, word in enumerate(in_vocab_words):
        if log and i % 100 == 0:
            print "{} of {} words".format(i, len(words))
        emb = embedding1.word_to_embedding(word)
        if emb is not None:
            trans = embedding2.words_closest_to_point(emb, k=k)
            trans = softmax(trans)
            output[word] = trans
    return output
def fit(self, X):
    param_grid = list(ParameterGrid(self.param_grid))
    n_folds = len(self.cv)
    n_grid = len(param_grid)
    scores = np.zeros((n_folds, n_grid), dtype=np.float64)

    for i, (X_tr, X_te) in enumerate(self.cv.split(X)):
        for j, params in enumerate(param_grid):
            estimator = clone(self.estimator)
            estimator.set_params(**params)
            estimator.fit(X_tr)
            scores[i, j] = estimator.score(X_te)

    # FIXME: handle higher is better as well.
    best = scores.mean(axis=0).argmin()
    self.best_params_ = param_grid[best]

    # Refit
    if self.refit:
        self.best_estimator_ = clone(self.estimator)
        self.best_estimator_.set_params(**self.best_params_)
        self.best_estimator_.fit(X)

    return self
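# The same clone/set_params grid loop, written as a free-standing sketch with
# sklearn's KernelDensity so it runs on its own. Note KernelDensity.score is a
# log-likelihood, so higher is better here (argmax), unlike the argmin above.
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.neighbors import KernelDensity

X = np.random.RandomState(0).randn(200, 1)
cv = KFold(n_splits=3)
param_grid = list(ParameterGrid({"bandwidth": [0.1, 0.5, 1.0]}))

scores = np.zeros((cv.get_n_splits(), len(param_grid)))
for i, (tr, te) in enumerate(cv.split(X)):
    for j, params in enumerate(param_grid):
        est = clone(KernelDensity()).set_params(**params)
        est.fit(X[tr])
        scores[i, j] = est.score(X[te])

print(param_grid[scores.mean(axis=0).argmax()])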
def test_missing_value_handling(est, func, support_sparse):
    # check that the preprocessing method lets NaN pass through
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    Xt = est.fit(X_train).transform(X_test)
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the function leads to the same results as the class
    Xt_class = est.transform(X_train)
    Xt_func = func(X_train, **est.get_params())
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)],
                    Xt_class[~np.isnan(Xt_class)])

    # check that the inverse transform keeps NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment we just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        Xt_col = est.transform(X_test[:, [i]])
        assert_array_equal(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all NaN
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(
                _get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan,
                               Xt_col[~np.isnan(Xt_col.squeeze())])

    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)

        Xt_dense = est_dense.fit(X_train).transform(X_test)
        Xt_inv_dense = est_dense.inverse_transform(Xt_dense)

        for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix,
                                   sparse.bsr_matrix, sparse.coo_matrix,
                                   sparse.dia_matrix, sparse.dok_matrix,
                                   sparse.lil_matrix):
            # check that the dense and sparse inputs lead to the same results
            Xt_sparse = (est_sparse.fit(sparse_constructor(X_train))
                         .transform(sparse_constructor(X_test)))
            assert_allclose(Xt_sparse.A, Xt_dense)
            Xt_inv_sparse = est_sparse.inverse_transform(Xt_sparse)
            assert_allclose(Xt_inv_sparse.A, Xt_inv_dense)
def test_sklearn_clone():
    tm._skip_if_no_sklearn()
    from sklearn.base import clone

    clf = xgb.XGBClassifier(n_jobs=2, nthread=3)
    clf.n_jobs = -1
    clone(clf)
def RunExp(StrModel: str, Param: str, FeaUsed: list, DataPath: str,
           Label: str, StrMeasure: str, std: bool = False, N: int = 0):
    Data = np.genfromtxt(DataPath + Label, delimiter=',', dtype=int)
    Data = Data[:, np.newaxis]

    for f in FeaUsed:
        T = np.genfromtxt(DataPath + Features[f], delimiter=',', dtype=float)
        if len(T.shape) < 2:
            T = T[:, np.newaxis]
        Data = np.concatenate((Data, T), axis=1)

    if N > 0:
        Data = Data[:N, :]

    Lbl = Data[:, 0]
    Fea = Data[:, 1:]

    if std:
        scaler = preprocessing.StandardScaler()
        Fea = scaler.fit_transform(Fea)

    Model = base.clone(Models[StrModel])
    SetParam(Model, Param)

    Model.fit(Fea, Lbl)
    Pred = Model.predict(Fea)
    st = Measures[StrMeasure](Lbl, Pred)

    sv = cross_validation.cross_val_score(
        base.clone(Models[StrModel]), Fea, Lbl,
        metrics.make_scorer(Measures[StrMeasure]), cv=5, n_jobs=5)

    return st, np.mean(sv)
def run_classifier(out_folder, trend_probs, referrers, y, train, test):
    F = referrers  # static features
    etree = create_grid_search('lr', n_jobs=1)

    y_pred = trend_probs[test].argmax(axis=1)
    save_results(out_folder, 'tl-base-lr', y_pred, y[test])

    aux = clone(etree)
    aux.fit(F[train], y[train])
    y_pred = aux.predict(F[test])
    save_results(out_folder, 'tree-feats', y_pred, y[test])

    aux = clone(etree)
    aux.fit(trend_probs[train], y[train])
    y_pred = aux.predict(trend_probs[test])
    save_results(out_folder, 'tree-probs', y_pred, y[test])

    C = np.hstack((F, trend_probs))
    aux = clone(etree)
    aux.fit(C[train], y[train])
    y_pred = aux.predict(C[test])
    save_results(out_folder, 'meta-combine', y_pred, y[test])

    # stack_clf = stacking.Stacking(3, [etree], 'tree')
    # stack_clf.fit(F[train], y[train], trend_probs[train])
    # y_pred = stack_clf.predict(F[test], trend_probs[test])
    # save_results(out_folder, 'meta-stack-tree', y_pred)

    stack_clf = stacking.Stacking(3, [etree], 'linear')
    stack_clf.fit(F[train], y[train], trend_probs[train])
    y_pred = stack_clf.predict(F[test], trend_probs[test])
    save_results(out_folder, 'meta-stack-linear', y_pred, y[test])
def _validate_estimator(self):
    "Private function to validate SMOTE and ENN objects"
    if self.smote is not None:
        if isinstance(self.smote, SMOTE):
            self.smote_ = clone(self.smote)
        else:
            raise ValueError('smote needs to be a SMOTE object. '
                             'Got {} instead.'.format(type(self.smote)))
    # Otherwise create a default SMOTE
    else:
        self.smote_ = SMOTE(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            ratio=self.ratio)

    if self.enn is not None:
        if isinstance(self.enn, EditedNearestNeighbours):
            self.enn_ = clone(self.enn)
        else:
            raise ValueError('enn needs to be an EditedNearestNeighbours.'
                             ' Got {} instead.'.format(type(self.enn)))
    # Otherwise create a default EditedNearestNeighbours
    else:
        self.enn_ = EditedNearestNeighbours(
            sampling_strategy='all', n_jobs=self.n_jobs)
def _fit_calibrators(self, df0, df1):
    df0 = df0.reshape(-1, 1)
    df1 = df1.reshape(-1, 1)

    if self.method == "kde":
        calibrator0 = KernelDensity()
        calibrator1 = KernelDensity()

    elif self.method == "histogram":
        eps = 0.05
        df_min = max(0, min(np.min(df0), np.min(df1)) - eps)
        df_max = min(1, max(np.max(df0), np.max(df1)) + eps)

        calibrator0 = Histogram(bins=10 + int(len(df0) ** (1. / 3.)),
                                range=[(df_min, df_max)],
                                interpolation="linear")
        calibrator1 = Histogram(bins=10 + int(len(df0) ** (1. / 3.)),
                                range=[(df_min, df_max)],
                                interpolation="linear")

    else:
        calibrator0 = clone(self.method)
        calibrator1 = clone(self.method)

    calibrator0.fit(df0)
    calibrator1.fit(df1)

    return calibrator0, calibrator1
def train(self,
          training_trackers,  # type: List[DialogueStateTracker]
          domain,  # type: Domain
          **kwargs  # type: Any
          ):
    # type: (...) -> Dict[Text, Any]
    training_data = self.featurize_for_training(training_trackers,
                                                domain,
                                                **kwargs)

    X, y = self._extract_training_data(training_data)
    model = self.model_architecture(**kwargs)
    score = None
    # Note: clone is called throughout to avoid mutating default arguments.
    self.label_encoder = clone(self.label_encoder).fit(y)
    Xt, yt = self._preprocess_data(X, y)

    if self.cv is None:
        model = clone(model).fit(Xt, yt)
    else:
        param_grid = self.param_grid or {}
        model, score = self._search_and_score(model, Xt, yt, param_grid)

    self.model = model
    logger.info("Done fitting sklearn policy model")
    if score is not None:
        logger.info("Cross validation score: {:.5f}".format(score))
def test_kernel_clone_after_set_params():
    # This test is to verify that using set_params does not
    # break clone on kernels.
    # This used to break because in kernels such as the RBF, non-trivial
    # logic that modified the length scale used to be in the constructor
    # See https://github.com/scikit-learn/scikit-learn/issues/6961
    # for more details.
    bounds = (1e-5, 1e5)
    for kernel in kernels:
        kernel_cloned = clone(kernel)
        params = kernel.get_params()
        # RationalQuadratic kernel is isotropic.
        isotropic_kernels = (ExpSineSquared, RationalQuadratic)
        if 'length_scale' in params and not isinstance(kernel,
                                                       isotropic_kernels):
            length_scale = params['length_scale']
            if np.iterable(length_scale):
                params['length_scale'] = length_scale[0]
                params['length_scale_bounds'] = bounds
            else:
                params['length_scale'] = [length_scale] * 2
                params['length_scale_bounds'] = bounds * 2
            kernel_cloned.set_params(**params)
            kernel_cloned_clone = clone(kernel_cloned)
            assert_equal(kernel_cloned_clone.get_params(),
                         kernel_cloned.get_params())
            assert_not_equal(id(kernel_cloned_clone), id(kernel_cloned))
            yield (check_hyperparameters_equal, kernel_cloned,
                   kernel_cloned_clone)
def RunExp(StrModel: str, Param: str, FeaUsed: list, DataPath: str,
           Label: str, std: bool = False, N: int = 0):
    Data = np.genfromtxt(DataPath + Label, delimiter=',', dtype=int)
    Data = Data[:, np.newaxis]

    for f in FeaUsed:
        T = np.genfromtxt(DataPath + Features[f], delimiter=',', dtype=float)
        if len(T.shape) < 2:
            T = T[:, np.newaxis]
        Data = np.concatenate((Data, T), axis=1)

    if N > 0:
        Data = Data[:N, :]

    Lbl = Data[:, 0]
    Fea = Data[:, 1:]

    if std:
        scaler = preprocessing.StandardScaler()
        Fea = scaler.fit_transform(Fea)

    Model = base.clone(Models[StrModel])
    SetParam(Model, Param)

    Model.fit(Fea, Lbl)
    Pred = Model.predict_proba(Fea)[:, 1]
    st = metrics.precision_recall_curve(Lbl, Pred)

    Folds = cross_validation.KFold(Fea.shape[0], n_folds=5)
    for train, valid in Folds:
        Model = base.clone(Models[StrModel])
        SetParam(Model, Param)
        Model.fit(Fea[train], Lbl[train])
        Pred[valid] = Model.predict_proba(Fea[valid])[:, 1]
    sv = metrics.precision_recall_curve(Lbl, Pred)

    return st, sv
def _validate_estimator(self):
    "Private function to validate SMOTE and TomekLinks objects"
    if self.smote is not None:
        if isinstance(self.smote, SMOTE):
            self.smote_ = clone(self.smote)
        else:
            raise ValueError('smote needs to be a SMOTE object. '
                             'Got {} instead.'.format(type(self.smote)))
    # Otherwise create a default SMOTE
    else:
        self.smote_ = SMOTE(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
            ratio=self.ratio)

    if self.tomek is not None:
        if isinstance(self.tomek, TomekLinks):
            self.tomek_ = clone(self.tomek)
        else:
            raise ValueError('tomek needs to be a TomekLinks object. '
                             'Got {} instead.'.format(type(self.tomek)))
    # Otherwise create a default TomekLinks
    else:
        self.tomek_ = TomekLinks(sampling_strategy='all')
def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check if n_estimators_ increases monotonically with n_iter_no_change
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
def make_classifiers(method, balanced, labels, selectors=None, columns=None,
                     random_state=None):
    estimators = {}
    class_weight = None
    if balanced:
        class_weight = 'balanced'

    # Make appropriate delegation
    if 'lr' in method:
        estimator = LogisticRegression(n_jobs=1)
    elif 'svm' in method:
        estimator = SVC(probability=False)
    elif 'rf' in method:
        estimator = RandomForestClassifier(n_jobs=1)
    else:
        raise ValueError("Not implemented for method {}".format(method))

    estimator = estimator.set_params(**{'class_weight': class_weight,
                                        'random_state': random_state})
    if hasattr(estimator, 'n_jobs'):
        estimator.set_params(**{'n_jobs': 1})

    if 'bagged' in method:
        for l in labels:
            named_estimators = zip(columns,
                                   [clone(estimator) for _ in columns])
            weights = [1] * len(columns)
            estimators[l] = HybridFeatureVotingClassifier(
                named_estimators, selectors, voting='soft', weights=weights,
                n_jobs=4
            )
    else:
        for l in labels:
            estimators[l] = clone(estimator)
    return estimators
def fit(self, X, y):
    """Fit the shape function of each feature with the backfitting algorithm.

    Please note that the shape functions are centered (not reduced).

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The input samples.

    Returns
    -------
    self : object
        The Generalized Additive Model with the fitted shape functions
    """
    n_samples, n_features = X.shape

    if not isinstance(self.smoothers, list):
        self.smoothers_ = [clone(self.smoothers) for i in range(n_features)]
    else:
        self.smoothers_ = [clone(self.smoothers[j]) for j in range(n_features)]
    self.ridge = RidgeCV(alphas=[self.ridge_alphas] * len(self.smoothers_),
                         fit_intercept=False)

    self.y_mean_ = np.mean(y)
    self.rmse_ = []  # list to store the train error over the iterations

    y -= y.mean()
    # array to store the shape functions for re-use in the next iteration
    temp = np.zeros(shape=(n_samples, n_features))
    shape_functions = np.zeros(shape=(n_samples, n_features))

    for i in range(self.max_iter):
        for j in range(n_features):
            # select all the columns except the j-th one
            idx = list(set(np.arange(0, n_features, 1)) - set([j]))

            # compute the residuals of the previous iteration
            residuals = y.reshape((n_samples, 1)) - \
                temp[:, idx].sum(axis=1, keepdims=True).reshape((n_samples, 1))
            residuals -= residuals.mean()

            # reshape to avoid a deprecation warning
            self.smoothers_[j].fit(X[:, j:j + 1],
                                   residuals.reshape((n_samples,)))
            shape_functions[:, j] = self.smoothers_[j].predict(X[:, j:j + 1])
            shape_functions[:, j] -= shape_functions[:, j].mean()

        # Ridge regression on top of the shape functions in order to
        # 're-scale' each shape function
        self.ridge.fit(shape_functions, y)
        coef = self.ridge.coef_
        shape_functions *= coef

        y_pred = shape_functions.sum(axis=1)
        y_pred -= y_pred.mean()
        self.rmse_.append(met.mean_squared_error(y_pred, y))

        temp = shape_functions.copy()

    return self
def pool_variance_h(X, y, candidate_mask, train_mask, classifier,
                    n_candidates, pool_n, C, n_jobs=-1, random_state=None,
                    **kwargs):
    """ Return the candidates that will minimise the expected variance of the
        predictions.

        Parameters
        ----------
        X : array
            The feature matrix of all the data points.

        C : float
            The regularisation parameter of Logistic Regression.

        pool_n : int
            The size of the sample which will be used to estimate the
            variance/entropy.

        n_jobs : int
            The number of parallel jobs (-1 to use all cores).

        Returns
        -------
        best_candidates : array
            The indices of the best candidates.
    """
    classes = classifier.classes_  # sorted lexicographically
    n_classes = len(classes)
    n_features = X.shape[1]
    variance = np.empty(len(candidate_mask))
    variance[:] = np.inf
    rng = RandomState(random_state)

    # the probabilities used to calculate the expected value of the pool
    probs = classifier.predict_proba(X[candidate_mask])

    # copy the classifier (avoid modifying the original classifier)
    classifier_plus = clone(classifier)

    # construct the sample pool (used to estimate the variance)
    unlabelled_indices = np.where(~train_mask)[0]
    pool_indices = rng.permutation(unlabelled_indices)[:pool_n]
    pool_mask = np.zeros(len(candidate_mask), dtype=bool)
    pool_mask[pool_indices] = True

    # let's look at each candidate
    candidate_indices = np.where(candidate_mask)[0]
    results = Parallel(n_jobs=n_jobs)(delayed(_parallel_variance_estimate)(
        X, y.copy(), train_mask.copy(), pool_mask,
        clone(classifier_plus), classes, n_classes, probs, i, index, C)
        for i, index in enumerate(candidate_indices))

    indices, expected = zip(*results)
    indices, expected = np.asarray(indices), np.asarray(expected)
    assert not np.isnan(expected).any(), 'Some expected values are undefined.'
    variance[indices] = expected

    # pick the candidates with the smallest expected variance
    best_candidates = np.argsort(variance)[:n_candidates]
    return best_candidates
def test_all_estimators():
    # Test that estimators are default-constructible, clonable
    # and have working repr.
    estimators = all_estimators(include_meta_estimators=True)
    classifier = LDA()

    for name, Estimator in estimators:
        # some can just not be sensibly default constructed
        if name in dont_test:
            continue
        # test default-constructibility
        # get rid of deprecation warnings
        with warnings.catch_warnings(record=True):
            if name in meta_estimators:
                estimator = Estimator(classifier)
            else:
                estimator = Estimator()
            # test cloning
            clone(estimator)
            # test __repr__
            repr(estimator)
            # test that set_params returns self
            assert_true(isinstance(estimator.set_params(), Estimator))

            # test if init does nothing but set parameters
            # this is important for grid_search etc.
            # We get the default parameters from init and then
            # compare these against the actual values of the attributes.

            # this comes from getattr. Gets rid of deprecation decorator.
            init = getattr(estimator.__init__, 'deprecated_original',
                           estimator.__init__)
            try:
                args, varargs, kws, defaults = inspect.getargspec(init)
            except TypeError:
                # init is not a python function.
                # true for mixins
                continue
            params = estimator.get_params()
            if name in meta_estimators:
                # they need a non-default argument
                args = args[2:]
            else:
                args = args[1:]
            if args:
                # non-empty list
                assert_equal(len(args), len(defaults))
            else:
                continue
            for arg, default in zip(args, defaults):
                if arg not in params.keys():
                    # deprecated parameter, not in get_params
                    assert_true(default is None)
                    continue

                if isinstance(params[arg], np.ndarray):
                    assert_array_equal(params[arg], default)
                else:
                    assert_equal(params[arg], default)
def test_clone():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[lr, ridge],
                                 meta_regressor=svr_rbf,
                                 store_train_meta_features=True)
    clone(stregr)
def test_clone():
    knn = KNeighborsClassifier()
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    gnb = GaussianNB()
    stclf = StackingCVClassifier(classifiers=[knn, gnb],
                                 meta_classifier=lr,
                                 store_train_meta_features=True)
    clone(stclf)
def test_clone():
    knn = KNeighborsClassifier()
    lr = LogisticRegression()
    gnb = GaussianNB()
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    clone(stclf)
def test_clone_empty_array():
    # Regression test for cloning estimators with empty arrays
    clf = MyEstimator(empty=np.array([]))
    clf2 = clone(clf)
    assert_array_equal(clf.empty, clf2.empty)

    clf = MyEstimator(empty=sp.csr_matrix(np.array([[0]])))
    clf2 = clone(clf)
    assert_array_equal(clf.empty.data, clf2.empty.data)
def test_clone():
    mlp = MLP(epochs=5,
              eta=0.05,
              hidden_layers=[10],
              minibatches=len(y),
              random_seed=1)
    clone(mlp)
def test_clone():
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)
    clone(eclf)
def test_clone():
    from sklearn.base import clone

    a = mcmc.FMRegression()
    b = clone(a)
    assert a.get_params() == b.get_params()

    a = mcmc.FMClassification()
    b = clone(a)
    assert a.get_params() == b.get_params()
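# For reference, the contract the clone tests above rely on: clone returns a
# new, unfitted estimator with identical constructor parameters. A minimal
# standalone check against a plain sklearn estimator:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

est = LogisticRegression(C=0.5)
est_clone = clone(est)
assert est_clone is not est
assert est_clone.get_params() == est.get_params()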
def test_unit_weights_vs_no_weights():
    # not passing any sample weights should be equivalent
    # to all weights equal to one
    sample_weight = np.ones(n_samples)
    for estimator in [KMeans(n_clusters=n_clusters, random_state=42),
                      MiniBatchKMeans(n_clusters=n_clusters,
                                      random_state=42)]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_),
                            1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
def test_scaled_weights():
    # scaling all sample weights by a common factor
    # shouldn't change the result
    sample_weight = np.ones(n_samples)
    for estimator in [KMeans(n_clusters=n_clusters, random_state=42),
                      MiniBatchKMeans(n_clusters=n_clusters,
                                      random_state=42)]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=0.5 * sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_),
                            1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
def fit(self, X, y):
    """Actual fitting, performing the search over parameters."""
    parameter_iterable = ParameterSampler(self.param_distributions,
                                          self.n_iter,
                                          random_state=self.random_state)
    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(
        delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring,
                                  parameters, cv=cv)
        for parameters in parameter_iterable)

    best = sorted(out, reverse=True)[0]

    self.best_params_ = best[1]
    self.best_score_ = best[0]

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best[1])
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
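# ParameterSampler, as iterated above, yields n_iter parameter dicts drawn
# from the given distributions. A quick standalone illustration (modern
# import path shown; older sklearn versions exposed it under
# sklearn.grid_search):
from scipy.stats import uniform
from sklearn.model_selection import ParameterSampler

for params in ParameterSampler({'C': uniform(0.1, 10)}, n_iter=3,
                               random_state=0):
    print(params)  # e.g. {'C': 5.5...}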
def check_parameters_default_constructible(name, Estimator):
    classifier = LDA()
    # test default-constructibility
    # get rid of deprecation warnings
    with warnings.catch_warnings(record=True):
        if name in META_ESTIMATORS:
            estimator = Estimator(classifier)
        else:
            estimator = Estimator()
        # test cloning
        clone(estimator)
        # test __repr__
        repr(estimator)
        # test that set_params returns self
        assert_true(estimator.set_params() is estimator)

        # test if init does nothing but set parameters
        # this is important for grid_search etc.
        # We get the default parameters from init and then
        # compare these against the actual values of the attributes.

        # this comes from getattr. Gets rid of deprecation decorator.
        init = getattr(estimator.__init__, 'deprecated_original',
                       estimator.__init__)
        try:
            args, varargs, kws, defaults = inspect.getargspec(init)
        except TypeError:
            # init is not a python function.
            # true for mixins
            return
        params = estimator.get_params()
        if name in META_ESTIMATORS:
            # they need a non-default argument
            args = args[2:]
        else:
            args = args[1:]
        if args:
            # non-empty list
            assert_equal(len(args), len(defaults))
        else:
            return
        for arg, default in zip(args, defaults):
            assert_in(type(default), [str, int, float, bool, tuple,
                                      type(None), np.float64,
                                      types.FunctionType, Memory])
            if arg not in params.keys():
                # deprecated parameter, not in get_params
                assert_true(default is None)
                continue

            if isinstance(params[arg], np.ndarray):
                assert_array_equal(params[arg], default)
            else:
                assert_equal(params[arg], default)
def plot_silhouette(clf, X, title='Silhouette Analysis', metric='euclidean',
                    copy=True, ax=None, figsize=None, title_fontsize="large",
                    text_fontsize="medium"):
    """Plots silhouette analysis of clusters using fit_predict.

    Args:
        clf: Clusterer instance that implements ``fit`` and ``fit_predict``
            methods.

        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features.

        title (string, optional): Title of the generated plot. Defaults to
            "Silhouette Analysis"

        metric (string or callable, optional): The metric to use when
            calculating distance between instances in a feature array.
            If metric is a string, it must be one of the options allowed by
            sklearn.metrics.pairwise.pairwise_distances. If X is the distance
            array itself, use "precomputed" as the metric.

        copy (boolean, optional): Determines whether ``fit`` is used on
            **clf** or on a copy of **clf**.

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
            plot the learning curve. If None, the plot is drawn on a new set
            of axes.

        figsize (2-tuple, optional): Tuple denoting figure size of the plot
            e.g. (6, 6). Defaults to ``None``.

        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults
            to "large".

        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults
            to "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
            drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> kmeans = KMeans(n_clusters=4, random_state=1)
        >>> skplt.plot_silhouette(kmeans, X)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_silhouette.png
           :align: center
           :alt: Silhouette Plot
    """
    if copy:
        clf = clone(clf)

    cluster_labels = clf.fit_predict(X)

    n_clusters = len(set(cluster_labels))

    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)

    sample_silhouette_values = silhouette_samples(X, cluster_labels,
                                                  metric=metric)

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)
    ax.set_xlim([-0.1, 1])
    ax.set_ylim([0, len(X) + (n_clusters + 1) * 10 + 10])
    ax.set_xlabel('Silhouette coefficient values', fontsize=text_fontsize)
    ax.set_ylabel('Cluster label', fontsize=text_fontsize)

    y_lower = 10

    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[
            cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]

        y_upper = y_lower + size_cluster_i

        color = cm.spectral(float(i) / n_clusters)

        ax.fill_betweenx(np.arange(y_lower, y_upper), 0,
                         ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)

        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i),
                fontsize=text_fontsize)

        y_lower = y_upper + 10

    ax.axvline(x=silhouette_avg, color="red", linestyle="--",
               label='Silhouette score: {0:0.3f}'.format(silhouette_avg))

    ax.set_yticks([])  # Clear the y-axis labels / ticks
    ax.set_xticks(np.arange(-0.1, 1.0, 0.2))

    ax.tick_params(labelsize=text_fontsize)
    ax.legend(loc='best', fontsize=text_fontsize)

    return ax
def plot_elbow_curve(clf, X, title='Elbow Plot', cluster_ranges=None, ax=None,
                     figsize=None, title_fontsize="large",
                     text_fontsize="medium"):
    """Plots elbow curve of different values of K for KMeans clustering.

    Args:
        clf: Clusterer instance that implements ``fit`` and ``fit_predict``
            methods and an ``n_clusters`` parameter.

        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features.

        title (string, optional): Title of the generated plot. Defaults to
            "Elbow Plot"

        cluster_ranges (None or :obj:`list` of int, optional): List of
            n_clusters for which to plot the explained variances. Defaults to
            ``range(1, 12, 2)``.

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
            plot the learning curve. If None, the plot is drawn on a new set
            of axes.

        figsize (2-tuple, optional): Tuple denoting figure size of the plot
            e.g. (6, 6). Defaults to ``None``.

        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults
            to "large".

        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults
            to "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
            drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> kmeans = KMeans(random_state=1)
        >>> skplt.plot_elbow_curve(kmeans, cluster_ranges=range(1, 11))
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_elbow_curve.png
           :align: center
           :alt: Elbow Curve
    """
    if cluster_ranges is None:
        cluster_ranges = range(1, 12, 2)
    else:
        cluster_ranges = sorted(cluster_ranges)

    if not hasattr(clf, 'n_clusters'):
        raise TypeError('"n_clusters" attribute not in classifier. '
                        'Cannot plot elbow method.')

    clfs = []
    for i in cluster_ranges:
        current_clf = clone(clf)
        setattr(current_clf, "n_clusters", i)
        clfs.append(current_clf.fit(X))

    centroids = [k.cluster_centers_ for k in clfs]

    D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
    dist = [np.min(D, axis=1) for D in D_k]

    # avgWithinSS = [np.sum(d) / X.shape[0] for d in dist]
    wcss = [np.sum(d ** 2) for d in dist]
    tss = np.sum(pdist(X) ** 2) / X.shape[0]
    bss = tss - wcss

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)
    ax.plot(cluster_ranges, bss / tss * 100, 'b*-')
    ax.grid(True)
    ax.set_xlabel('Number of clusters', fontsize=text_fontsize)
    ax.set_ylabel('Percent variance explained', fontsize=text_fontsize)
    ax.tick_params(labelsize=text_fontsize)

    return ax
def _init_projectors(self, X):
    self.projectors = [clone(bproj) for bproj in self.bprojs]
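# Cloning each base projector gives every block its own independent, unfitted
# copy. A self-contained sketch of the same idiom with a PCA prototype:
from sklearn.base import clone
from sklearn.decomposition import PCA

prototype = PCA(n_components=2)
projectors = [clone(prototype) for _ in range(3)]
assert all(p is not prototype for p in projectors)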
def learning_curve(estimator, X_train, X_valid, score="f1",
                   train_sizes=None, hparams=None, shuffle=False,
                   random_state=None):
    """
    Given training and validation data, this function produces learning
    curves (as lists of scores) for a given estimator.

    Args:
        estimator: a model to be inspected
        X_train (list(AnnotatedDocument)): training data
        X_valid (list(AnnotatedDocument)): validation data
        score: the type of scores to be produced, one of
            {'precision', 'recall', 'f1'}
        train_sizes (list(float)): relative sizes of training subsets
        hparams (dict): hyper-parameters to be passed to a model for training
        shuffle (bool): if True, training data is shuffled
        random_state (int): used when shuffle=True to ensure reproducible
            results

    Returns:
        train_sizes (list(float)): relative sizes of training subsets
        train_scores (list(float)): model scores on training subsets of
            respective sizes
        valid_scores (list(float)): model scores on validation data
    """
    # check model type
    if isinstance(estimator, NamedEntityRecognitionModel):
        annotation_type = "annotation"
        if isinstance(estimator, ModelEnsembleNER):
            annotation_labels = set()
            for model in estimator.models:
                annotation_labels.update(model.entity_labels)
            annotation_labels = list(annotation_labels)
        else:
            annotation_labels = estimator.entity_labels
    elif isinstance(estimator, RelationExtractionModel):
        annotation_type = "relation"
        if isinstance(estimator, REModelEnsemble):
            annotation_labels = set()
            for model in estimator.models:
                annotation_labels.update(model.relation_labels)
            annotation_labels = list(annotation_labels)
        else:
            annotation_labels = estimator.relation_labels
    else:
        raise TypeError(
            "Given estimator is of type '{}' which is not supported".format(
                type(estimator)))

    # determine annotation label
    if annotation_labels:
        if len(annotation_labels) > 1:
            log.debug(
                "Learning curves currently support either one label or all "
                "labels: building for all labels")
            annotation_label = None
        else:
            annotation_label = annotation_labels[0]
    else:
        annotation_label = None

    # make default train sizes as fractions
    if not train_sizes:
        train_sizes = [s * 0.1 for s in range(1, 11)]

    # shuffle training data if necessary
    if shuffle:
        if random_state:
            random.Random(random_state).shuffle(X_train)
        else:
            random.shuffle(X_train)

    # collect scores for each training subset
    train_scores = []
    valid_scores = []
    for train_size in train_sizes:
        docs_to_train = X_train[:int(train_size * len(X_train))]
        if not docs_to_train:
            log.debug("No documents to train: check your train sizes")
        base_estimator = clone(estimator)
        if hparams:
            base_estimator.fit(X=docs_to_train, y=None, **hparams)
        else:
            base_estimator.fit(X=docs_to_train, y=None)
        X_train_pred = base_estimator.transform(docs_to_train)
        X_valid_pred = base_estimator.transform(X_valid)
        score_train = annotation_precision_recall_f1score(
            X_train_pred, docs_to_train,
            ann_label=annotation_label, ann_type=annotation_type)
        score_valid = annotation_precision_recall_f1score(
            X_valid_pred, X_valid,
            ann_label=annotation_label, ann_type=annotation_type)
        if score == "precision":
            train_scores.append(score_train[0])
            valid_scores.append(score_valid[0])
        elif score == "recall":
            train_scores.append(score_train[1])
            valid_scores.append(score_valid[1])
        elif score == "f1":
            train_scores.append(score_train[2])
            valid_scores.append(score_valid[2])
        else:
            raise ValueError(
                "Cannot determine the type of scoring '{}'".format(score))

    return train_sizes, train_scores, valid_scores
def test_clonable(est):
    # fit it, then clone it
    est.fit(y)
    est2 = clone(est)
    assert isinstance(est2, est.__class__)
    assert est is not est2
def _fit(self, X, y, feature_axis=2):
    X, y = check_ts_X_y(X, y, "csr")
    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[feature_axis]

    if self.max_features is not None:
        if not isinstance(self.max_features, numbers.Integral):
            raise TypeError(
                "'max_features' should be an integer between 1 and {} "
                "features. Got {!r} instead.".format(
                    n_features, self.max_features))
        elif self.max_features < 1 or self.max_features > n_features:
            raise ValueError(
                "'max_features' should be between 1 and {} features."
                " Got {} instead.".format(n_features, self.max_features))
        max_features = self.max_features
    else:
        max_features = n_features

    if not isinstance(self.n_gen_no_change,
                      (numbers.Integral, np.integer, type(None))):
        raise ValueError(
            "'n_gen_no_change' should either be None or an integer."
            " {} was passed.".format(self.n_gen_no_change))

    estimator = clone(self.estimator)

    # Genetic Algorithm
    toolbox = base.Toolbox()

    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual,
                     toolbox.attr_bool, n=n_features)
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)
    toolbox.register("evaluate", _evalFunction, gaobject=self,
                     estimator=estimator, X=X, y=y, cv=cv, scorer=scorer,
                     verbose=self.verbose, fit_params=self.fit_params,
                     max_features=max_features, caching=self.caching,
                     feature_axis=feature_axis)
    toolbox.register("mate", tools.cxUniform,
                     indpb=self.crossover_independent_proba)
    toolbox.register("mutate", tools.mutFlipBit,
                     indpb=self.mutation_independent_proba)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    if self.n_jobs == 0:
        raise ValueError("n_jobs == 0 has no meaning.")
    elif self.n_jobs > 1:
        pool = multiprocessing.Pool(processes=self.n_jobs)
        toolbox.register("map", pool.map)
    elif self.n_jobs < 0:
        pool = multiprocessing.Pool(
            processes=max(cpu_count() + 1 + self.n_jobs, 1))
        toolbox.register("map", pool.map)

    pop = toolbox.population(n=self.n_population)
    hof = tools.HallOfFame(1, similar=np.array_equal)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)

    if self.verbose > 0:
        print("Selecting features with genetic algorithm.")

    _, log = _eaFunction(pop, toolbox, cxpb=self.crossover_proba,
                         mutpb=self.mutation_proba,
                         ngen=self.n_generations,
                         ngen_no_change=self.n_gen_no_change,
                         stats=stats, halloffame=hof, verbose=self.verbose)
    if self.n_jobs != 1:
        pool.close()
        pool.join()

    # Set final attributes
    support_ = np.array(hof, dtype=np.bool)[0]
    self.estimator_ = clone(self.estimator)
    _X = apply_mask(X, support_, feature_axis=feature_axis)
    self.estimator_.fit(_X, y)

    self.generation_scores_ = np.array(
        [score for score, _ in log.select("max")])
    self.n_features_ = support_.sum()
    self.support_ = support_

    return self
def _validate_estimator(self):
    # FIXME: in 0.6 call super()
    SparseBaseSMOTE._validate_estimator(self)
    # FIXME: remove in 0.6 after deprecation cycle
    if self.kind != 'deprecated' and not (self.kind == 'borderline-1' or
                                          self.kind == 'borderline-2'):
        if self.kind not in SMOTE_KIND:
            raise ValueError('Unknown kind for SMOTE algorithm.'
                             ' Choices are {}. Got {} instead.'.format(
                                 SMOTE_KIND, self.kind))
        else:
            warnings.warn('"kind" is deprecated in 0.4 and will be '
                          'removed in 0.6. Use SMOTE, BorderlineSMOTE or '
                          'SVMSMOTE instead.', DeprecationWarning)

    if self.kind == 'borderline1' or self.kind == 'borderline2':
        self._sample = types.MethodType(SparseBorderlineSMOTE._sample, self)
        self.kind = ('borderline-1' if self.kind == 'borderline1'
                     else 'borderline-2')

    elif self.kind == 'svm':
        self._sample = types.MethodType(SparseSVMSMOTE._sample, self)

        if self.out_step == 'deprecated':
            self.out_step = 0.5
        else:
            warnings.warn('"out_step" is deprecated in 0.4 and will '
                          'be removed in 0.6. Use SVMSMOTE class '
                          'instead.', DeprecationWarning)

        if self.svm_estimator == 'deprecated':
            warnings.warn('"svm_estimator" is deprecated in 0.4 and '
                          'will be removed in 0.6. Use SVMSMOTE class '
                          'instead.', DeprecationWarning)
        if (self.svm_estimator is None or
                self.svm_estimator == 'deprecated'):
            self.svm_estimator_ = SVC(gamma='scale',
                                      random_state=self.random_state)
        elif isinstance(self.svm_estimator, SVC):
            self.svm_estimator_ = clone(self.svm_estimator)
        else:
            raise_isinstance_error('svm_estimator', [SVC],
                                   self.svm_estimator)

    if self.kind != 'regular':
        if self.m_neighbors == 'deprecated':
            self.m_neighbors = 10
        else:
            warnings.warn('"m_neighbors" is deprecated in 0.4 and '
                          'will be removed in 0.6. Use SVMSMOTE class '
                          'or BorderlineSMOTE instead.', DeprecationWarning)

        self.nn_m_ = check_neighbors_object(
            'm_neighbors', self.m_neighbors, additional_neighbor=1)
        self.nn_m_.set_params(**{'n_jobs': self.n_jobs})
def __init__(self, *,
             model_y='auto',
             model_t='auto',
             featurizer=None,
             discrete_treatment=False,
             categories='auto',
             cv=2,
             n_crossfit_splits='raise',
             mc_iters=None,
             mc_agg='mean',
             n_estimators=100,
             criterion="mse",
             max_depth=None,
             min_samples_split=10,
             min_samples_leaf=5,
             min_weight_fraction_leaf=0.,
             min_var_fraction_leaf=None,
             min_var_leaf_on_val=True,
             max_features="auto",
             min_impurity_decrease=0.,
             max_samples=.45,
             min_balancedness_tol=.45,
             honest=True,
             inference=True,
             fit_intercept=True,
             subforest_size=4,
             n_jobs=-1,
             random_state=None,
             verbose=0,
             warm_start=False):
    # TODO: consider whether we need more care around stateful featurizers,
    #       since we clone it and fit separate copies
    self.model_y = clone(model_y, safe=False)
    self.model_t = clone(model_t, safe=False)
    self.featurizer = clone(featurizer, safe=False)
    self.discrete_instrument = discrete_treatment
    self.categories = categories
    self.cv = cv
    self.n_estimators = n_estimators
    self.criterion = criterion
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.min_samples_leaf = min_samples_leaf
    self.min_weight_fraction_leaf = min_weight_fraction_leaf
    self.min_var_fraction_leaf = min_var_fraction_leaf
    self.min_var_leaf_on_val = min_var_leaf_on_val
    self.max_features = max_features
    self.min_impurity_decrease = min_impurity_decrease
    self.max_samples = max_samples
    self.min_balancedness_tol = min_balancedness_tol
    self.honest = honest
    self.inference = inference
    self.fit_intercept = fit_intercept
    self.subforest_size = subforest_size
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.warm_start = warm_start
    self.n_crossfit_splits = n_crossfit_splits
    if self.n_crossfit_splits != 'raise':
        cv = self.n_crossfit_splits
    super().__init__(discrete_treatment=discrete_treatment,
                     categories=categories,
                     cv=cv,
                     n_splits=n_crossfit_splits,
                     mc_iters=mc_iters,
                     mc_agg=mc_agg,
                     random_state=random_state)
def _gen_featurizer(self):
    return clone(self.featurizer, safe=False)
def __init__(self, model_Y_W, model_T_W, model_T_WZ):
    self._model_Y_W = clone(model_Y_W, safe=False)
    self._model_T_W = clone(model_T_W, safe=False)
    self._model_T_WZ = clone(model_T_WZ, safe=False)
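# clone(..., safe=False) is used in these constructors because the arguments
# may be plain sentinels rather than estimators; with safe=False, clone falls
# back to copy.deepcopy instead of raising a TypeError:
from sklearn.base import clone

assert clone(None, safe=False) is None
assert clone('auto', safe=False) == 'auto'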
def _gen_ortho_learner_model_final(self):
    return _BaseDMLIVModelFinal(_FinalWrapper(
        clone(self.model_final, safe=False),
        fit_cate_intercept=False,
        featurizer=clone(self.featurizer, safe=False),
        use_weight_trick=True))
def fit(self, X, y):
    """Fit Gaussian process regression model

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Training data

    y : array-like, shape = (n_samples, [n_output_dims])
        Target values

    Returns
    -------
    self : returns an instance of self.
    """
    if self.kernel is None:  # Use an RBF kernel as default
        self.kernel_ = C(1.0, constant_value_bounds="fixed") \
            * RBF(1.0, length_scale_bounds="fixed")
    else:
        self.kernel_ = clone(self.kernel)

    self.rng = check_random_state(self.random_state)

    X, y = check_X_y(X, y, multi_output=True, y_numeric=True)

    # Normalize target value
    if self.normalize_y:
        self.y_train_mean = np.mean(y, axis=0)
        # demean y
        y = y - self.y_train_mean
    else:
        self.y_train_mean = np.zeros(1)

    if np.iterable(self.alpha) \
       and self.alpha.shape[0] != y.shape[0]:
        if self.alpha.shape[0] == 1:
            self.alpha = self.alpha[0]
        else:
            raise ValueError("alpha must be a scalar or an array"
                             " with same number of entries as y.(%d != %d)"
                             % (self.alpha.shape[0], y.shape[0]))

    self.X_train_ = np.copy(X) if self.copy_X_train else X
    self.y_train_ = np.copy(y) if self.copy_X_train else y

    if self.optimizer is not None and self.kernel_.n_dims > 0:
        # Choose hyperparameters based on maximizing the log-marginal
        # likelihood (potentially starting from several initial values)
        def obj_func(theta, eval_gradient=True):
            if eval_gradient:
                lml, grad = self.log_marginal_likelihood(
                    theta, eval_gradient=True)
                return -lml, -grad
            else:
                return -self.log_marginal_likelihood(theta)

        # First optimize starting from theta specified in kernel
        optima = [(self._constrained_optimization(obj_func,
                                                  self.kernel_.theta,
                                                  self.kernel_.bounds))]

        # Additional runs are performed from log-uniform chosen initial
        # theta
        if self.n_restarts_optimizer > 0:
            if not np.isfinite(self.kernel_.bounds).all():
                raise ValueError(
                    "Multiple optimizer restarts (n_restarts_optimizer>0) "
                    "requires that all bounds are finite.")
            bounds = self.kernel_.bounds
            for iteration in range(self.n_restarts_optimizer):
                theta_initial = \
                    self.rng.uniform(bounds[:, 0], bounds[:, 1])
                optima.append(
                    self._constrained_optimization(obj_func, theta_initial,
                                                   bounds))
        # Select result from run with minimal (negative) log-marginal
        # likelihood
        lml_values = list(map(itemgetter(1), optima))
        self.kernel_.theta = optima[np.argmin(lml_values)][0]
        self.log_marginal_likelihood_value_ = -np.min(lml_values)
    else:
        self.log_marginal_likelihood_value_ = \
            self.log_marginal_likelihood(self.kernel_.theta)

    # Precompute quantities required for predictions which are independent
    # of actual query points
    K = self.kernel_(self.X_train_)
    K[np.diag_indices_from(K)] += self.alpha
    self.L_ = cholesky(K, lower=True)  # Line 2
    self.alpha_ = cho_solve((self.L_, True), self.y_train_)  # Line 3

    return self
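# Cloning self.kernel into self.kernel_ above is what keeps the user-supplied
# kernel object unmutated while its hyperparameters are optimized; sklearn's
# own GaussianProcessRegressor exposes the same behaviour:
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

X = np.linspace(0, 5, 20).reshape(-1, 1)
y = np.sin(X).ravel()

kernel = RBF(length_scale=1.0)
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
print(kernel.length_scale, gpr.kernel_.length_scale)  # original vs optimized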
def classification(estimator, cv, X, y, groups=None, perm=None, n_jobs=1):
    """Do a classification.

    Parameters:
        estimator: a classifier object from sklearn
        cv: a cross-validation object from sklearn
        X: the data, array of size n_samples x n_features
        y: the labels, array of size n_samples
        groups: optional, groups for group-based cross-validations
        perm: optional, None means no permutations will be computed;
            otherwise set the number of permutations here
        n_jobs: optional, default: 1, number of threads to use during
            the cross-validations. Higher means faster. Setting to -1 will
            use all available threads. Warning: may slow down the computer.

    Returns:
        save: a dictionary containing:
            acc_score: the mean score across all cross-validations using
                the accuracy scoring method
            auc_score: the mean score across all cross-validations using
                the roc_auc scoring method
            acc: the list of all cross-validation accuracy scores
            auc: the list of all cross-validation roc_auc scores
        if perm is not None it also contains:
            auc_pvalue: the p-value using roc_auc as a scoring method
            acc_pvalue: the p-value using accuracy as a scoring method
            auc_pscores: a list of all permutation auc scores
            acc_pscores: a list of all permutation accuracy scores
    """
    y = np.asarray(y)
    X = np.asarray(X)
    if len(X) != len(y):
        raise ValueError("Dimension mismatch for X and y : {}, {}".format(
            len(X), len(y)))
    if groups is not None:
        try:
            if len(y) != len(groups):
                raise ValueError("dimension mismatch for groups and y")
        except TypeError:
            print(
                "Error in classification: y or",
                "groups is not a list or similar structure",
            )
            exit()

    clf = clone(estimator)
    accuracies, aucs = cross_val_scores(clf, cv, X, y, groups, n_jobs)
    acc_score = np.mean(accuracies)
    auc_score = np.mean(aucs)
    save = {
        "acc_score": [acc_score],
        "auc_score": [auc_score],
        "acc": accuracies,
        "auc": aucs,
        "n_splits": cv.get_n_splits(X, y, groups),
    }
    if perm is not None:
        acc_pscores, auc_pscores = permutation_test(clf, cv, X, y, groups,
                                                    perm, n_jobs)
        acc_pvalue = compute_pval(acc_score, acc_pscores)
        auc_pvalue = compute_pval(auc_score, auc_pscores)
        save.update({
            "auc_pvalue": auc_pvalue,
            "acc_pvalue": acc_pvalue,
            "auc_pscores": auc_pscores,
            "acc_pscores": acc_pscores,
        })
    return save
def fit(self, X, y):
    """Fit a receptive field model.

    Parameters
    ----------
    X : array, shape (n_times[, n_epochs], n_features)
        The input features for the model.
    y : array, shape (n_times[, n_epochs][, n_outputs])
        The output features for the model.

    Returns
    -------
    self : instance
        The instance so you can chain operations.
    """
    if self.scoring not in _SCORERS.keys():
        raise ValueError('scoring must be one of %s, got'
                         '%s ' % (sorted(_SCORERS.keys()), self.scoring))
    from sklearn.base import clone
    X, y, _, self._y_dim = self._check_dimensions(X, y)

    if self.tmin > self.tmax:
        raise ValueError('tmin (%s) must be at most tmax (%s)'
                         % (self.tmin, self.tmax))
    # Initialize delays
    self.delays_ = _times_to_delays(self.tmin, self.tmax, self.sfreq)

    # Define the slice that we should use in the middle
    self.valid_samples_ = _delays_to_slice(self.delays_)

    if isinstance(self.estimator, numbers.Real):
        if self.fit_intercept is None:
            self.fit_intercept = True
        estimator = TimeDelayingRidge(self.tmin, self.tmax, self.sfreq,
                                      alpha=self.estimator,
                                      fit_intercept=self.fit_intercept,
                                      n_jobs=self.n_jobs,
                                      edge_correction=self.edge_correction)
    elif is_regressor(self.estimator):
        estimator = clone(self.estimator)
        if self.fit_intercept is not None and \
                estimator.fit_intercept != self.fit_intercept:
            raise ValueError(
                'Estimator fit_intercept (%s) != initialization '
                'fit_intercept (%s), initialize ReceptiveField with the '
                'same fit_intercept value or use fit_intercept=None'
                % (estimator.fit_intercept, self.fit_intercept))
        self.fit_intercept = estimator.fit_intercept
    else:
        raise ValueError('`estimator` must be a float or an instance'
                         ' of `BaseEstimator`,'
                         ' got type %s.' % type(self.estimator))
    self.estimator_ = estimator
    del estimator
    _check_estimator(self.estimator_)

    # Create input features
    n_times, n_epochs, n_feats = X.shape
    n_outputs = y.shape[-1]
    n_delays = len(self.delays_)

    # Update feature names if we have none
    if ((self.feature_names is not None) and
            (len(self.feature_names) != n_feats)):
        raise ValueError('n_features in X does not match feature names '
                         '(%s != %s)' % (n_feats, len(self.feature_names)))

    # Create input features
    X, y = self._delay_and_reshape(X, y)

    self.estimator_.fit(X, y)
    coef = get_coef(self.estimator_, 'coef_')  # (n_targets, n_features)
    shape = [n_feats, n_delays]
    if self._y_dim > 1:
        shape.insert(0, -1)
    self.coef_ = coef.reshape(shape)

    # Inverse-transform model weights
    if self.patterns:
        if isinstance(self.estimator_, TimeDelayingRidge):
            cov_ = self.estimator_.cov_ / float(n_times * n_epochs - 1)
            y = y.reshape(-1, y.shape[-1], order='F')
        else:
            X = X - X.mean(0, keepdims=True)
            cov_ = np.cov(X.T)
        del X

        # Inverse output covariance
        if y.ndim == 2 and y.shape[1] != 1:
            y = y - y.mean(0, keepdims=True)
            inv_Y = linalg.pinv(np.cov(y.T))
        else:
            inv_Y = 1. / float(n_times * n_epochs - 1)
        del y

        # Inverse coef according to Haufe's method
        # patterns has shape (n_feats * n_delays, n_outputs)
        coef = np.reshape(self.coef_, (n_feats * n_delays, n_outputs))
        patterns = cov_.dot(coef.dot(inv_Y))
        self.patterns_ = patterns.reshape(shape)

    return self
safe_print('SCORES')
safe_print('%6s' % 'size', end=' | ')

for name in sorted(names):
    safe_print('%s' % names[name], end=' | ')
safe_print()

for size in sizes:
    n = int(np.floor(size / 2))
    X, y = make_friedman1(n_samples=size, random_state=SEED)

    safe_print('%6i' % n, end=' | ')

    for name in sorted(names):
        e = clone(ESTIMATORS[names[name]])
        t0 = time()
        e.fit(X[:n], y[:n])
        t1 = time() - t0
        times[names[name]].append(t1)

        s = rmse(y[n:], e.predict(X[n:]))
        scores[names[name]].append(s)

        safe_print('%8.2f' % (s), end=' | ', flush=True)
    safe_print()

safe_print('\nFIT TIMES')
safe_print('%6s' % 'size', end=' | ')
def test_lr_scheduler_cloneable(self):
    # reproduces bug #271
    scheduler = LRScheduler(CyclicLR, base_lr=123)
    clone(scheduler)  # does not raise
def model_efficiency(embs, labels, model=LogisticRegression(),
                     validation=False, reinitialize=True, **params):
    X_train, X_valid, y_train, y_valid = train_test_split(
        embs, labels, train_size=0.7, random_state=42, shuffle=True,
        stratify=labels)
    y_train, y_valid = np.array(y_train), np.array(y_valid)

    if issubclass(type(model),
                  (tensorflow.python.keras.engine.sequential.Sequential,
                   tensorflow.keras.Model)):
        params['validation_data'] = (X_valid, y_valid)

    if reinitialize:
        if issubclass(type(model),
                      (tensorflow.python.keras.engine.sequential.Sequential,
                       tensorflow.keras.Model)):
            model_copy = keras.models.clone_model(model)
            model_copy.build((None, model.input.shape))
            model_copy.compile(loss='binary_crossentropy', optimizer='adam',
                               metrics=['binary_crossentropy', 'accuracy'])
            model = model_copy
        else:
            try:
                model = clone(model)
            except Exception:
                print('model not reinitialized')

    model.fit(X_train, y_train, **params)

    if hasattr(model, 'predict_proba'):
        proba_predictions_train = model.predict_proba(X_train)
        proba_predictions_valid = model.predict_proba(X_valid)
    else:
        proba_predictions_train = model.predict(X_train)
        proba_predictions_valid = model.predict(X_valid)

    if proba_predictions_train.shape[1] == 2:
        predictions_train = proba_predictions_train.argmax(axis=1)
        predictions_valid = proba_predictions_valid.argmax(axis=1)
    else:
        predictions_train = proba_predictions_train >= 0.5
        predictions_valid = proba_predictions_valid >= 0.5

    loss_train = log_loss(y_train, proba_predictions_train)
    loss_valid = log_loss(y_valid, proba_predictions_valid)
    accuracy_train = accuracy_score(y_train, predictions_train)
    accuracy_valid = accuracy_score(y_valid, predictions_valid)
    f1_train = f1_score(y_train, predictions_train)
    f1_valid = f1_score(y_valid, predictions_valid)

    return ({'loss': np.round(loss_train, 2),
             'accuracy': np.round(accuracy_train, 2),
             'f1': np.round(f1_train, 2)},
            {'loss': np.round(loss_valid, 2),
             'accuracy': np.round(accuracy_valid, 2),
             'f1': np.round(f1_valid, 2)})
def _crossfit(model, folds, *args, **kwargs):
    """
    General crossfit based calculation of nuisance parameters.

    Parameters
    ----------
    model : object
        An object that supports fit and predict. Fit must accept all the args
        and the keyword arguments kwargs. Similarly, predict must accept all
        the args as arguments and kwargs as keyword arguments. The fit
        function estimates a model of the nuisance function, based on the
        input data to fit. Predict evaluates the fitted nuisance function on
        the input data to predict.
    folds : list of tuples or None
        The crossfitting fold structure. Every entry in the list is a tuple
        whose first element are the training indices of the args and kwargs
        data and the second entry are the test indices. If the union of the
        test indices is not the full set of all indices, then the remaining
        nuisance parameters for the missing indices have value NaN. If folds
        is None, then cross fitting is not performed; all indices are used
        for both model fitting and prediction.
    args : a sequence of (numpy matrices or None)
        Each matrix is a data variable whose first index corresponds to a
        sample.
    kwargs : a sequence of key-value args, with values being (numpy matrices
        or None)
        Each keyword argument is of the form Var=x, with x a numpy array.
        Each of these arrays are data variables. The model fit and predict
        will be called with signature: `model.fit(*args, **kwargs)` and
        `model.predict(*args, **kwargs)`. Key-value arguments whose value is
        None are omitted from the two calls, so all the args and the
        non-None kwargs variables must be part of the model's signature.

    Returns
    -------
    nuisances : tuple of numpy matrices
        Each entry in the tuple is a nuisance parameter matrix. The i-th row
        in each matrix corresponds to the value of the nuisance parameter
        for the i-th input sample.
    model_list : list of objects of same type as input model
        The cloned and fitted models for each fold. Can be used for
        inspection of the variability of the fitted models across folds.
    fitted_inds : 1-d numpy array
        The indices of the arrays for which the nuisance value was
        calculated. This corresponds to the union of the indices of the test
        part of each fold in the input fold list.
    scores : tuple of list of float or None
        The out-of-sample model scores for each nuisance model.

    Examples
    --------

    .. testcode::

        import numpy as np
        from sklearn.model_selection import KFold
        from sklearn.linear_model import Lasso
        from econml._ortho_learner import _crossfit

        class Wrapper:
            def __init__(self, model):
                self._model = model
            def fit(self, X, y, W=None):
                self._model.fit(X, y)
                return self
            def predict(self, X, y, W=None):
                return self._model.predict(X)

        np.random.seed(123)
        X = np.random.normal(size=(5000, 3))
        y = X[:, 0] + np.random.normal(size=(5000,))
        folds = list(KFold(2).split(X, y))
        model = Lasso(alpha=0.01)
        nuisance, model_list, fitted_inds, scores = _crossfit(
            Wrapper(model), folds, X, y, W=y, Z=None)

    >>> nuisance
    (array([-1.105728... , -1.537566..., -2.451827..., ..., 1.106287...,
        -1.829662..., -1.782273...]),)
    >>> model_list
    [<Wrapper object at 0x...>, <Wrapper object at 0x...>]
    >>> fitted_inds
    array([   0,    1,    2, ..., 4997, 4998, 4999])
    """
    model_list = []
    fitted_inds = []
    calculate_scores = hasattr(model, 'score')

    # remove None arguments
    kwargs = filter_none_kwargs(**kwargs)

    if folds is None:  # skip crossfitting
        model_list.append(clone(model, safe=False))
        model_list[0].fit(*args, **kwargs)
        nuisances = model_list[0].predict(*args, **kwargs)
        scores = model_list[0].score(*args, **kwargs) if calculate_scores else None

        if not isinstance(nuisances, tuple):
            nuisances = (nuisances,)
        if not isinstance(scores, tuple):
            scores = (scores,)

        # scores entries should be lists of scores, so make each entry a
        # singleton list
        scores = tuple([s] for s in scores)
        # dict views are not indexable in Python 3, so use an iterator
        first_arr = args[0] if args else next(iter(kwargs.values()))
        return nuisances, model_list, np.arange(first_arr.shape[0]), scores

    for idx, (train_idxs, test_idxs) in enumerate(folds):
        model_list.append(clone(model, safe=False))
        if len(np.intersect1d(train_idxs, test_idxs)) > 0:
            raise AttributeError("Invalid crossfitting fold structure. "
                                 "Train and test indices of each fold must "
                                 "be disjoint.")
        if len(np.intersect1d(fitted_inds, test_idxs)) > 0:
            raise AttributeError("Invalid crossfitting fold structure. The "
                                 "same index appears in two test folds.")
        fitted_inds = np.concatenate((fitted_inds, test_idxs))

        args_train = tuple(var[train_idxs] if var is not None else None
                           for var in args)
        args_test = tuple(var[test_idxs] if var is not None else None
                          for var in args)
        kwargs_train = {key: var[train_idxs] for key, var in kwargs.items()}
        kwargs_test = {key: var[test_idxs] for key, var in kwargs.items()}

        model_list[idx].fit(*args_train, **kwargs_train)
        nuisance_temp = model_list[idx].predict(*args_test, **kwargs_test)

        if not isinstance(nuisance_temp, tuple):
            nuisance_temp = (nuisance_temp,)

        if idx == 0:
            # allocate NaN-filled outputs on the first fold
            nuisances = tuple(np.full((args[0].shape[0],) + nuis.shape[1:], np.nan)
                              for nuis in nuisance_temp)
        for it, nuis in enumerate(nuisance_temp):
            nuisances[it][test_idxs] = nuis

        if calculate_scores:
            score_temp = model_list[idx].score(*args_test, **kwargs_test)
            if not isinstance(score_temp, tuple):
                score_temp = (score_temp,)
            if idx == 0:
                scores = tuple([] for _ in score_temp)
            for it, score in enumerate(score_temp):
                scores[it].append(score)

    return (nuisances, model_list, np.sort(fitted_inds.astype(int)),
            scores if calculate_scores else None)
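# The helper `filter_none_kwargs` used by `_crossfit` is not shown above. A
# minimal sketch of the assumed behavior (not necessarily econml's actual
# implementation):
def filter_none_kwargs(**kwargs):
    # Keep only keyword arguments whose value is not None, so that
    # model.fit / model.predict are never called with Var=None.
    return {key: value for key, value in kwargs.items() if value is not None}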
def fit(self, run_imgs, events=None, confounds=None, design_matrices=None):
    """Fit the GLM.

    For each run:
    1. create design matrix X
    2. do a masker job: fMRI_data -> Y
    3. fit regression to (Y, X)

    Parameters
    ----------
    run_imgs : Niimg-like object or list of Niimg-like objects,
        See http://nilearn.github.io/manipulating_images/input_output.html#inputing-data-file-names-or-image-objects  # noqa:E501
        Data on which the GLM will be fitted. If this is a list,
        the affine is considered the same for all.

    events : pandas DataFrame or string or list of pandas DataFrames or strings,
        fMRI events used to build design matrices. One events object
        expected per run_img. Ignored if design_matrices is not None.
        If string, then a path to a csv file is expected.

    confounds : pandas DataFrame or string or list of pandas DataFrames or strings,
        Each column in a DataFrame corresponds to a confound variable
        to be included in the regression model of the respective run_img.
        The number of rows must match the number of volumes in the
        respective run_img. Ignored if design_matrices is not None.
        If string, then a path to a csv file is expected.

    design_matrices : pandas DataFrame or list of pandas DataFrames,
        Design matrices that will be used to fit the GLM. If given, it
        takes precedence over events and confounds.
    """
    # Local import to prevent circular imports
    from nilearn.input_data import NiftiMasker  # noqa

    # Check arguments
    # Check imgs type
    if events is not None:
        _check_events_file_uses_tab_separators(events_files=events)
    if not isinstance(run_imgs, (list, tuple)):
        run_imgs = [run_imgs]
    if design_matrices is None:
        if events is None:
            raise ValueError('events or design matrices must be provided')
        if self.t_r is None:
            raise ValueError('t_r not given to FirstLevelModel object'
                             ' to compute design from events')
    else:
        design_matrices = _check_run_tables(run_imgs, design_matrices,
                                            'design_matrices')
    # Check that number of events and confound files match number of runs
    # Also check that events and confound files can be loaded as DataFrame
    if events is not None:
        events = _check_run_tables(run_imgs, events, 'events')
    if confounds is not None:
        confounds = _check_run_tables(run_imgs, confounds, 'confounds')

    # Learn the mask
    if self.mask_img is False:
        # We create a dummy mask to preserve functionality of api
        ref_img = check_niimg(run_imgs[0])
        self.mask_img = Nifti1Image(np.ones(ref_img.shape[:3]),
                                    ref_img.affine)
    if not isinstance(self.mask_img, NiftiMasker):
        self.masker_ = NiftiMasker(mask_img=self.mask_img,
                                   smoothing_fwhm=self.smoothing_fwhm,
                                   target_affine=self.target_affine,
                                   standardize=self.standardize,
                                   mask_strategy='epi',
                                   t_r=self.t_r,
                                   memory=self.memory,
                                   verbose=max(0, self.verbose - 2),
                                   target_shape=self.target_shape,
                                   memory_level=self.memory_level)
        self.masker_.fit(run_imgs[0])
    else:
        if self.mask_img.mask_img_ is None and self.masker_ is None:
            self.masker_ = clone(self.mask_img)
            for param_name in ['target_affine', 'target_shape',
                               'smoothing_fwhm', 't_r', 'memory',
                               'memory_level']:
                our_param = getattr(self, param_name)
                if our_param is None:
                    continue
                if getattr(self.masker_, param_name) is not None:
                    warn('Parameter %s of the masker overridden'
                         % param_name)
                setattr(self.masker_, param_name, our_param)
            self.masker_.fit(run_imgs[0])
        else:
            self.masker_ = self.mask_img

    # For each run fit the model and keep only the regression results.
    self.labels_, self.results_, self.design_matrices_ = [], [], []
    n_runs = len(run_imgs)
    t0 = time.time()
    for run_idx, run_img in enumerate(run_imgs):
        # Report progress
        if self.verbose > 0:
            percent = float(run_idx) / n_runs
            percent = round(percent * 100, 2)
            dt = time.time() - t0
            if run_idx == 0:
                remaining = 'go take a coffee, a big one'
            else:
                # We use a max to avoid a division by zero
                remaining = (100. - percent) / max(0.01, percent) * dt
                remaining = '%i seconds remaining' % remaining
            sys.stderr.write(
                "Computing run %d out of %d runs (%s)\n"
                % (run_idx + 1, n_runs, remaining))

        # Build the experimental design for the glm
        run_img = check_niimg(run_img, ensure_ndim=4)
        if design_matrices is None:
            n_scans = get_data(run_img).shape[3]
            if confounds is not None:
                confounds_matrix = confounds[run_idx].values
                if confounds_matrix.shape[0] != n_scans:
                    raise ValueError('Rows in confounds does not match '
                                     'n_scans in run_img at index %d'
                                     % (run_idx,))
                confounds_names = confounds[run_idx].columns.tolist()
            else:
                confounds_matrix = None
                confounds_names = None
            start_time = self.slice_time_ref * self.t_r
            end_time = (n_scans - 1 + self.slice_time_ref) * self.t_r
            frame_times = np.linspace(start_time, end_time, n_scans)
            design = make_first_level_design_matrix(frame_times,
                                                    events[run_idx],
                                                    self.hrf_model,
                                                    self.drift_model,
                                                    self.high_pass,
                                                    self.drift_order,
                                                    self.fir_delays,
                                                    confounds_matrix,
                                                    confounds_names,
                                                    self.min_onset)
        else:
            design = design_matrices[run_idx]
        self.design_matrices_.append(design)

        # Mask and prepare data for GLM
        if self.verbose > 1:
            t_masking = time.time()
            sys.stderr.write('Starting masker computation \r')

        Y = self.masker_.transform(run_img)
        del run_img  # Delete unmasked image to save memory

        if self.verbose > 1:
            t_masking = time.time() - t_masking
            sys.stderr.write('Masker took %d seconds \n' % t_masking)

        if self.signal_scaling:
            Y, _ = mean_scaling(Y, self.scaling_axis)
        if self.memory:
            mem_glm = self.memory.cache(run_glm, ignore=['n_jobs'])
        else:
            mem_glm = run_glm

        # compute GLM
        if self.verbose > 1:
            t_glm = time.time()
            sys.stderr.write('Performing GLM computation\r')
        labels, results = mem_glm(Y, design.values,
                                  noise_model=self.noise_model,
                                  bins=100, n_jobs=self.n_jobs)
        if self.verbose > 1:
            t_glm = time.time() - t_glm
            sys.stderr.write('GLM took %d seconds \n' % t_glm)

        self.labels_.append(labels)
        # We save memory if inspecting model details is not necessary
        if self.minimize_memory:
            for key in results:
                results[key] = SimpleRegressionResults(results[key])
        self.results_.append(results)
        del Y

    # Report progress
    if self.verbose > 0:
        sys.stderr.write("\nComputation of %d runs done in %i seconds\n\n"
                         % (n_runs, time.time() - t0))
    return self
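# A minimal usage sketch for this fit method (file name and events table are
# illustrative; the import path is the one used by recent nilearn releases,
# older code imported the same class from nistats).
import pandas as pd
from nilearn.glm.first_level import FirstLevelModel

events = pd.DataFrame({'onset': [0, 30], 'duration': [15, 15],
                       'trial_type': ['rest', 'task']})
model = FirstLevelModel(t_r=2.0)
model = model.fit('run1.nii.gz', events=events)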
def test_model_clone(self):
    # cloning should succeed and return a distinct (unfitted) estimator
    clone_clf = clone(self.clf)
    assert clone_clf is not self.clf
def _sp_trial(trial_estimator, n_samples, n_features, cov, prec):
    # Draw a fresh sample from the fixed graph and check whether the fitted
    # precision matrix recovers the exact support of the true one.
    X = _new_sample(n_samples, n_features, cov)
    new_estimator = clone(trial_estimator)
    new_estimator.fit(X)
    return _exact_support(prec, new_estimator.precision_)
def partial_fit(self, X, y, classes=None):
    """Partial fitting."""
    X, y = check_X_y(X, y)
    if not hasattr(self, "ensemble_"):
        self.ensemble_ = []
        self.ensemble_base_ = []

    # Check feature consistency
    if hasattr(self, "X_"):
        if self.X_.shape[1] != X.shape[1]:
            raise ValueError("number of features does not match")
    self.X_, self.y_ = X, y

    # Prepare the DSEL set. Despite the attribute name, some of the
    # strategies below under-sample. Fall back to the raw chunk whenever
    # resampling fails (e.g. too few minority samples in the chunk).
    if self.oversampled == "None":
        self.dsel_X_, self.dsel_y_ = self.X_, self.y_
    elif self.oversampled == "ROS":
        ros = RandomOverSampler(random_state=42)
        try:
            self.dsel_X_, self.dsel_y_ = ros.fit_resample(self.X_, self.y_)
        except Exception:
            self.dsel_X_, self.dsel_y_ = self.X_, self.y_
    elif self.oversampled == "B2":
        b2 = BorderlineSMOTE(random_state=42, kind='borderline-2')
        try:
            self.dsel_X_, self.dsel_y_ = b2.fit_resample(self.X_, self.y_)
        except Exception:
            self.dsel_X_, self.dsel_y_ = self.X_, self.y_
    elif self.oversampled == "RUS":
        rus = RandomUnderSampler(random_state=42)
        try:
            self.dsel_X_, self.dsel_y_ = rus.fit_resample(self.X_, self.y_)
        except Exception:
            self.dsel_X_, self.dsel_y_ = self.X_, self.y_
    elif self.oversampled == "CNN":
        cnn = CondensedNearestNeighbour(random_state=42)
        try:
            self.dsel_X_, self.dsel_y_ = cnn.fit_resample(self.X_, self.y_)
        except Exception:
            self.dsel_X_, self.dsel_y_ = self.X_, self.y_

    # Check classes
    self.classes_ = classes
    if self.classes_ is None:
        self.classes_, _ = np.unique(y, return_inverse=True)

    # Append new estimator
    self.candidate_ = clone(self.base_estimator).fit(self.X_, self.y_)
    self.ensemble_.append(self.candidate_)
    self.ensemble_base_.extend(self.candidate_.estimators_)

    # Remove the worst member when the ensemble becomes too large
    if len(self.ensemble_) > self.n_estimators:
        self.prune_index_ = np.argmin(
            [self.metric(y, clf.predict(X)) for clf in self.ensemble_])
        del self.ensemble_[self.prune_index_]
        # Drop the pruned member's base estimators as well; this assumes
        # each candidate ensemble contributes exactly 10 of them.
        start = self.prune_index_ * 10
        del self.ensemble_base_[start:start + 10]
    return self
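# A minimal streaming sketch for the partial_fit above. `StreamingEnsemble`
# and `stream` are hypothetical stand-ins for the class owning this method
# and for a source yielding (X, y) chunks; they are not from the original
# code.
import numpy as np
from sklearn.ensemble import BaggingClassifier

model = StreamingEnsemble(base_estimator=BaggingClassifier(n_estimators=10),
                          n_estimators=5, oversampled="ROS")
for X_chunk, y_chunk in stream:  # each chunk is one (X, y) batch
    model.partial_fit(X_chunk, y_chunk, classes=np.array([0, 1]))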
def dml_irm_fixture(generate_data_irm, learner, score, dml_procedure,
                    trimming_threshold):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 499

    # collect data
    (x, y, d) = generate_data_irm

    # Set machine learning methods for m & g
    ml_g = clone(learner[1])
    ml_m = clone(learner[0])

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d)
    dml_irm_obj = dml.DoubleMLIRM(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  trimming_threshold=trimming_threshold)
    dml_irm_obj.fit()

    np.random.seed(3141)
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    g_hat0, g_hat1, m_hat, p_hat = fit_nuisance_irm(
        y, x, d, clone(learner[0]), clone(learner[1]), smpls, score,
        trimming_threshold=trimming_threshold)

    if dml_procedure == 'dml1':
        res_manual, se_manual = irm_dml1(y, x, d, g_hat0, g_hat1,
                                         m_hat, p_hat, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = irm_dml2(y, x, d, g_hat0, g_hat1,
                                         m_hat, p_hat, smpls, score)

    res_dict = {'coef': dml_irm_obj.coef,
                'coef_manual': res_manual,
                'se': dml_irm_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_irm(res_manual, y, d, g_hat0, g_hat1,
                                           m_hat, p_hat, smpls, score,
                                           se_manual, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_irm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_irm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_irm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
plot_data(X, y, None, ax)

###############################################################################
# Effect of clustering on over-samplers
###############################################################################

###############################################################################
# Clustering-based over-sampling makes it possible to identify areas of the
# input space that are appropriate for generating artificial data. Therefore,
# the generation of noisy samples is avoided and the within-class imbalance
# issue is also addressed. The next plots show the resampled data when
# clustering is applied, compared to the resampled data of the initial
# over-samplers.

fig, axs = plt.subplots(3, 2, figsize=(15, 15))
for (ax1, ax2), oversampler in zip(axs, OVERSAMPLERS):
    plot_data(X, y, clone(oversampler), ax1)
    plot_data(X, y, ClusterOverSampler(oversampler, KMEANS), ax2)
fig.tight_layout()

###############################################################################
# Performance evaluation of clustering based over-sampling
###############################################################################

###############################################################################
# We evaluate various over-samplers on a test set, using the F1-score as the
# evaluation metric. The scores with and without clustering are compared.

clf = GradientBoostingClassifier(random_state=RANDOM_STATE)
data = train_test_split(X, y, random_state=RANDOM_STATE)
scores = pd.DataFrame()
for oversampler in OVERSAMPLERS:
def fit(self, X=None, y=None):
    n_alpha_grid_points = 5
    self.results_ = np.zeros((n_alpha_grid_points, self.n_grid_points))
    self.grid_ = np.linspace(0.25, 4, self.n_grid_points)
    self.alphas_ = np.linspace(0.99, 0.999, n_alpha_grid_points)[::-1]
    self.ks_ = []

    for aidx, alpha in enumerate(self.alphas_):
        if self.verbose:
            print('at alpha {} ({}/{})'.format(
                alpha, aidx, n_alpha_grid_points))

        # draw a new fixed graph for alpha
        cov, prec = _new_graph(self.n_features, alpha)
        n_nonzero_prec = np.count_nonzero(prec.flat)
        self.ks_.append(n_nonzero_prec)
        if self.verbose:
            print('   Graph has {} nonzero entries'.format(n_nonzero_prec))

        for sidx, sample_grid in enumerate(self.grid_):
            n_samples = int(sample_grid * self.n_features)

            # model selection (once)
            X = _new_sample(n_samples, self.n_features, cov)
            ms_estimator = clone(self.model_selection_estimator)
            ms_estimator.fit(X)
            # read the selected penalty from the fitted attribute
            lam = getattr(ms_estimator, self.penalty_)

            if self.verbose:
                display_lam = lam
                if isinstance(lam, np.ndarray):
                    display_lam = np.linalg.norm(lam)
                print('   ({}/{}), n_samples = {}, selected lambda = {}'.format(
                    sidx, self.n_grid_points, n_samples, display_lam))

            # set up the default trial estimator
            if self.trial_estimator is None:
                trial_estimator = QuicGraphLasso(lam=lam,
                                                 mode='default',
                                                 init_method='corrcoef')
            else:
                # clone to avoid mutating the user-supplied estimator
                trial_estimator = clone(self.trial_estimator)

            # patch the trial estimator with this lambda
            trial_estimator.set_params(**{self.penalty: lam})

            # estimate statistical power
            exact_support_counts = Parallel(
                n_jobs=self.n_jobs,
                verbose=False,
                backend='threading',
            )(delayed(_sp_trial)(trial_estimator, n_samples,
                                 self.n_features, cov, prec)
              for nn in range(self.n_trials))
            self.results_[aidx, sidx] = (1. * np.sum(exact_support_counts)
                                         / self.n_trials)

        if self.verbose:
            print('Results at this row: {}'.format(self.results_[aidx, :]))

    self.is_fitted = True
    return self
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

# Binary classifier: stochastic gradient descent (SGDClassifier)
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)
sgd_clf.predict([some_digit])

# Cross-validation by hand (random_state requires shuffle=True)
skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

# Determine cross-validation score - 3 folds
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

# Prediction set selection from cross-validation
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

# Confusion matrix from prior variable set
def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
                             batch_size=32, keep_sparse=False,
                             random_state=None):
    """Create a balanced batch generator to train a keras model.

    Returns a generator --- as well as the number of steps per epoch ---
    which is given to ``fit_generator``. The sampler defines the sampling
    strategy used to balance the dataset ahead of creating the batch. The
    sampler should have an attribute ``return_indices``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Original imbalanced dataset.

    y : ndarray, shape (n_samples,) or (n_samples, n_classes)
        Associated targets.

    sample_weight : ndarray, shape (n_samples,)
        Sample weight.

    sampler : object or None, optional (default=RandomUnderSampler)
        A sampler instance which has an attribute ``return_indices``.
        By default, the sampler used is a
        :class:`imblearn.under_sampling.RandomUnderSampler`.

    batch_size : int, optional (default=32)
        Number of samples per gradient update.

    keep_sparse : bool, optional (default=False)
        Whether or not to conserve the sparsity of the input ``X``. By
        default, the returned batches will be dense.

    {random_state}

    Returns
    -------
    generator : generator of tuple
        Generate batches of data. The tuples generated are either
        (X_batch, y_batch) or (X_batch, y_batch, sample_weight_batch).

    steps_per_epoch : int
        The number of steps (batches) per epoch.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.datasets import load_iris
    >>> X, y = load_iris(return_X_y=True)
    >>> class_dict = dict()
    >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40
    >>> from imblearn.datasets import make_imbalance
    >>> X, y = make_imbalance(X, y, class_dict)
    >>> X = X.astype(np.float32)
    >>> batch_size, learning_rate, epochs = 10, 0.01, 10
    >>> training_generator, steps_per_epoch = balanced_batch_generator(
    ...     X, y, sample_weight=None, sampler=None,
    ...     batch_size=batch_size, random_state=42)
    >>> input_size, output_size = X.shape[1], 3
    >>> import tensorflow as tf
    >>> def init_weights(shape):
    ...     return tf.Variable(tf.random_normal(shape, stddev=0.01))
    >>> def accuracy(y_true, y_pred):
    ...     return np.mean(np.argmax(y_pred, axis=1) == y_true)
    >>> # input and output
    >>> data = tf.placeholder("float32", shape=[None, input_size])
    >>> targets = tf.placeholder("int32", shape=[None])
    >>> # build the model and weights
    >>> W = init_weights([input_size, output_size])
    >>> b = init_weights([output_size])
    >>> out_act = tf.nn.sigmoid(tf.matmul(data, W) + b)
    >>> # build the loss, predict, and train operator
    >>> cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    ...     logits=out_act, labels=targets)
    >>> loss = tf.reduce_sum(cross_entropy)
    >>> optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    >>> train_op = optimizer.minimize(loss)
    >>> predict = tf.nn.softmax(out_act)
    >>> # Initialization of all variables in the graph
    >>> init = tf.global_variables_initializer()
    >>> with tf.Session() as sess:
    ...     print('Starting training')
    ...     sess.run(init)
    ...     for e in range(epochs):
    ...         for i in range(steps_per_epoch):
    ...             X_batch, y_batch = next(training_generator)
    ...             feed_dict = dict()
    ...             feed_dict[data] = X_batch; feed_dict[targets] = y_batch
    ...             sess.run([train_op, loss], feed_dict=feed_dict)
    ...         # For each epoch, run accuracy on train and test
    ...         feed_dict = dict()
    ...         feed_dict[data] = X
    ...         predicts_train = sess.run(predict, feed_dict=feed_dict)
    ...         print("epoch: {{}} train accuracy: {{:.3f}}"
    ...               .format(e, accuracy(y, predicts_train)))
    ... # doctest: +ELLIPSIS
    Starting training
    [...
    """
    random_state = check_random_state(random_state)
    if sampler is None:
        sampler_ = RandomUnderSampler(return_indices=True,
                                      random_state=random_state)
    else:
        if not hasattr(sampler, 'return_indices'):
            raise ValueError("'sampler' needs to return the indices of "
                             "the samples selected. Provide a sampler "
                             "which has an attribute 'return_indices'.")
        sampler_ = clone(sampler)
        sampler_.set_params(return_indices=True)
        set_random_state(sampler_, random_state)
    _, _, indices = sampler_.fit_sample(X, y)
    # shuffle the indices since the sampler packs them by class
    random_state.shuffle(indices)

    def generator(X, y, sample_weight, indices, batch_size):
        while True:
            for index in range(0, len(indices), batch_size):
                X_res = safe_indexing(X, indices[index:index + batch_size])
                y_res = safe_indexing(y, indices[index:index + batch_size])
                if issparse(X_res) and not keep_sparse:
                    X_res = X_res.toarray()
                if sample_weight is None:
                    yield X_res, y_res
                else:
                    sw_res = safe_indexing(sample_weight,
                                           indices[index:index + batch_size])
                    yield X_res, y_res, sw_res

    return (generator(X, y, sample_weight, indices, batch_size),
            int(indices.size // batch_size))
def test_clone(self, lung_X, lung_y):
    rfs = RangerForestSurvival(n_estimators=N_ESTIMATORS)
    rfs.fit(lung_X, lung_y)
    # should not raise: a fitted estimator must still be cloneable
    clone(rfs)
                       warm_start=True, penalty=None,
                       learning_rate="constant", eta0=0.0005)

from copy import deepcopy

minimum_val_error = float("inf")
best_epoch = None
best_model = None
for epoch in range(1000):
    sgd_reg.fit(X_train_poly_scaled, y_train)  # continues where it left off
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    val_error = mean_squared_error(y_val, y_val_predict)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        # deepcopy keeps the fitted weights; clone() would only copy the
        # hyper-parameters and return an unfitted estimator
        best_model = deepcopy(sgd_reg)

#%% Logistic Regression
from sklearn import datasets

iris = datasets.load_iris()
list(iris.keys())
X = iris["data"][:, 3:]                # petal width
y = (iris["target"] == 2).astype(int)  # 1 if Iris virginica, else 0

from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X, y)

X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
def train_model(regressor, X, y):
    # Fit a fresh clone so the caller's estimator is left untouched.
    regressor_ = clone(regressor)
    regressor_.fit(X, y)
    return regressor_
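# Usage sketch (illustrative data): train_model leaves the passed-in
# estimator unfitted and returns a fitted clone.
import numpy as np
from sklearn.linear_model import Ridge

X = np.random.rand(100, 4)
y = X @ np.array([1.0, -2.0, 0.5, 0.0]) + 0.1 * np.random.randn(100)
fitted = train_model(Ridge(alpha=1.0), X, y)
print(fitted.coef_)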