Example #1
File: cc.py Project: ibab/carl
    def _fit_X_y(self, X_clf, y_clf, X_cal, y_cal):
        clf = clone(self.base_estimator)

        if isinstance(clf, RegressorMixin):
            clf = as_classifier(clf)

        clf.fit(X_clf, y_clf)

        if self.calibration is None:
            return clf, None, None

        else:
            if self.calibration == "kde":
                cal_num = KernelDensity()
                cal_den = KernelDensity()

            elif self.calibration == "histogram":
                cal_num = Histogram(bins=100, range=[(0.0, 1.0)])
                cal_den = Histogram(bins=100, range=[(0.0, 1.0)])

            else:
                cal_num = clone(self.calibration)
                cal_den = clone(self.calibration)

            X_num = clf.predict_proba(X_cal[y_cal == 0])[:, 0]
            X_den = clf.predict_proba(X_cal[y_cal == 1])[:, 0]
            cal_num.fit(X_num.reshape(-1, 1))
            cal_den.fit(X_den.reshape(-1, 1))

            return clf, cal_num, cal_den
Example #2
def pool_entropy_h(X, y, candidate_mask, train_mask, classifier, n_candidates,
                   pool_n, n_jobs=-1, **kwargs):
    """ Return the candidate that will minimise the expected entropy of the predictions.

        Parameters
        ----------
        X_training_candidates : array
            The feature matrix of the potential training candidates.

        classes : int
            The name of classes.

        pool_n : int
            The size of the sampel pool used in estimating the entropy

        n_jobs : int
            The number of parallel jobs (-1 if want to use all cores)

        Returns
        -------
        best_candidate : int
            The index of the best candidate.
    """
    
    classes = classifier.classes_ # sorted lexicographically
    n_classes = len(classes)
    candidate_size = np.sum(train_mask)
    n_features = X.shape[1]
    entropy = np.empty(len(candidate_mask))
    entropy[:] = np.inf

    # the probabilities used to calculate expected value of pool
    probs = classifier.predict_proba(X[candidate_mask])

    # copy the classifier (avoid modifying the original classifier)
    classifier_plus = clone(classifier)

    # construct the sample pool (used to estimate the entropy)
    unlabelled_indices = np.where(~train_mask)[0]
    pool_indices = permutation(unlabelled_indices)[:pool_n]
    pool_mask = np.zeros(len(candidate_mask), dtype=bool)
    pool_mask[pool_indices] = True

    # let's look at each candidate
    candidate_indices = np.where(candidate_mask)[0]

    results = Parallel(n_jobs=n_jobs)(delayed(_parallel_entropy_estimate)(
        X, y.copy(), train_mask.copy(), pool_mask,
        clone(classifier_plus), classes, n_classes, probs, i, index)
        for i, index in enumerate(candidate_indices))

    indices, expected = zip(*results)
    indices, expected = np.asarray(indices), np.asarray(expected)
    assert not np.isnan(expected).any(), 'Some expected values are undefined.'

    entropy[indices] = expected

    # pick the candidate with the smallest expected entropy
    best_candidates = np.argsort(entropy)[:n_candidates]
    return best_candidates
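Example #3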
def nn_embedding_translate(words=en_2_es.keys(), embedding1=en_embedding, embedding2=es_embedding,
                           constraint=es_2_en.keys(), k=5,
                           pre_transform=None, log=False):
    if pre_transform is not None:
        pre_transform_1 = clone(pre_transform)
        pre_transform_2 = clone(pre_transform)
        embedding1 = transform(embedding1, pre_transform_1)
        embedding2 = transform(embedding2, pre_transform_2)

    if constraint is not None:
        embedding2 = sub_embedding(embedding2, constraint)

    in_vocab_words = [word for word in words if embedding1.normalize(word) is not None]
    if log:
        print "{} of {} words in vocab".format(len(in_vocab_words), len(words))

    output = {}
    for i, word in enumerate(in_vocab_words):
        if log and i % 100 == 0:
            print "{} of {} words".format(i, len(words))

        emb = embedding1.word_to_embedding(word)
        if emb is not None:
            trans = embedding2.words_closest_to_point(emb, k=k)
            trans = softmax(trans)
            output[word] = trans
    return output
Example #4
    def fit(self, X):
        param_grid = list(ParameterGrid(self.param_grid))
        n_folds = len(self.cv)
        n_grid = len(param_grid)

        scores = np.zeros((n_folds, n_grid), dtype=np.float64)

        for i, (X_tr, X_te) in enumerate(self.cv.split(X)):
            for j, params in enumerate(param_grid):
                estimator = clone(self.estimator)
                estimator.set_params(**params)
                estimator.fit(X_tr)

                scores[i, j] = estimator.score(X_te)

        # FIXME: handle higher is better as well.
        best = scores.mean(axis=0).argmin()
        self.best_params_ = param_grid[best]

        # Refit
        if self.refit:
            self.best_estimator_ = clone(self.estimator)
            self.best_estimator_.set_params(**self.best_params_)
            self.best_estimator_.fit(X)

        return self
Example #5
def test_missing_value_handling(est, func, support_sparse):
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    Xt = est.fit(X_train).transform(X_test)
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the function leads to the same results as the class
    Xt_class = est.transform(X_train)
    Xt_func = func(X_train, **est.get_params())
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])

    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        Xt_col = est.transform(X_test[:, [i]])
        assert_array_equal(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(
                _get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan,
                               Xt_col[~np.isnan(Xt_col.squeeze())])

    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)

        Xt_dense = est_dense.fit(X_train).transform(X_test)
        Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
        for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix,
                                   sparse.bsr_matrix, sparse.coo_matrix,
                                   sparse.dia_matrix, sparse.dok_matrix,
                                   sparse.lil_matrix):
            # check that the dense and sparse inputs lead to the same results
            Xt_sparse = (est_sparse.fit(sparse_constructor(X_train))
                         .transform(sparse_constructor(X_test)))
            assert_allclose(Xt_sparse.A, Xt_dense)
            Xt_inv_sparse = est_sparse.inverse_transform(Xt_sparse)
            assert_allclose(Xt_inv_sparse.A, Xt_inv_dense)
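Example #6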
def test_sklearn_clone():
    tm._skip_if_no_sklearn()
    from sklearn.base import clone

    clf = xgb.XGBClassifier(n_jobs=2, nthread=3)
    clf.n_jobs = -1
    clone(clf)
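Example #7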
def RunExp(StrModel:str, Param:str, FeaUsed:list, DataPath:str, Label:str, StrMeasure:str, std:bool = False, N:int = 0):
	Data = np.genfromtxt(DataPath + Label, delimiter = ',', dtype = int)
	Data = Data[:, np.newaxis]

	for f in FeaUsed:
		T = (np.genfromtxt(DataPath + Features[f], delimiter = ',' , dtype = float))
		if len(T.shape) < 2:
			T = T[:, np.newaxis]
		Data = np.concatenate((Data, T), axis = 1)

	if N > 0:
		Data = Data[:N, :]

	Lbl = Data[:, 0]
	Fea = Data[:,1:]
	if std:
		scaler = preprocessing.StandardScaler()
		Fea = scaler.fit_transform(Fea)

	Model = base.clone(Models[StrModel])
	SetParam(Model, Param)

	Model.fit(Fea, Lbl)
	Pred = Model.predict(Fea)
	st = Measures[StrMeasure](Lbl, Pred)
		
	sv = cross_validation.cross_val_score(base.clone(Models[StrModel]), Fea, Lbl, metrics.make_scorer(Measures[StrMeasure]), cv = 5, n_jobs = 5)

	return st, np.mean(sv)
Example #8
def run_classifier(out_folder, trend_probs, referrers, y, train, test):

    F = referrers #static features
    etree = create_grid_search('lr', n_jobs = 1)
    
    y_pred = trend_probs[test].argmax(axis=1)
    save_results(out_folder, 'tl-base-lr', y_pred, y[test])

    aux = clone(etree)
    aux.fit(F[train], y[train])
    y_pred = aux.predict(F[test])
    save_results(out_folder, 'tree-feats', y_pred, y[test])
    
    aux = clone(etree)
    aux.fit(trend_probs[train], y[train])
    y_pred = aux.predict(trend_probs[test])
    save_results(out_folder, 'tree-probs', y_pred, y[test])
    
    C = np.hstack((F, trend_probs))
    aux = clone(etree)
    aux.fit(C[train], y[train])
    y_pred = aux.predict(C[test])
    save_results(out_folder, 'meta-combine', y_pred, y[test])

    #stack_clf = stacking.Stacking(3, [etree], 'tree')
    #stack_clf.fit(F[train], y[train], trend_probs[train])
    #y_pred = stack_clf.predict(F[test], trend_probs[test])
    #save_results(out_folder, 'meta-stack-tree', y_pred)
    
    stack_clf = stacking.Stacking(3, [etree], 'linear')
    stack_clf.fit(F[train], y[train], trend_probs[train])
    y_pred = stack_clf.predict(F[test], trend_probs[test])
    save_results(out_folder, 'meta-stack-linear', y_pred, y[test])
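Example #9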
    def _validate_estimator(self):
        "Private function to validate SMOTE and ENN objects"
        if self.smote is not None:
            if isinstance(self.smote, SMOTE):
                self.smote_ = clone(self.smote)
            else:
                raise ValueError('smote needs to be a SMOTE object. '
                                 'Got {} instead.'.format(type(self.smote)))
        # Otherwise create a default SMOTE
        else:
            self.smote_ = SMOTE(
                sampling_strategy=self.sampling_strategy,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
                ratio=self.ratio)

        if self.enn is not None:
            if isinstance(self.enn, EditedNearestNeighbours):
                self.enn_ = clone(self.enn)
            else:
                raise ValueError('enn needs to be an EditedNearestNeighbours.'
                                 ' Got {} instead.'.format(type(self.enn)))
        # Otherwise create a default EditedNearestNeighbours
        else:
            self.enn_ = EditedNearestNeighbours(
                            sampling_strategy='all',
                            n_jobs=self.n_jobs)
Example #10
    def _fit_calibrators(self, df0, df1):
        df0 = df0.reshape(-1, 1)
        df1 = df1.reshape(-1, 1)

        if self.method == "kde":
            calibrator0 = KernelDensity()
            calibrator1 = KernelDensity()

        elif self.method == "histogram":
            eps = 0.05
            df_min = max(0, min(np.min(df0), np.min(df1)) - eps)
            df_max = min(1, max(np.max(df0), np.max(df1)) + eps)

            calibrator0 = Histogram(bins=10 + int(len(df0) ** (1. / 3.)),
                                    range=[(df_min, df_max)],
                                    interpolation="linear")
            calibrator1 = Histogram(bins=10 + int(len(df1) ** (1. / 3.)),
                                    range=[(df_min, df_max)],
                                    interpolation="linear")

        else:
            calibrator0 = clone(self.method)
            calibrator1 = clone(self.method)

        calibrator0.fit(df0)
        calibrator1.fit(df1)

        return calibrator0, calibrator1
Example #11
    def train(self,
              training_trackers,  # type: List[DialogueStateTracker]
              domain,  # type: Domain
              **kwargs  # type: Any
              ):
        # type: (...) -> Dict[Text, Any]

        training_data = self.featurize_for_training(training_trackers,
                                                    domain,
                                                    **kwargs)

        X, y = self._extract_training_data(training_data)
        model = self.model_architecture(**kwargs)
        score = None
        # Note: clone is called throughout to avoid mutating default
        # arguments.
        self.label_encoder = clone(self.label_encoder).fit(y)
        Xt, yt = self._preprocess_data(X, y)

        if self.cv is None:
            model = clone(model).fit(Xt, yt)
        else:
            param_grid = self.param_grid or {}
            model, score = self._search_and_score(
                model, Xt, yt, param_grid)

        self.model = model
        logger.info("Done fitting sklearn policy model")
        if score is not None:
            logger.info("Cross validation score: {:.5f}".format(score))
Example #12
def test_kernel_clone_after_set_params():
    # This test is to verify that using set_params does not
    # break clone on kernels.
    # This used to break because in kernels such as the RBF, non-trivial
    # logic that modified the length scale used to be in the constructor
    # See https://github.com/scikit-learn/scikit-learn/issues/6961
    # for more details.
    bounds = (1e-5, 1e5)
    for kernel in kernels:
        kernel_cloned = clone(kernel)
        params = kernel.get_params()
        # ExpSineSquared and RationalQuadratic kernels are isotropic.
        isotropic_kernels = (ExpSineSquared, RationalQuadratic)
        if 'length_scale' in params and not isinstance(kernel,
                                                       isotropic_kernels):
            length_scale = params['length_scale']
            if np.iterable(length_scale):
                params['length_scale'] = length_scale[0]
                params['length_scale_bounds'] = bounds
            else:
                params['length_scale'] = [length_scale] * 2
                params['length_scale_bounds'] = bounds * 2
            kernel_cloned.set_params(**params)
            kernel_cloned_clone = clone(kernel_cloned)
            assert_equal(kernel_cloned_clone.get_params(),
                         kernel_cloned.get_params())
            assert_not_equal(id(kernel_cloned_clone), id(kernel_cloned))
            yield (check_hyperparameters_equal, kernel_cloned,
                   kernel_cloned_clone)
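Example #13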
def RunExp(StrModel:str, Param:str, FeaUsed:list, DataPath:str, Label:str, std:bool = False, N:int = 0):
	Data = np.genfromtxt(DataPath + Label, delimiter = ',', dtype = int)
	Data = Data[:, np.newaxis]

	for f in FeaUsed:
		T = (np.genfromtxt(DataPath + Features[f], delimiter = ',' , dtype = float))
		if len(T.shape) < 2:
			T = T[:, np.newaxis]
		Data = np.concatenate((Data, T), axis = 1)
	
	if N > 0:
		Data = Data[:N, :]

	Lbl = Data[:, 0]
	Fea = Data[:,1:]
	if std:
		scaler = preprocessing.StandardScaler()
		Fea = scaler.fit_transform(Fea)

	Model = base.clone(Models[StrModel])
	SetParam(Model, Param)

	Model.fit(Fea, Lbl)
	Pred = Model.predict_proba(Fea)[:, 1]
	st = metrics.precision_recall_curve(Lbl, Pred)
		
	Folds = cross_validation.KFold(Fea.shape[0], n_folds = 5)
	for train, valid in Folds:
		Model = base.clone(Models[StrModel])
		SetParam(Model, Param)
		Model.fit(Fea[train], Lbl[train])
		Pred[valid] = Model.predict_proba(Fea[valid])[:, 1]
	
	sv = metrics.precision_recall_curve(Lbl, Pred)
	return st, sv
Example #14
    def _validate_estimator(self):
        "Private function to validate SMOTE and ENN objects"

        if self.smote is not None:
            if isinstance(self.smote, SMOTE):
                self.smote_ = clone(self.smote)
            else:
                raise ValueError('smote needs to be a SMOTE object. '
                                 'Got {} instead.'.format(type(self.smote)))
        # Otherwise create a default SMOTE
        else:
            self.smote_ = SMOTE(
                sampling_strategy=self.sampling_strategy,
                random_state=self.random_state,
                ratio=self.ratio)

        if self.tomek is not None:
            if isinstance(self.tomek, TomekLinks):
                self.tomek_ = clone(self.tomek)
            else:
                raise ValueError('tomek needs to be a TomekLinks object. '
                                 'Got {} instead.'.format(type(self.tomek)))
        # Otherwise create a default TomekLinks
        else:
            self.tomek_ = TomekLinks(sampling_strategy='all')
Example #15
def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check that n_estimators_ increases monotonically with n_iter_no_change
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
Example #16
def make_classifiers(method, balanced, labels, selectors=None, columns=None, random_state=None):
    estimators = {}
    class_weight = None
    if balanced:
        class_weight = 'balanced'

    # Make the appropriate delegation
    if 'lr' in method:
        estimator = LogisticRegression(n_jobs=1)
    elif 'svm' in method:
        estimator = SVC(probability=False)
    elif 'rf' in method:
        estimator = RandomForestClassifier(n_jobs=1)
    else:
        raise ValueError("Not implemented for method {}".format(method))

    estimator = estimator.set_params(**{'class_weight': class_weight, 'random_state': random_state})
    if hasattr(estimator, 'n_jobs'):
        estimator.set_params(**{'n_jobs': 1})

    if 'bagged' in method:
        for l in labels:
            named_estimators = zip(columns, [clone(estimator) for _ in columns])
            weights = [1] * len(columns)
            estimators[l] = HybridFeatureVotingClassifier(
                named_estimators, selectors, voting='soft', weights=weights, n_jobs=4
            )
    else:
        for l in labels:
            estimators[l] = clone(estimator)
    return estimators
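Example #17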
 def fit(self, X, y):
     """Fit the shape function of each features with the backfitting algorithm.
     Please note that the shape functions are centered (not reduced).
     
     Parameters
     ----------
      X : array-like, shape=(n_samples, n_features)
          The input samples.

      y : array-like, shape=(n_samples,)
          The target values.

     Returns
     -------
     self : object
         The Generalized Additive Model with the fitted shape functions
     """
     
     n_samples, n_features = X.shape
     
     if not isinstance(self.smoothers, list):
         self.smoothers_ = [clone(self.smoothers) for i in range(n_features) ]
         self.ridge = RidgeCV(alphas = [self.ridge_alphas]*len(self.smoothers_), fit_intercept=False)
     else:
         self.smoothers_ = [clone(self.smoothers[j]) for j in range(n_features) ]
         self.ridge = RidgeCV(alphas = [self.ridge_alphas]*len(self.smoothers_), fit_intercept=False)
         
     self.y_mean_ = np.mean(y)
      self.rmse_ = [] # list to store the training error over the iterations
      y = y - self.y_mean_  # center y without mutating the caller's array in place
      temp = np.zeros(shape=(n_samples, n_features)) # array to store the shape functions for re-use in the next iteration
     shape_functions = np.zeros(shape=(n_samples, n_features))
     for i in range(self.max_iter):
         for j in range(n_features):
             # select all the columns except the j-th one
             idx = list(set(np.arange(0, n_features, 1)) - set([j])) 
             
              # Compute the residuals of the previous iteration
              residuals = y.reshape((n_samples, 1)) - temp[:, idx].sum(axis=1, keepdims=True).reshape((n_samples, 1))
              residuals -= residuals.mean()
             #print(np.amin(residuals), np.amax(residuals), 'iteration number %s'%(i+1))
            
             self.smoothers_[j].fit(X[:, j:j+1], residuals.reshape((n_samples,))) #reshape cause deprecation warning
             shape_functions[:, j]= self.smoothers_[j].predict(X[:, j:j+1])
             shape_functions[:, j] -= shape_functions[:, j].mean()
         
         # RidgeRegression on top of the shape function in order to 're-scale' each shape functions
         self.ridge.fit(shape_functions, y)
         coef = self.ridge.coef_
         shape_functions *= coef
         
         y_pred = shape_functions.sum(axis=1)
         y_pred -= y_pred.mean()
          self.rmse_.append(met.mean_squared_error(y, y_pred))  # note: stores MSE despite the rmse_ name
         
         temp=shape_functions.copy()
         #plt.scatter(1, np.abs(residuals.min()), c='g', label='iteration = %s'%i)
         #plt.scatter(2, np.abs(residuals.max()), c='r')
         #plt.legend()
         #plt.show()
     return self
Example #18
def pool_variance_h(X, y, candidate_mask, train_mask, classifier, n_candidates,
                   pool_n, C, n_jobs=-1, random_state=None, **kwargs):
    """ Return the candidate that will minimise the expected variance of the predictions.

        Parameters
        ----------
        X_training_candidates : array
            The feature matrix of the potential training candidates.

        C : float
            The regularisation parameter of Logistic Regression.

        pool_sample_size : int
            The size of the sample which will be used to estimate the variance/entropy.

        n_jobs : int
            The number of parallel jobs (-1 if want to use all cores)

        Returns
        -------
        best_candidate : int
            The index of the best candidate.
    """
    
    classes = classifier.classes_ # sorted lexicographically
    n_classes = len(classes)
    n_features = X.shape[1]
    variance = np.empty(len(candidate_mask))
    variance[:] = np.inf
    rng = RandomState(random_state)

    # the probabilities used to calculate expected value of pool
    probs = classifier.predict_proba(X[candidate_mask])

    # copy the classifier (avoid modifying the original classifier)
    classifier_plus = clone(classifier)

    # construct the sample pool (used to estimate the variance)
    unlabelled_indices = np.where(~train_mask)[0]
    pool_indices = rng.permutation(unlabelled_indices)[:pool_n]
    pool_mask = np.zeros(len(candidate_mask), dtype=bool)
    pool_mask[pool_indices] = True

    # let's look at each candidate
    candidate_indices = np.where(candidate_mask)[0]

    results = Parallel(n_jobs=n_jobs)(delayed(_parallel_variance_estimate)(
        X, y.copy(), train_mask.copy(), pool_mask,
        clone(classifier_plus), classes, n_classes, probs, i, index, C)
        for i, index in enumerate(candidate_indices))

    indices, expected = zip(*results)
    indices, expected = np.asarray(indices), np.asarray(expected)
    assert not np.isnan(expected).any(), 'Some expected values are undefined.'
    variance[indices] = expected

    # pick the candidate with the smallest expected variance
    best_candidates = np.argsort(variance)[:n_candidates]
    return best_candidates
Example #19
def test_all_estimators():
    # Test that estimators are default-constructible, clonable
    # and have working repr.
    estimators = all_estimators(include_meta_estimators=True)
    classifier = LDA()

    for name, Estimator in estimators:
        # some can just not be sensibly default constructed
        if name in dont_test:
            continue
        # test default-constructibility
        # get rid of deprecation warnings
        with warnings.catch_warnings(record=True):
            if name in meta_estimators:
                estimator = Estimator(classifier)
            else:
                estimator = Estimator()
            # test cloning
            clone(estimator)
            # test __repr__
            repr(estimator)
            # test that set_params returns self
            assert_true(isinstance(estimator.set_params(), Estimator))

            # test if init does nothing but set parameters
            # this is important for grid_search etc.
            # We get the default parameters from init and then
            # compare these against the actual values of the attributes.

            # this comes from getattr. Gets rid of deprecation decorator.
            init = getattr(estimator.__init__, 'deprecated_original',
                           estimator.__init__)
            try:
                args, varargs, kws, defaults = inspect.getargspec(init)
            except TypeError:
                # init is not a python function.
                # true for mixins
                continue
            params = estimator.get_params()
            if name in meta_estimators:
                # they need a non-default argument
                args = args[2:]
            else:
                args = args[1:]
            if args:
                # non-empty list
                assert_equal(len(args), len(defaults))
            else:
                continue
            for arg, default in zip(args, defaults):
                if arg not in params.keys():
                    # deprecated parameter, not in get_params
                    assert_true(default is None)
                    continue

                if isinstance(params[arg], np.ndarray):
                    assert_array_equal(params[arg], default)
                else:
                    assert_equal(params[arg], default)
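Example #20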
def test_clone():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[lr, ridge],
                                 meta_regressor=svr_rbf,
                                 store_train_meta_features=True)
    clone(stregr)
Example #21
def test_clone():
    knn = KNeighborsClassifier()
    lr = LogisticRegression(multi_class='ovr', solver='liblinear')
    gnb = GaussianNB()
    stclf = StackingCVClassifier(classifiers=[knn, gnb],
                                 meta_classifier=lr,
                                 store_train_meta_features=True)
    clone(stclf)
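Example #22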
def test_clone():

    knn = KNeighborsClassifier()
    lr = LogisticRegression()
    gnb = GaussianNB()
    stclf = StackingClassifier(classifiers=[knn, gnb],
                               meta_classifier=lr,
                               store_train_meta_features=True)
    clone(stclf)
Example #23
def test_clone_empty_array():
    # Regression test for cloning estimators with empty arrays
    clf = MyEstimator(empty=np.array([]))
    clf2 = clone(clf)
    assert_array_equal(clf.empty, clf2.empty)

    clf = MyEstimator(empty=sp.csr_matrix(np.array([[0]])))
    clf2 = clone(clf)
    assert_array_equal(clf.empty.data, clf2.empty.data)
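Example #24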
def test_clone():

    mlp = MLP(epochs=5,
              eta=0.05,
              hidden_layers=[10],
              minibatches=len(y),
              random_seed=1)

    clone(mlp)
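Example #25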
def test_clone():

    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)
    clone(eclf)
Example #26
def test_clone():
    from sklearn.base import clone

    a = mcmc.FMRegression()
    b = clone(a)
    assert a.get_params() == b.get_params()

    a = mcmc.FMClassification()
    b = clone(a)
    assert a.get_params() == b.get_params()
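
Examples #23 and #26 both exercise the same contract: clone() rebuilds an estimator from its constructor parameters (as reported by get_params()) and discards any fitted state. A minimal sketch of that behaviour, using an arbitrary scikit-learn estimator for illustration:

from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

est = LogisticRegression(C=0.5).fit([[0.0], [1.0]], [0, 1])
est_clone = clone(est)
assert est_clone.get_params() == est.get_params()  # constructor params survive
assert not hasattr(est_clone, "coef_")             # fitted state is dropped
assert est_clone is not est                        # a new, independent object
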
Example #27
def test_unit_weights_vs_no_weights():
    # not passing any sample weights should be equivalent
    # to all weights equal to one
    sample_weight = np.ones(n_samples)
    for estimator in [KMeans(n_clusters=n_clusters, random_state=42),
                      MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
Example #28
def test_scaled_weights():
    # scaling all sample weights by a common factor
    # shouldn't change the result
    sample_weight = np.ones(n_samples)
    for estimator in [KMeans(n_clusters=n_clusters, random_state=42),
                      MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=0.5*sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
Example #29
    def fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""

        parameter_iterable = ParameterSampler(self.param_distributions,
                                              self.n_iter,
                                              random_state=self.random_state)
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring,
                                      parameters, cv=cv)
            for parameters in parameter_iterable)

        best = sorted(out, reverse=True)[0]
        self.best_params_ = best[1]
        self.best_score_ = best[0]

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best[1])
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
Example #30
def check_parameters_default_constructible(name, Estimator):
    classifier = LDA()
    # test default-constructibility
    # get rid of deprecation warnings
    with warnings.catch_warnings(record=True):
        if name in META_ESTIMATORS:
            estimator = Estimator(classifier)
        else:
            estimator = Estimator()
        # test cloning
        clone(estimator)
        # test __repr__
        repr(estimator)
        # test that set_params returns self
        assert_true(estimator.set_params() is estimator)

        # test if init does nothing but set parameters
        # this is important for grid_search etc.
        # We get the default parameters from init and then
        # compare these against the actual values of the attributes.

        # this comes from getattr. Gets rid of deprecation decorator.
        init = getattr(estimator.__init__, 'deprecated_original',
                       estimator.__init__)
        try:
            args, varargs, kws, defaults = inspect.getargspec(init)
        except TypeError:
            # init is not a python function.
            # true for mixins
            return
        params = estimator.get_params()
        if name in META_ESTIMATORS:
            # they need a non-default argument
            args = args[2:]
        else:
            args = args[1:]
        if args:
            # non-empty list
            assert_equal(len(args), len(defaults))
        else:
            return
        for arg, default in zip(args, defaults):
            assert_in(type(default), [str, int, float, bool, tuple, type(None),
                                      np.float64, types.FunctionType, Memory])
            if arg not in params.keys():
                # deprecated parameter, not in get_params
                assert_true(default is None)
                continue

            if isinstance(params[arg], np.ndarray):
                assert_array_equal(params[arg], default)
            else:
                assert_equal(params[arg], default)
Example #31
def plot_silhouette(clf,
                    X,
                    title='Silhouette Analysis',
                    metric='euclidean',
                    copy=True,
                    ax=None,
                    figsize=None,
                    title_fontsize="large",
                    text_fontsize="medium"):
    """Plots silhouette analysis of clusters using fit_predict.

    Args:
        clf: Clusterer instance that implements ``fit`` and ``fit_predict`` methods.

        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features.

        title (string, optional): Title of the generated plot. Defaults to "Silhouette Analysis"

        metric (string or callable, optional): The metric to use when calculating distance
            between instances in a feature array. If metric is a string, it must be one of
            the options allowed by sklearn.metrics.pairwise.pairwise_distances. If X is
            the distance array itself, use "precomputed" as the metric.

        copy (boolean, optional): Determines whether ``fit`` is used on **clf** or on a
            copy of **clf**.

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to plot
            the learning curve. If None, the plot is drawn on a new set of axes.

        figsize (2-tuple, optional): Tuple denoting figure size of the plot e.g. (6, 6).
            Defaults to ``None``.

        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults to "large".

        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults to "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> kmeans = KMeans(n_clusters=4, random_state=1)
        >>> skplt.plot_silhouette(kmeans, X)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_silhouette.png
           :align: center
           :alt: Silhouette Plot
    """
    if copy:
        clf = clone(clf)

    cluster_labels = clf.fit_predict(X)

    n_clusters = len(set(cluster_labels))

    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)

    sample_silhouette_values = silhouette_samples(X,
                                                  cluster_labels,
                                                  metric=metric)

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)
    ax.set_xlim([-0.1, 1])

    ax.set_ylim([0, len(X) + (n_clusters + 1) * 10 + 10])

    ax.set_xlabel('Silhouette coefficient values', fontsize=text_fontsize)
    ax.set_ylabel('Cluster label', fontsize=text_fontsize)

    y_lower = 10

    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels
                                                                 == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)  # cm.spectral was removed in matplotlib 2.2

        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0,
                         ith_cluster_silhouette_values,
                         facecolor=color,
                         edgecolor=color,
                         alpha=0.7)

        ax.text(-0.05,
                y_lower + 0.5 * size_cluster_i,
                str(i),
                fontsize=text_fontsize)

        y_lower = y_upper + 10

    ax.axvline(x=silhouette_avg,
               color="red",
               linestyle="--",
               label='Silhouette score: {0:0.3f}'.format(silhouette_avg))

    ax.set_yticks([])  # Clear the y-axis labels / ticks
    ax.set_xticks(np.arange(-0.1, 1.0, 0.2))

    ax.tick_params(labelsize=text_fontsize)
    ax.legend(loc='best', fontsize=text_fontsize)

    return ax
Example #32
def plot_elbow_curve(clf,
                     X,
                     title='Elbow Plot',
                     cluster_ranges=None,
                     ax=None,
                     figsize=None,
                     title_fontsize="large",
                     text_fontsize="medium"):
    """Plots elbow curve of different values of K for KMeans clustering.

    Args:
        clf: Clusterer instance that implements ``fit`` and ``fit_predict`` methods and an
            ``n_clusters`` parameter.

        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features.

        title (string, optional): Title of the generated plot. Defaults to "Elbow Plot"

        cluster_ranges (None or :obj:`list` of int, optional): List of n_clusters for which
            to plot the explained variances. Defaults to ``range(1, 12, 2)``.

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to plot
            the learning curve. If None, the plot is drawn on a new set of axes.

        figsize (2-tuple, optional): Tuple denoting figure size of the plot e.g. (6, 6).
            Defaults to ``None``.

        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults to "large".

        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults to "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> kmeans = KMeans(random_state=1)
        >>> skplt.plot_elbow_curve(kmeans, cluster_ranges=range(1, 11))
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_elbow_curve.png
           :align: center
           :alt: Elbow Curve
    """
    if cluster_ranges is None:
        cluster_ranges = range(1, 12, 2)
    else:
        cluster_ranges = sorted(cluster_ranges)

    if not hasattr(clf, 'n_clusters'):
        raise TypeError('"n_clusters" attribute not in classifier. '
                        'Cannot plot elbow method.')

    clfs = []
    for i in cluster_ranges:
        current_clf = clone(clf)
        setattr(current_clf, "n_clusters", i)
        clfs.append(current_clf.fit(X))

    centroids = [k.cluster_centers_ for k in clfs]

    D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
    dist = [np.min(D, axis=1) for D in D_k]
    # avgWithinSS = [np.sum(d)/X.shape[0] for d in dist]

    wcss = [np.sum(d**2) for d in dist]
    tss = np.sum(pdist(X)**2) / X.shape[0]
    bss = tss - wcss

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)
    ax.plot(cluster_ranges, bss / tss * 100, 'b*-')
    ax.grid(True)
    ax.set_xlabel('Number of clusters', fontsize=text_fontsize)
    ax.set_ylabel('Percent variance explained', fontsize=text_fontsize)
    ax.tick_params(labelsize=text_fontsize)

    return ax
Example #33
 def _init_projectors(self, X):       
     self.projectors = [clone(bproj) for bproj in self.bprojs]
Example #34
def learning_curve(estimator,
                   X_train,
                   X_valid,
                   score="f1",
                   train_sizes=None,
                   hparams=None,
                   shuffle=False,
                   random_state=None):
    """ Given training and validation data, this function produces learning curves
        (as lists of scores) for a given estimator.

    Args:
        estimator: a model to be inspected
        X_train (list(AnnotatedDocument)): training data
        X_valid (list(AnnotatedDocument)): validation data
        score: the type of scores to be produced, one of {'precision', 'recall', 'f1'}
        train_sizes (list(float)): relative sizes of training subsets
        hparams (dict): hyper-parameters to be passed to a model for training
        shuffle (bool): if True, training data is shuffled
        random_state (int): used when shuffle=True to ensure reproducible results

    Returns:
        train_sizes (list(float)): relative sizes of training subsets
        train_scores (list(float)): model scores on training subsets of respective sizes
        valid_scores (list(float)): model scores on validation data
    """

    # check model type
    if isinstance(estimator, NamedEntityRecognitionModel):
        annotation_type = "annotation"
        if isinstance(estimator, ModelEnsembleNER):
            annotation_labels = set()
            for model in estimator.models:
                annotation_labels.update(model.entity_labels)
            annotation_labels = list(annotation_labels)
        else:
            annotation_labels = estimator.entity_labels
    elif isinstance(estimator, RelationExtractionModel):
        annotation_type = "relation"
        if isinstance(estimator, REModelEnsemble):
            annotation_labels = set()
            for model in estimator.models:
                annotation_labels.update(model.relation_labels)
            annotation_labels = list(annotation_labels)
        else:
            annotation_labels = estimator.relation_labels
    else:
        raise TypeError(
            "Given estimator is of type '{}' which is not supported".format(
                type(estimator)))

    # determine annotation label
    if annotation_labels:
        if len(annotation_labels) > 1:
            log.debug(
                "Learning curves currently support either one label or all labels: building for all labels"
            )
            annotation_label = None
        else:
            annotation_label = annotation_labels[0]
    else:
        annotation_label = None

    # make default train sizes as fractions
    if not train_sizes:
        train_sizes = [s * 0.1 for s in range(1, 11)]

    # shuffle training data if necessary
    if shuffle:
        if random_state:
            random.Random(random_state).shuffle(X_train)
        else:
            random.shuffle(X_train)

    # collect scores for each training subset
    train_scores = []
    valid_scores = []

    for train_size in train_sizes:
        docs_to_train = X_train[:int(train_size * len(X_train))]
        if not docs_to_train:
            log.debug("No documents to train: check your train sizes")

        base_estimator = clone(estimator)

        if hparams:
            base_estimator.fit(X=docs_to_train, y=None, **hparams)
        else:
            base_estimator.fit(X=docs_to_train, y=None)

        X_train_pred = base_estimator.transform(docs_to_train)
        X_valid_pred = base_estimator.transform(X_valid)

        score_train = annotation_precision_recall_f1score(
            X_train_pred,
            docs_to_train,
            ann_label=annotation_label,
            ann_type=annotation_type)

        score_valid = annotation_precision_recall_f1score(
            X_valid_pred,
            X_valid,
            ann_label=annotation_label,
            ann_type=annotation_type)

        if score == "precision":
            train_scores.append(score_train[0])
            valid_scores.append(score_valid[0])
        elif score == "recall":
            train_scores.append(score_train[1])
            valid_scores.append(score_valid[1])
        elif score == "f1":
            train_scores.append(score_train[2])
            valid_scores.append(score_valid[2])
        else:
            raise ValueError(
                "Cannot determine the type of scoring '{}'".format(score))

    return train_sizes, train_scores, valid_scores
Example #35
def test_clonable(est):
    # fit it, then clone it
    est.fit(y)
    est2 = clone(est)
    assert isinstance(est2, est.__class__)
    assert est is not est2
Example #36
    def _fit(self, X, y, feature_axis=2):
        X, y = check_ts_X_y(X, y, "csr")
        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[feature_axis]

        if self.max_features is not None:
            if not isinstance(self.max_features, numbers.Integral):
                raise TypeError(
                    "'max_features' should be an integer between 1 and {} features."
                    " Got {!r} instead.".format(n_features, self.max_features))
            elif self.max_features < 1 or self.max_features > n_features:
                raise ValueError(
                    "'max_features' should be between 1 and {} features."
                    " Got {} instead.".format(n_features, self.max_features))
            max_features = self.max_features
        else:
            max_features = n_features

        if not isinstance(self.n_gen_no_change,
                          (numbers.Integral, np.integer, type(None))):
            raise ValueError(
                "'n_gen_no_change' should either be None or an integer."
                " {} was passed.".format(self.n_gen_no_change))

        estimator = clone(self.estimator)

        # Genetic Algorithm
        toolbox = base.Toolbox()

        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual",
                         tools.initRepeat,
                         creator.Individual,
                         toolbox.attr_bool,
                         n=n_features)
        toolbox.register("population", tools.initRepeat, list,
                         toolbox.individual)
        toolbox.register("evaluate",
                         _evalFunction,
                         gaobject=self,
                         estimator=estimator,
                         X=X,
                         y=y,
                         cv=cv,
                         scorer=scorer,
                         verbose=self.verbose,
                         fit_params=self.fit_params,
                         max_features=max_features,
                         caching=self.caching,
                         feature_axis=feature_axis)
        toolbox.register("mate",
                         tools.cxUniform,
                         indpb=self.crossover_independent_proba)
        toolbox.register("mutate",
                         tools.mutFlipBit,
                         indpb=self.mutation_independent_proba)
        toolbox.register("select",
                         tools.selTournament,
                         tournsize=self.tournament_size)

        if self.n_jobs == 0:
            raise ValueError("n_jobs == 0 has no meaning.")
        elif self.n_jobs > 1:
            pool = multiprocessing.Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        elif self.n_jobs < 0:
            pool = multiprocessing.Pool(
                processes=max(cpu_count() + 1 + self.n_jobs, 1))
            toolbox.register("map", pool.map)

        pop = toolbox.population(n=self.n_population)
        hof = tools.HallOfFame(1, similar=np.array_equal)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean, axis=0)
        stats.register("std", np.std, axis=0)
        stats.register("min", np.min, axis=0)
        stats.register("max", np.max, axis=0)

        if self.verbose > 0:
            print("Selecting features with genetic algorithm.")

        _, log = _eaFunction(pop,
                             toolbox,
                             cxpb=self.crossover_proba,
                             mutpb=self.mutation_proba,
                             ngen=self.n_generations,
                             ngen_no_change=self.n_gen_no_change,
                             stats=stats,
                             halloffame=hof,
                             verbose=self.verbose)
        if self.n_jobs != 1:
            pool.close()
            pool.join()

        # Set final attributes
        support_ = np.array(hof, dtype=bool)[0]  # np.bool is deprecated; use builtin bool
        self.estimator_ = clone(self.estimator)
        _X = apply_mask(X, support_, feature_axis=feature_axis)
        self.estimator_.fit(_X, y)

        self.generation_scores_ = np.array(
            [score for score, _ in log.select("max")])
        self.n_features_ = support_.sum()
        self.support_ = support_

        return self
Example #37
    def _validate_estimator(self):
        # FIXME: in 0.6 call super()
        SparseBaseSMOTE._validate_estimator(self)
        # FIXME: remove in 0.6 after deprecation cycle
        if self.kind != 'deprecated' and not (self.kind == 'borderline-1'
                                              or self.kind == 'borderline-2'):
            if self.kind not in SMOTE_KIND:
                raise ValueError('Unknown kind for SMOTE algorithm.'
                                 ' Choices are {}. Got {} instead.'.format(
                                     SMOTE_KIND, self.kind))
            else:
                warnings.warn(
                    '"kind" is deprecated in 0.4 and will be '
                    'removed in 0.6. Use SMOTE, BorderlineSMOTE or '
                    'SVMSMOTE instead.', DeprecationWarning)

            if self.kind == 'borderline1' or self.kind == 'borderline2':
                self._sample = types.MethodType(SparseBorderlineSMOTE._sample,
                                                self)
                self.kind = ('borderline-1'
                             if self.kind == 'borderline1' else 'borderline-2')

            elif self.kind == 'svm':
                self._sample = types.MethodType(SparseSVMSMOTE._sample, self)

                if self.out_step == 'deprecated':
                    self.out_step = 0.5
                else:
                    warnings.warn(
                        '"out_step" is deprecated in 0.4 and will '
                        'be removed in 0.6. Use SVMSMOTE class '
                        'instead.', DeprecationWarning)

                if self.svm_estimator == 'deprecated':
                    warnings.warn(
                        '"svm_estimator" is deprecated in 0.4 and '
                        'will be removed in 0.6. Use SVMSMOTE class '
                        'instead.', DeprecationWarning)
                if (self.svm_estimator is None
                        or self.svm_estimator == 'deprecated'):
                    self.svm_estimator_ = SVC(gamma='scale',
                                              random_state=self.random_state)
                elif isinstance(self.svm_estimator, SVC):
                    self.svm_estimator_ = clone(self.svm_estimator)
                else:
                    raise_isinstance_error('svm_estimator', [SVC],
                                           self.svm_estimator)

            if self.kind != 'regular':
                if self.m_neighbors == 'deprecated':
                    self.m_neighbors = 10
                else:
                    warnings.warn(
                        '"m_neighbors" is deprecated in 0.4 and '
                        'will be removed in 0.6. Use SVMSMOTE class '
                        'or BorderlineSMOTE instead.', DeprecationWarning)

                self.nn_m_ = check_neighbors_object('m_neighbors',
                                                    self.m_neighbors,
                                                    additional_neighbor=1)
                self.nn_m_.set_params(**{'n_jobs': self.n_jobs})
Example #38
    def __init__(self,
                 *,
                 model_y='auto',
                 model_t='auto',
                 featurizer=None,
                 discrete_treatment=False,
                 categories='auto',
                 cv=2,
                 n_crossfit_splits='raise',
                 mc_iters=None,
                 mc_agg='mean',
                 n_estimators=100,
                 criterion="mse",
                 max_depth=None,
                 min_samples_split=10,
                 min_samples_leaf=5,
                 min_weight_fraction_leaf=0.,
                 min_var_fraction_leaf=None,
                 min_var_leaf_on_val=True,
                 max_features="auto",
                 min_impurity_decrease=0.,
                 max_samples=.45,
                 min_balancedness_tol=.45,
                 honest=True,
                 inference=True,
                 fit_intercept=True,
                 subforest_size=4,
                 n_jobs=-1,
                 random_state=None,
                 verbose=0,
                 warm_start=False):

        # TODO: consider whether we need more care around stateful featurizers,
        #       since we clone it and fit separate copies
        self.model_y = clone(model_y, safe=False)
        self.model_t = clone(model_t, safe=False)
        self.featurizer = clone(featurizer, safe=False)
        self.discrete_treatment = discrete_treatment
        self.categories = categories
        self.cv = cv
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.min_var_fraction_leaf = min_var_fraction_leaf
        self.min_var_leaf_on_val = min_var_leaf_on_val
        self.max_features = max_features
        self.min_impurity_decrease = min_impurity_decrease
        self.max_samples = max_samples
        self.min_balancedness_tol = min_balancedness_tol
        self.honest = honest
        self.inference = inference
        self.fit_intercept = fit_intercept
        self.subforest_size = subforest_size
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.warm_start = warm_start
        self.n_crossfit_splits = n_crossfit_splits
        if self.n_crossfit_splits != 'raise':
            cv = self.n_crossfit_splits
        super().__init__(discrete_treatment=discrete_treatment,
                         categories=categories,
                         cv=cv,
                         n_splits=n_crossfit_splits,
                         mc_iters=mc_iters,
                         mc_agg=mc_agg,
                         random_state=random_state)
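# Hedged usage sketch: this __init__ matches econml's CausalForestDML, so the
# construction below assumes that class; the arrays Y, T, X are synthetic and
# purely illustrative.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from econml.dml import CausalForestDML

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 4))
T = rng.normal(size=(500,))
Y = T * X[:, 0] + rng.normal(size=(500,))
est = CausalForestDML(model_y=RandomForestRegressor(),
                      model_t=RandomForestRegressor(),
                      n_estimators=100)
est.fit(Y, T, X=X)
treatment_effects = est.effect(X)  # heterogeneous effect at each row of X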
Exemplo n.º 39
0
    def _gen_featurizer(self):
        return clone(self.featurizer, safe=False)
Exemplo n.º 40
0
    def __init__(self, model_Y_W, model_T_W, model_T_WZ):
        self._model_Y_W = clone(model_Y_W, safe=False)
        self._model_T_W = clone(model_T_W, safe=False)
        self._model_T_WZ = clone(model_T_WZ, safe=False)
Exemplo n.º 41
0
    def _gen_ortho_learner_model_final(self):
        return _BaseDMLIVModelFinal(
            _FinalWrapper(clone(self.model_final, safe=False),
                          fit_cate_intercept=False,
                          featurizer=clone(self.featurizer, safe=False),
                          use_weight_trick=True))
Exemplo n.º 42
0
    def fit(self, X, y):
        """Fit Gaussian process regression model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples, [n_output_dims])
            Target values

        Returns
        -------
        self : returns an instance of self.
        """
        if self.kernel is None:  # Use an RBF kernel as default
            self.kernel_ = C(1.0, constant_value_bounds="fixed") \
                * RBF(1.0, length_scale_bounds="fixed")
        else:
            self.kernel_ = clone(self.kernel)

        self.rng = check_random_state(self.random_state)

        X, y = check_X_y(X, y, multi_output=True, y_numeric=True)

        # Normalize target value
        if self.normalize_y:
            self.y_train_mean = np.mean(y, axis=0)
            # demean y
            y = y - self.y_train_mean
        else:
            self.y_train_mean = np.zeros(1)

        if np.iterable(self.alpha) \
           and self.alpha.shape[0] != y.shape[0]:
            if self.alpha.shape[0] == 1:
                self.alpha = self.alpha[0]
            else:
                raise ValueError(
                    "alpha must be a scalar or an array"
                    " with same number of entries as y. (%d != %d)" %
                    (self.alpha.shape[0], y.shape[0]))

        self.X_train_ = np.copy(X) if self.copy_X_train else X
        self.y_train_ = np.copy(y) if self.copy_X_train else y

        if self.optimizer is not None and self.kernel_.n_dims > 0:
            # Choose hyperparameters based on maximizing the log-marginal
            # likelihood (potentially starting from several initial values)
            def obj_func(theta, eval_gradient=True):
                if eval_gradient:
                    lml, grad = self.log_marginal_likelihood(
                        theta, eval_gradient=True)
                    return -lml, -grad
                else:
                    return -self.log_marginal_likelihood(theta)

            # First optimize starting from theta specified in kernel
            optima = [(self._constrained_optimization(obj_func,
                                                      self.kernel_.theta,
                                                      self.kernel_.bounds))]

            # Additional runs are performed from log-uniform chosen initial
            # theta
            if self.n_restarts_optimizer > 0:
                if not np.isfinite(self.kernel_.bounds).all():
                    raise ValueError(
                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                        "requires that all bounds are finite.")
                bounds = self.kernel_.bounds
                for iteration in range(self.n_restarts_optimizer):
                    theta_initial = \
                        self.rng.uniform(bounds[:, 0], bounds[:, 1])
                    optima.append(
                        self._constrained_optimization(obj_func, theta_initial,
                                                       bounds))
            # Select result from run with minimal (negative) log-marginal
            # likelihood
            lml_values = list(map(itemgetter(1), optima))
            self.kernel_.theta = optima[np.argmin(lml_values)][0]
            self.log_marginal_likelihood_value_ = -np.min(lml_values)
        else:
            self.log_marginal_likelihood_value_ = \
                self.log_marginal_likelihood(self.kernel_.theta)

        # Precompute quantities required for predictions which are independent
        # of actual query points
        K = self.kernel_(self.X_train_)
        K[np.diag_indices_from(K)] += self.alpha
        self.L_ = cholesky(K, lower=True)  # Line 2
        self.alpha_ = cho_solve((self.L_, True), self.y_train_)  # Line 3

        return self
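# Hedged usage sketch of the fit method above, assuming the scikit-learn
# GaussianProcessRegressor API that this implementation mirrors; the 1-D toy
# data is made up for illustration.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

rng = np.random.RandomState(0)
X = rng.uniform(0, 5, size=(40, 1))
y = np.sin(X).ravel() + 0.1 * rng.normal(size=40)

gpr = GaussianProcessRegressor(kernel=C(1.0) * RBF(1.0),
                               alpha=0.01, n_restarts_optimizer=5,
                               normalize_y=True, random_state=0)
gpr.fit(X, y)
# mean prediction and pointwise predictive std on a dense grid
y_mean, y_std = gpr.predict(np.linspace(0, 5, 100).reshape(-1, 1),
                            return_std=True)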
Exemplo n.º 43
0
def classification(estimator, cv, X, y, groups=None, perm=None, n_jobs=1):
    """Do a classification.

    Parameters:
        estimator: a classifier object from sklearn

        cv: a cross-validation object from sklearn

        X: The Data, array of size n_samples x n_features

        y: the labels, array of size n_samples

        groups: optional, groups for groups based cross-validations

        perm: optional, None means no permutations will be computed,
            otherwise set here the number of permutations

        n_jobs: optional, default: 1, number of threads to use for the
            cross-validations. Higher means faster; setting it to -1 will use
            all available threads. Warning: may slow down the computer.

    Returns:
        save: a dictionary containing:
            acc_score: the mean score across all cross-validations using the
            accuracy scoring method
            auc_score: the mean score across all cross-validations using the
            roc_auc scoring method
            acc: the list of all cross-validations accuracy scores
            auc: the list of all cross-validations roc_auc scores

        if perm is not None, it also contains:
            auc_pvalue: the pvalue using roc_auc as a scoring method
            acc_pvalue: the pvalue using accuracy as a scoring method
            auc_pscores: a list of all permutation auc scores
            acc_pscores: a list of all permutation accuracy scores

    """
    y = np.asarray(y)
    X = np.asarray(X)
    if len(X) != len(y):
        raise ValueError("Dimension mismatch for X and y : {}, {}".format(
            len(X), len(y)))
    if groups is not None:
        try:
            if len(y) != len(groups):
                raise ValueError("dimension mismatch for groups and y")
        except TypeError:
            print(
                "Error in classification: y or",
                "groups is not a list or similar structure",
            )
            exit()
    clf = clone(estimator)
    accuracies, aucs = cross_val_scores(clf, cv, X, y, groups, n_jobs)
    acc_score = np.mean(accuracies)
    auc_score = np.mean(aucs)
    save = {
        "acc_score": [acc_score],
        "auc_score": [auc_score],
        "acc": accuracies,
        "auc": aucs,
        "n_splits": cv.get_n_splits(X, y, groups),
    }
    if perm is not None:
        acc_pscores, auc_pscores = permutation_test(clf, cv, X, y, groups,
                                                    perm, n_jobs)
        acc_pvalue = compute_pval(acc_score, acc_pscores)
        auc_pvalue = compute_pval(auc_score, auc_pscores)

        save.update({
            "auc_pvalue": auc_pvalue,
            "acc_pvalue": acc_pvalue,
            "auc_pscores": auc_pscores,
            "acc_pscores": acc_pscores,
        })

    return save
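# Hedged usage sketch for the helper above; cross_val_scores, permutation_test
# and compute_pval are project-local helpers assumed to be defined in the same
# module, and the data below is synthetic.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, random_state=0)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results = classification(LogisticRegression(), cv, X, y, perm=100, n_jobs=1)
print(results["acc_score"], results["acc_pvalue"])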
Exemplo n.º 44
0
    def fit(self, X, y):
        """Fit a receptive field model.

        Parameters
        ----------
        X : array, shape (n_times[, n_epochs], n_features)
            The input features for the model.
        y : array, shape (n_times[, n_epochs][, n_outputs])
            The output features for the model.

        Returns
        -------
        self : instance
            The instance so you can chain operations.
        """
        if self.scoring not in _SCORERS.keys():
            raise ValueError('scoring must be one of %s, got '
                             '%s' % (sorted(_SCORERS.keys()), self.scoring))
        from sklearn.base import clone
        X, y, _, self._y_dim = self._check_dimensions(X, y)

        if self.tmin > self.tmax:
            raise ValueError('tmin (%s) must be at most tmax (%s)' %
                             (self.tmin, self.tmax))
        # Initialize delays
        self.delays_ = _times_to_delays(self.tmin, self.tmax, self.sfreq)

        # Define the slice that we should use in the middle
        self.valid_samples_ = _delays_to_slice(self.delays_)

        if isinstance(self.estimator, numbers.Real):
            if self.fit_intercept is None:
                self.fit_intercept = True
            estimator = TimeDelayingRidge(self.tmin,
                                          self.tmax,
                                          self.sfreq,
                                          alpha=self.estimator,
                                          fit_intercept=self.fit_intercept,
                                          n_jobs=self.n_jobs,
                                          edge_correction=self.edge_correction)
        elif is_regressor(self.estimator):
            estimator = clone(self.estimator)
            if self.fit_intercept is not None and \
                    estimator.fit_intercept != self.fit_intercept:
                raise ValueError(
                    'Estimator fit_intercept (%s) != initialization '
                    'fit_intercept (%s), initialize ReceptiveField with the '
                    'same fit_intercept value or use fit_intercept=None' %
                    (estimator.fit_intercept, self.fit_intercept))
            self.fit_intercept = estimator.fit_intercept
        else:
            raise ValueError('`estimator` must be a float or an instance'
                             ' of `BaseEstimator`,'
                             ' got type %s.' % type(self.estimator))
        self.estimator_ = estimator
        del estimator
        _check_estimator(self.estimator_)

        # Extract data dimensions
        n_times, n_epochs, n_feats = X.shape
        n_outputs = y.shape[-1]
        n_delays = len(self.delays_)

        # Check that provided feature names match the data
        if ((self.feature_names is not None)
                and (len(self.feature_names) != n_feats)):
            raise ValueError('n_features in X does not match feature names '
                             '(%s != %s)' % (n_feats, len(self.feature_names)))

        # Create input features
        X, y = self._delay_and_reshape(X, y)

        self.estimator_.fit(X, y)
        coef = get_coef(self.estimator_, 'coef_')  # (n_targets, n_features)
        shape = [n_feats, n_delays]
        if self._y_dim > 1:
            shape.insert(0, -1)
        self.coef_ = coef.reshape(shape)

        # Inverse-transform model weights
        if self.patterns:
            if isinstance(self.estimator_, TimeDelayingRidge):
                cov_ = self.estimator_.cov_ / float(n_times * n_epochs - 1)
                y = y.reshape(-1, y.shape[-1], order='F')
            else:
                X = X - X.mean(0, keepdims=True)
                cov_ = np.cov(X.T)
            del X

            # Inverse output covariance
            if y.ndim == 2 and y.shape[1] != 1:
                y = y - y.mean(0, keepdims=True)
                inv_Y = linalg.pinv(np.cov(y.T))
            else:
                inv_Y = 1. / float(n_times * n_epochs - 1)
            del y

            # Inverse coef according to Haufe's method
            # patterns has shape (n_feats * n_delays, n_outputs)
            coef = np.reshape(self.coef_, (n_feats * n_delays, n_outputs))
            patterns = cov_.dot(coef.dot(inv_Y))
            self.patterns_ = patterns.reshape(shape)

        return self
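# Hedged usage sketch, assuming this fit method belongs to
# mne.decoding.ReceptiveField: a time-lagged (receptive-field) regression.
# The continuous data below is made up for illustration.
import numpy as np
from mne.decoding import ReceptiveField

sfreq = 100.
rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 3))                       # (n_times, n_features)
y = np.convolve(X[:, 0], np.hanning(10), mode='same')

rf = ReceptiveField(tmin=-0.1, tmax=0.3, sfreq=sfreq,
                    estimator=1.0,  # float -> TimeDelayingRidge with alpha=1.0
                    scoring='corrcoef', patterns=True)
rf.fit(X, y)
coefs = rf.coef_  # shape (n_features, n_delays) for 1-D targets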
Exemplo n.º 45
0
    safe_print('SCORES')
    safe_print('%6s' % 'size', end=' | ')

    for name in sorted(names):
        safe_print('%s' % names[name], end=' | ')
    safe_print()

    for size in sizes:
        n = int(np.floor(size / 2))

        X, y = make_friedman1(n_samples=size, random_state=SEED)

        safe_print('%6i' % n, end=' | ')
        for name in sorted(names):
            e = clone(ESTIMATORS[names[name]])
            t0 = time()
            e.fit(X[:n], y[:n])
            t1 = time() - t0
            times[names[name]].append(t1)

            s = rmse(y[n:], e.predict(X[n:]))
            scores[names[name]].append(s)

            safe_print('%8.2f' % (s), end=' | ', flush=True)

        safe_print()

    safe_print('\nFIT TIMES')
    safe_print('%6s' % 'size', end=' | ')
Exemplo n.º 46
0
 def test_lr_scheduler_cloneable(self):
     # reproduces bug #271
     scheduler = LRScheduler(CyclicLR, base_lr=123)
     clone(scheduler)  # does not raise
Exemplo n.º 47
0
def model_efficiency(embs,
                     labels,
                     model=LogisticRegression(),
                     validation=False,
                     reinitialize=True,
                     **params):
    X_train, X_valid, y_train, y_valid = train_test_split(embs,
                                                          labels,
                                                          train_size=0.7,
                                                          random_state=42,
                                                          shuffle=True,
                                                          stratify=labels)

    y_train, y_valid = np.array(y_train), np.array(y_valid)

    if issubclass(type(model),
                  (tensorflow.python.keras.engine.sequential.Sequential,
                   tensorflow.keras.Model)):
        params['validation_data'] = (X_valid, y_valid)

    if reinitialize:
        if issubclass(type(model),
                      (tensorflow.python.keras.engine.sequential.Sequential,
                       tensorflow.keras.Model)):
            model_copy = keras.models.clone_model(model)
            model_copy.build(model.input.shape)
            model_copy.compile(loss='binary_crossentropy',
                               optimizer='adam',
                               metrics=['binary_crossentropy', 'accuracy'])
            model = model_copy
        else:
            try:
                model = clone(model)
            except Exception:
                print('model not reinitialized')

    model.fit(X_train, y_train, **params)

    if hasattr(model, 'predict_proba'):
        proba_predictions_train = model.predict_proba(X_train)
        proba_predictions_valid = model.predict_proba(X_valid)
    else:
        proba_predictions_train = model.predict(X_train)
        proba_predictions_valid = model.predict(X_valid)

    if proba_predictions_train.shape[1] == 2:
        predictions_train = proba_predictions_train.argmax(axis=1)
        predictions_valid = proba_predictions_valid.argmax(axis=1)
    else:
        predictions_train = proba_predictions_train >= 0.5
        predictions_valid = proba_predictions_valid >= 0.5

    loss_train = log_loss(y_train, proba_predictions_train)
    loss_valid = log_loss(y_valid, proba_predictions_valid)

    accuracy_train = accuracy_score(y_train, predictions_train)
    accuracy_valid = accuracy_score(y_valid, predictions_valid)

    f1_train = f1_score(y_train, predictions_train)
    f1_valid = f1_score(y_valid, predictions_valid)

    return ({'loss': np.round(loss_train, 2),
             'accuracy': np.round(accuracy_train, 2),
             'f1': np.round(f1_train, 2)},
            {'loss': np.round(loss_valid, 2),
             'accuracy': np.round(accuracy_valid, 2),
             'f1': np.round(f1_valid, 2)})
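# Hedged usage sketch for model_efficiency above; `embs` and `labels` stand in
# for any embedding matrix with binary labels and are generated here purely
# for illustration.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(42)
embs = rng.normal(size=(300, 16))
labels = (embs[:, 0] > 0).astype(int)

train_metrics, valid_metrics = model_efficiency(
    embs, labels, model=LogisticRegression(max_iter=1000))
print(train_metrics, valid_metrics)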
Exemplo n.º 48
0
def _crossfit(model, folds, *args, **kwargs):
    """
    General crossfit based calculation of nuisance parameters.

    Parameters
    ----------
    model : object
        An object that supports fit and predict. Fit must accept all the args
        and the keyword arguments kwargs. Similarly, predict must accept
        all the args as arguments and kwargs as keyword arguments. The fit
        function estimates a model of the nuisance function, based on the input
        data to fit. Predict evaluates the fitted nuisance function on the input
        data to predict.
    folds : list of tuples or None
        The crossfitting fold structure. Every entry in the list is a tuple whose
        first element are the training indices of the args and kwargs data and
        the second entry are the test indices. If the union of the test indices
        is not the full set of all indices, then the remaining nuisance parameters
        for the missing indices have value NaN.  If folds is None, then cross fitting
        is not performed; all indices are used for both model fitting and prediction
    args : a sequence of (numpy matrices or None)
        Each matrix is a data variable whose first index corresponds to a sample
    kwargs : a sequence of key-value args, with values being (numpy matrices or None)
        Each keyword argument is of the form Var=x, with x a numpy array. Each
        of these arrays are data variables. The model fit and predict will be
        called with signature: `model.fit(*args, **kwargs)` and
        `model.predict(*args, **kwargs)`. Key-value arguments that have value
        None are omitted from the two calls. So all the args and the non-None
        kwargs variables must be part of the model's signature.

    Returns
    -------
    nuisances : tuple of numpy matrices
        Each entry in the tuple is a nuisance parameter matrix. Each row i-th in the
        matrix corresponds to the value of the nuisance parameter for the i-th input
        sample.
    model_list : list of objects of same type as input model
        The cloned and fitted models for each fold. Can be used for inspection of the
        variability of the fitted models across folds.
    fitted_inds : np array1d
        The indices of the arrays for which the nuisance value was calculated. This
        corresponds to the union of the indices of the test part of each fold in
        the input fold list.
    scores : tuple of list of float or None
        The out-of-sample model scores for each nuisance model

    Examples
    --------

    .. testcode::

        import numpy as np
        from sklearn.model_selection import KFold
        from sklearn.linear_model import Lasso
        from econml._ortho_learner import _crossfit
        class Wrapper:
            def __init__(self, model):
                self._model = model
            def fit(self, X, y, W=None):
                self._model.fit(X, y)
                return self
            def predict(self, X, y, W=None):
                return self._model.predict(X)
        np.random.seed(123)
        X = np.random.normal(size=(5000, 3))
        y = X[:, 0] + np.random.normal(size=(5000,))
        folds = list(KFold(2).split(X, y))
        model = Lasso(alpha=0.01)
        nuisance, model_list, fitted_inds, scores = _crossfit(Wrapper(model), folds, X, y, W=y, Z=None)

    >>> nuisance
    (array([-1.105728... , -1.537566..., -2.451827... , ...,  1.106287...,
       -1.829662..., -1.782273...]),)
    >>> model_list
    [<Wrapper object at 0x...>, <Wrapper object at 0x...>]
    >>> fitted_inds
    array([   0,    1,    2, ..., 4997, 4998, 4999])

    """
    model_list = []
    fitted_inds = []
    calculate_scores = hasattr(model, 'score')

    # remove None arguments
    kwargs = filter_none_kwargs(**kwargs)

    if folds is None:  # skip crossfitting
        model_list.append(clone(model, safe=False))
        model_list[0].fit(*args, **kwargs)
        nuisances = model_list[0].predict(*args, **kwargs)
        scores = (model_list[0].score(*args, **kwargs)
                  if calculate_scores else None)

        if not isinstance(nuisances, tuple):
            nuisances = (nuisances, )
        if scores is not None and not isinstance(scores, tuple):
            scores = (scores, )

        # scores entries should be lists of scores, so make each entry a singleton list
        scores = tuple([s] for s in scores) if scores is not None else None

        first_arr = args[0] if args else next(iter(kwargs.values()))
        return nuisances, model_list, np.arange(first_arr.shape[0]), scores

    for idx, (train_idxs, test_idxs) in enumerate(folds):
        model_list.append(clone(model, safe=False))
        if len(np.intersect1d(train_idxs, test_idxs)) > 0:
            raise AttributeError(
                "Invalid crossfitting fold structure. "
                "Train and test indices of each fold must be disjoint.")
        if len(np.intersect1d(fitted_inds, test_idxs)) > 0:
            raise AttributeError(
                "Invalid crossfitting fold structure. The same index appears in two test folds."
            )
        fitted_inds = np.concatenate((fitted_inds, test_idxs))

        args_train = tuple(var[train_idxs] if var is not None else None
                           for var in args)
        args_test = tuple(var[test_idxs] if var is not None else None
                          for var in args)

        kwargs_train = {key: var[train_idxs] for key, var in kwargs.items()}
        kwargs_test = {key: var[test_idxs] for key, var in kwargs.items()}

        model_list[idx].fit(*args_train, **kwargs_train)

        nuisance_temp = model_list[idx].predict(*args_test, **kwargs_test)

        if not isinstance(nuisance_temp, tuple):
            nuisance_temp = (nuisance_temp, )

        if idx == 0:
            nuisances = tuple([
                np.full((args[0].shape[0], ) + nuis.shape[1:], np.nan)
                for nuis in nuisance_temp
            ])

        for it, nuis in enumerate(nuisance_temp):
            nuisances[it][test_idxs] = nuis

        if calculate_scores:
            score_temp = model_list[idx].score(*args_test, **kwargs_test)

            if not isinstance(score_temp, tuple):
                score_temp = (score_temp, )

            if idx == 0:
                scores = tuple([] for _ in score_temp)

            for it, score in enumerate(score_temp):
                scores[it].append(score)

    return nuisances, model_list, np.sort(
        fitted_inds.astype(int)), (scores if calculate_scores else None)
Exemplo n.º 49
0
    def fit(self, run_imgs, events=None, confounds=None,
            design_matrices=None):
        """ Fit the GLM

        For each run:
        1. create design matrix X
        2. do a masker job: fMRI_data -> Y
        3. fit regression to (Y, X)

        Parameters
        ----------
        run_imgs: Niimg-like object or list of Niimg-like objects,
            See http://nilearn.github.io/manipulating_images/input_output.html#inputing-data-file-names-or-image-objects  # noqa:E501
            Data on which the GLM will be fitted. If this is a list,
            the affine is considered the same for all.

        events: pandas Dataframe or string or list of pandas DataFrames or
                   strings

            fMRI events used to build design matrices. One events object
            expected per run_img. Ignored in case design_matrices is not None.
            If string, then a path to a csv file is expected.

        confounds: pandas Dataframe or string or list of pandas DataFrames or
                   strings

            Each column in a DataFrame corresponds to a confound variable
            to be included in the regression model of the respective run_img.
            The number of rows must match the number of volumes in the
            respective run_img. Ignored in case design_matrices is not None.
            If string, then a path to a csv file is expected.

        design_matrices: pandas DataFrame or list of pandas DataFrames,
            Design matrices that will be used to fit the GLM. If given it
            takes precedence over events and confounds.

        """
        # Local import to prevent circular imports
        from nilearn.input_data import NiftiMasker  # noqa

        # Check arguments
        # Check imgs type
        if events is not None:
            _check_events_file_uses_tab_separators(events_files=events)
        if not isinstance(run_imgs, (list, tuple)):
            run_imgs = [run_imgs]
        if design_matrices is None:
            if events is None:
                raise ValueError('events or design matrices must be provided')
            if self.t_r is None:
                raise ValueError('t_r not given to FirstLevelModel object'
                                 ' to compute design from events')
        else:
            design_matrices = _check_run_tables(run_imgs, design_matrices,
                                                'design_matrices')
        # Check that number of events and confound files match number of runs
        # Also check that events and confound files can be loaded as DataFrame
        if events is not None:
            events = _check_run_tables(run_imgs, events, 'events')
        if confounds is not None:
            confounds = _check_run_tables(run_imgs, confounds, 'confounds')

        # Learn the mask
        if self.mask_img is False:
            # We create a dummy mask to preserve functionality of api
            ref_img = check_niimg(run_imgs[0])
            self.mask_img = Nifti1Image(np.ones(ref_img.shape[:3]),
                                        ref_img.affine)
        if not isinstance(self.mask_img, NiftiMasker):
            self.masker_ = NiftiMasker(mask_img=self.mask_img,
                                       smoothing_fwhm=self.smoothing_fwhm,
                                       target_affine=self.target_affine,
                                       standardize=self.standardize,
                                       mask_strategy='epi',
                                       t_r=self.t_r,
                                       memory=self.memory,
                                       verbose=max(0, self.verbose - 2),
                                       target_shape=self.target_shape,
                                       memory_level=self.memory_level
                                       )
            self.masker_.fit(run_imgs[0])
        else:
            if self.mask_img.mask_img_ is None and self.masker_ is None:
                self.masker_ = clone(self.mask_img)
                for param_name in ['target_affine', 'target_shape',
                                   'smoothing_fwhm', 't_r', 'memory',
                                   'memory_level']:
                    our_param = getattr(self, param_name)
                    if our_param is None:
                        continue
                    if getattr(self.masker_, param_name) is not None:
                        warn('Parameter %s of the masker'
                             ' overridden' % param_name)
                    setattr(self.masker_, param_name, our_param)
                self.masker_.fit(run_imgs[0])
            else:
                self.masker_ = self.mask_img

        # For each run fit the model and keep only the regression results.
        self.labels_, self.results_, self.design_matrices_ = [], [], []
        n_runs = len(run_imgs)
        t0 = time.time()
        for run_idx, run_img in enumerate(run_imgs):
            # Report progress
            if self.verbose > 0:
                percent = float(run_idx) / n_runs
                percent = round(percent * 100, 2)
                dt = time.time() - t0
                # We use a max to avoid a division by zero
                if run_idx == 0:
                    remaining = 'go take a coffee, a big one'
                else:
                    remaining = (100. - percent) / max(0.01, percent) * dt
                    remaining = '%i seconds remaining' % remaining

                sys.stderr.write(
                    "Computing run %d out of %d runs (%s)\n"
                    % (run_idx + 1, n_runs, remaining))

            # Build the experimental design for the glm
            run_img = check_niimg(run_img, ensure_ndim=4)
            if design_matrices is None:
                n_scans = get_data(run_img).shape[3]
                if confounds is not None:
                    confounds_matrix = confounds[run_idx].values
                    if confounds_matrix.shape[0] != n_scans:
                        raise ValueError('Rows in confounds does not match '
                                         'n_scans in run_img at index %d'
                                         % (run_idx,))
                    confounds_names = confounds[run_idx].columns.tolist()
                else:
                    confounds_matrix = None
                    confounds_names = None
                start_time = self.slice_time_ref * self.t_r
                end_time = (n_scans - 1 + self.slice_time_ref) * self.t_r
                frame_times = np.linspace(start_time, end_time, n_scans)
                design = make_first_level_design_matrix(frame_times,
                                                        events[run_idx],
                                                        self.hrf_model,
                                                        self.drift_model,
                                                        self.high_pass,
                                                        self.drift_order,
                                                        self.fir_delays,
                                                        confounds_matrix,
                                                        confounds_names,
                                                        self.min_onset
                                                        )
            else:
                design = design_matrices[run_idx]
            self.design_matrices_.append(design)

            # Mask and prepare data for GLM
            if self.verbose > 1:
                t_masking = time.time()
                sys.stderr.write('Starting masker computation \r')

            Y = self.masker_.transform(run_img)
            del run_img  # Delete unmasked image to save memory

            if self.verbose > 1:
                t_masking = time.time() - t_masking
                sys.stderr.write('Masker took %d seconds       \n'
                                 % t_masking)

            if self.signal_scaling:
                Y, _ = mean_scaling(Y, self.scaling_axis)
            if self.memory:
                mem_glm = self.memory.cache(run_glm, ignore=['n_jobs'])
            else:
                mem_glm = run_glm

            # compute GLM
            if self.verbose > 1:
                t_glm = time.time()
                sys.stderr.write('Performing GLM computation\r')
            labels, results = mem_glm(Y, design.values,
                                      noise_model=self.noise_model,
                                      bins=100, n_jobs=self.n_jobs)
            if self.verbose > 1:
                t_glm = time.time() - t_glm
                sys.stderr.write('GLM took %d seconds         \n' % t_glm)

            self.labels_.append(labels)
            # We save memory if inspecting model details is not necessary
            if self.minimize_memory:
                for key in results:
                    results[key] = SimpleRegressionResults(results[key])
            self.results_.append(results)
            del Y

        # Report progress
        if self.verbose > 0:
            sys.stderr.write("\nComputation of %d runs done in %i seconds\n\n"
                             % (n_runs, time.time() - t0))

        return self
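# Hedged usage sketch, assuming this fit method belongs to nilearn's
# FirstLevelModel; the image path below is a hypothetical placeholder, not a
# real file, and the events table is a minimal toy design.
import pandas as pd
from nilearn.glm.first_level import FirstLevelModel

events = pd.DataFrame({'onset': [0, 30, 60],
                       'duration': [15, 15, 15],
                       'trial_type': ['rest', 'task', 'rest']})
model = FirstLevelModel(t_r=2.0, hrf_model='glover', smoothing_fwhm=6.)
model = model.fit('run_01_bold.nii.gz', events=events)  # hypothetical path
z_map = model.compute_contrast('task - rest')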
Exemplo n.º 50
0
    def test_model_clone(self):
        clone_clf = clone(self.clf)
Exemplo n.º 51
0
def _sp_trial(trial_estimator, n_samples, n_features, cov, prec):
    X = _new_sample(n_samples, n_features, cov)
    new_estimator = clone(trial_estimator)
    new_estimator.fit(X)
    return _exact_support(prec, new_estimator.precision_)
Exemplo n.º 52
0
    def partial_fit(self, X, y, classes=None):
        """Partial fitting."""
        X, y = check_X_y(X, y)
        if not hasattr(self, "ensemble_"):
            self.ensemble_ = []
            self.ensemble_base_ = []

        # Check feature consistency
        if hasattr(self, "X_"):
            if self.X_.shape[1] != X.shape[1]:
                raise ValueError("number of features does not match")

        self.X_, self.y_ = X, y
        if self.oversampled == "None":
            self.dsel_X_, self.dsel_y_ = self.X_, self.y_
        elif self.oversampled == "ROS":
            ros = RandomOverSampler(random_state=42)
            try:
                self.dsel_X_, self.dsel_y_ = ros.fit_resample(self.X_, self.y_)
            except Exception:
                self.dsel_X_, self.dsel_y_ = self.X_, self.y_
        elif self.oversampled == "B2":
            b2 = BorderlineSMOTE(random_state=42, kind='borderline-2')
            try:
                self.dsel_X_, self.dsel_y_ = b2.fit_resample(self.X_, self.y_)
            except Exception:
                self.dsel_X_, self.dsel_y_ = self.X_, self.y_
        elif self.oversampled == "RUS":
            rus = RandomUnderSampler(random_state=42)
            try:
                self.dsel_X_, self.dsel_y_ = rus.fit_resample(self.X_, self.y_)
            except Exception:
                self.dsel_X_, self.dsel_y_ = self.X_, self.y_
        elif self.oversampled == "CNN":
            cnn = CondensedNearestNeighbour(random_state=42)
            try:
                self.dsel_X_, self.dsel_y_ = cnn.fit_resample(self.X_, self.y_)
            except Exception:
                self.dsel_X_, self.dsel_y_ = self.X_, self.y_

        # Check classes
        self.classes_ = classes
        if self.classes_ is None:
            self.classes_, _ = np.unique(y, return_inverse=True)

        # Append new estimator
        self.candidate_ = clone(self.base_estimator).fit(self.X_, self.y_)
        self.ensemble_.append(self.candidate_)
        self.ensemble_base_.extend(self.candidate_.estimators_)

        # Remove the worst member when the ensemble becomes too large
        if len(self.ensemble_) > self.n_estimators:
            self.prune_index_ = np.argmin(
                [self.metric(y, clf.predict(X)) for clf in self.ensemble_])
            del self.ensemble_[self.prune_index_]
            # Drop the matching slice of base estimators (each candidate is
            # assumed to contribute 10 of them)
            a = self.prune_index_ * 10
            b = (self.prune_index_ + 1) * 10
            del self.ensemble_base_[a:b]

        return self
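# Hedged sketch of driving the partial_fit above over a data stream; the
# chunking and data are illustrative, and `clf` stands in for an instance of
# the (unnamed) ensemble class defined above, constructed elsewhere.
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=2000, weights=[0.9, 0.1], random_state=0)
classes = np.unique(y)
chunk_size = 200
for start in range(0, len(X), chunk_size):
    sl = slice(start, start + chunk_size)
    clf.partial_fit(X[sl], y[sl], classes=classes)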
Exemplo n.º 53
0
def dml_irm_fixture(generate_data_irm, learner, score, dml_procedure,
                    trimming_threshold):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 499

    # collect data
    (x, y, d) = generate_data_irm

    # Set machine learning methods for m & g
    ml_g = clone(learner[1])
    ml_m = clone(learner[0])

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d)
    dml_irm_obj = dml.DoubleMLIRM(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  trimming_threshold=trimming_threshold)

    dml_irm_obj.fit()

    np.random.seed(3141)
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    g_hat0, g_hat1, m_hat, p_hat = fit_nuisance_irm(
        y,
        x,
        d,
        clone(learner[0]),
        clone(learner[1]),
        smpls,
        score,
        trimming_threshold=trimming_threshold)

    if dml_procedure == 'dml1':
        res_manual, se_manual = irm_dml1(y, x, d, g_hat0, g_hat1, m_hat, p_hat,
                                         smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = irm_dml2(y, x, d, g_hat0, g_hat1, m_hat, p_hat,
                                         smpls, score)

    res_dict = {
        'coef': dml_irm_obj.coef,
        'coef_manual': res_manual,
        'se': dml_irm_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_irm(res_manual, y, d, g_hat0, g_hat1,
                                           m_hat, p_hat, smpls, score,
                                           se_manual, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_irm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_irm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_irm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Exemplo n.º 54
0
plot_data(X, y, None, ax)

###############################################################################
# Effect of clustering to over-samplers
###############################################################################

###############################################################################
# Clustering-based over-sampling makes it possible to identify areas of the
# input space that are appropriate for generating artificial data. Therefore,
# the generation of noisy samples is avoided and the within-class imbalance
# issue is also addressed. The next plots show the resampled data when
# clustering is applied, comparing them to the resampled data of the initial
# over-samplers.

fig, axs = plt.subplots(3, 2, figsize=(15, 15))
for (ax1, ax2), oversampler in zip(axs, OVERSAMPLERS):
    plot_data(X, y, clone(oversampler), ax1)
    plot_data(X, y, ClusterOverSampler(oversampler, KMEANS), ax2)
fig.tight_layout()

###############################################################################
# Performance evaluation of clustering based over-sampling
###############################################################################

###############################################################################
# We evaluate various over-samplers on a test set, using the F1-score as the
# evaluation metric. The scores with and without clustering are compared.

clf = GradientBoostingClassifier(random_state=RANDOM_STATE)
data = train_test_split(X, y, random_state=RANDOM_STATE)
scores = pd.DataFrame()
for oversampler in OVERSAMPLERS:
Exemplo n.º 55
0
    def fit(self, X=None, y=None):
        n_alpha_grid_points = 5

        self.results_ = np.zeros((n_alpha_grid_points, self.n_grid_points))
        self.grid_ = np.linspace(0.25, 4, self.n_grid_points)
        self.alphas_ = np.linspace(0.99, 0.999, n_alpha_grid_points)[::-1]
        self.ks_ = []

        for aidx, alpha in enumerate(self.alphas_):
            if self.verbose:
                print('at alpha {} ({}/{})'.format(
                    alpha,
                    aidx,
                    n_alpha_grid_points,
                ))

            # draw a new fixed graph for alpha
            cov, prec = _new_graph(self.n_features, alpha)
            n_nonzero_prec = np.count_nonzero(prec.flat)
            self.ks_.append(n_nonzero_prec)
            if self.verbose:
                print('   Graph has {} nonzero entries'.format(n_nonzero_prec))

            for sidx, sample_grid in enumerate(self.grid_):
                n_samples = int(sample_grid * self.n_features)
                
                # model selection (once)
                X = _new_sample(n_samples, self.n_features, cov)
                ms_estimator = clone(self.model_selection_estimator)
                ms_estimator.fit(X)
                lam = getattr(ms_estimator, self.penalty_)
                
                if self.verbose:
                    display_lam = lam
                    if isinstance(lam, np.ndarray):
                        display_lam = np.linalg.norm(lam)
                    print('   ({}/{}), n_samples = {}, selected lambda = {}'.format(
                        sidx,
                        self.n_grid_points,
                        n_samples,
                        display_lam))

                # setup default trial estimator
                if self.trial_estimator is None:
                    trial_estimator = QuicGraphLasso(lam=lam,
                                                     mode='default',
                                                     init_method='corrcoef')
                else:
                    trial_estimator = self.trial_estimator

                # patch trial estimator with this lambda
                trial_estimator.set_params(**{
                    self.penalty: lam, 
                })

                # estimate statistical power
                exact_support_counts = Parallel(
                    n_jobs=self.n_jobs,
                    verbose=False,
                    backend='threading',
                    #max_nbytes=None,
                    #batch_size=1,
                )(
                    delayed(_sp_trial)(
                        trial_estimator, n_samples, self.n_features, cov, prec
                    )
                    for nn in range(self.n_trials))

                self.results_[aidx, sidx] = 1. * np.sum(exact_support_counts) / self.n_trials

            if self.verbose:
                print('Results at this row: {}'.format(self.results_[aidx, :]))

        self.is_fitted = True
        return self
Exemplo n.º 56
0
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

#Binary classifier = stochastic gradient descent SGDClassifier
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train,y_train_5)

sgd_clf.predict([some_digit])

#Cross Validation
skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))
    
#Determine cross validation score - 3 folds
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
#Prediction set selection from cross val
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
#Confusion matrix from prior variable set
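# A plausible completion of the step announced above, assuming the
# conventional confusion-matrix call on the cross-validated predictions:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5, y_train_pred)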
Exemplo n.º 57
0
def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
                             batch_size=32, keep_sparse=False,
                             random_state=None):
    """Create a balanced batch generator to train keras model.

    Returns a generator --- as well as the number of steps per epoch --- which
    is given to ``fit_generator``. The sampler defines the sampling strategy
    used to balance the dataset ahead of creating the batch. The sampler should
    have an attribute ``return_indices``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Original imbalanced dataset.

    y : ndarray, shape (n_samples,) or (n_samples, n_classes)
        Associated targets.

    sample_weight : ndarray, shape (n_samples,)
        Sample weight.

    sampler : object or None, optional (default=RandomUnderSampler)
        A sampler instance which has an attribute ``return_indices``.
        By default, the sampler used is a
        :class:`imblearn.under_sampling.RandomUnderSampler`.

    batch_size : int, optional (default=32)
        Number of samples per gradient update.

    keep_sparse : bool, optional (default=False)
        Whether or not to conserve the sparsity of the input ``X``. By
        default, the returned batches will be dense.

    {random_state}

    Returns
    -------
    generator : generator of tuple
        Generates batches of data. The tuples generated are either (X_batch,
        y_batch) or (X_batch, y_batch, sample_weight_batch).

    steps_per_epoch : int
        The number of batches (steps) per epoch.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.datasets import load_iris
    >>> X, y = load_iris(return_X_y=True)
    >>> class_dict = dict()
    >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40
    >>> from imblearn.datasets import make_imbalance
    >>> X, y = make_imbalance(X, y, class_dict)
    >>> X = X.astype(np.float32)
    >>> batch_size, learning_rate, epochs = 10, 0.01, 10
    >>> training_generator, steps_per_epoch = balanced_batch_generator(
    ...     X, y, sample_weight=None, sampler=None,
    ...     batch_size=batch_size, random_state=42)
    >>> input_size, output_size = X.shape[1], 3
    >>> import tensorflow as tf
    >>> def init_weights(shape):
    ...     return tf.Variable(tf.random_normal(shape, stddev=0.01))
    >>> def accuracy(y_true, y_pred):
    ...     return np.mean(np.argmax(y_pred, axis=1) == y_true)
    >>> # input and output
    >>> data = tf.placeholder("float32", shape=[None, input_size])
    >>> targets = tf.placeholder("int32", shape=[None])
    >>> # build the model and weights
    >>> W = init_weights([input_size, output_size])
    >>> b = init_weights([output_size])
    >>> out_act = tf.nn.sigmoid(tf.matmul(data, W) + b)
    >>> # build the loss, predict, and train operator
    >>> cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    ...     logits=out_act, labels=targets)
    >>> loss = tf.reduce_sum(cross_entropy)
    >>> optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    >>> train_op = optimizer.minimize(loss)
    >>> predict = tf.nn.softmax(out_act)
    >>> # Initialization of all variables in the graph
    >>> init = tf.global_variables_initializer()
    >>> with tf.Session() as sess:
    ...     print('Starting training')
    ...     sess.run(init)
    ...     for e in range(epochs):
    ...         for i in range(steps_per_epoch):
    ...             X_batch, y_batch = next(training_generator)
    ...             feed_dict = dict()
    ...             feed_dict[data] = X_batch; feed_dict[targets] = y_batch
    ...             sess.run([train_op, loss], feed_dict=feed_dict)
    ...         # For each epoch, run accuracy on train and test
    ...         feed_dict = dict()
    ...         feed_dict[data] = X
    ...         predicts_train = sess.run(predict, feed_dict=feed_dict)
    ...         print("epoch: {{}} train accuracy: {{:.3f}}"
    ...               .format(e, accuracy(y, predicts_train)))
    ... # doctest: +ELLIPSIS
    Starting training
    [...

    """

    random_state = check_random_state(random_state)
    if sampler is None:
        sampler_ = RandomUnderSampler(return_indices=True,
                                      random_state=random_state)
    else:
        if not hasattr(sampler, 'return_indices'):
            raise ValueError("'sampler' needs to return the indices of "
                             "the samples selected. Provide a sampler "
                             "which has an attribute 'return_indices'.")
        sampler_ = clone(sampler)
        sampler_.set_params(return_indices=True)
        set_random_state(sampler_, random_state)

    _, _, indices = sampler_.fit_sample(X, y)
    # shuffle the indices since the sampler packs them by class
    random_state.shuffle(indices)

    def generator(X, y, sample_weight, indices, batch_size):
        while True:
            for index in range(0, len(indices), batch_size):
                X_res = safe_indexing(X, indices[index:index + batch_size])
                y_res = safe_indexing(y, indices[index:index + batch_size])
                if issparse(X_res) and not keep_sparse:
                    X_res = X_res.toarray()
                if sample_weight is None:
                    yield X_res, y_res
                else:
                    sw_res = safe_indexing(sample_weight,
                                           indices[index:index + batch_size])
                    yield X_res, y_res, sw_res

    return (generator(X, y, sample_weight, indices, batch_size),
            int(indices.size // batch_size))
Exemplo n.º 58
0
    def test_clone(self, lung_X, lung_y):
        rfs = RangerForestSurvival(n_estimators=N_ESTIMATORS)
        rfs.fit(lung_X, lung_y)
        clone(rfs)
Exemplo n.º 59
0
sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty,
                       warm_start=True,
                       penalty=None,
                       learning_rate="constant",
                       eta0=0.0005)

minimum_val_error = float("inf")
best_epoch = None
best_model = None
for epoch in range(1000):
    sgd_reg.fit(X_train_poly_scaled, y_train)  # continues where it left off
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    val_error = mean_squared_error(y_val, y_val_predict)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
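        # NOTE: sklearn's clone copies hyperparameters only, not fitted
        # coefficients, so best_model below is an unfitted copy; use
        # copy.deepcopy(sgd_reg) to snapshot the trained state instead.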
        best_model = clone(sgd_reg)

#%% Logistic Regression

from sklearn import datasets
iris = datasets.load_iris()
list(iris.keys())
X = iris["data"][:, 3:]  # petal width
y = (iris["target"] == 2).astype(int)  # 1 if Iris virginica, else 0

from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X, y)

X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
Exemplo n.º 60
0
def train_model(regressor, X, y):
    regressor_ = clone(regressor)
    regressor_.fit(X, y)
    return regressor_
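# Hedged usage sketch: train_model returns a fitted copy and leaves the
# original regressor untouched; the Ridge estimator and toy data below are
# illustrative only.
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X = rng.normal(size=(50, 3))
y = X @ np.array([1., -2., 0.5])
fitted = train_model(Ridge(alpha=1.0), X, y)
print(fitted.coef_)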