Python unique 예제들, sklearn.utils.fixes.unique Python 예제들

예제 #1

0

파일 보기

def test_stratified_shuffle_split_iter():
    """Check the folds produced by ``cval.StratifiedShuffleSplit``.

    For several label vectors, every train/test pair must contain the same
    set of classes, keep approximately the same class proportions, add up
    to the full sample count and share no index.
    """
    ys = [
        np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
        np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
        np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
        np.array([-1] * 800 + [1] * 50)
    ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y,
                                          6,
                                          test_size=0.33,
                                          random_state=0,
                                          indices=True)
        for train, test in sss:
            # Both sides of the split must see exactly the same classes
            assert_array_equal(unique(y[train]), unique(y[test]))
            # Checks if folds keep classes proportions
            p_train = (np.bincount(unique(y[train], return_inverse=True)[1]) /
                       float(len(y[train])))
            p_test = (np.bincount(unique(y[test], return_inverse=True)[1]) /
                      float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(y[train].size + y[test].size, y.size)
            # Use the public np.intersect1d; the np.lib.arraysetops module
            # path is a private location that recent NumPy no longer exposes.
            assert_array_equal(np.intersect1d(train, test), [])

예제 #2

0

파일 보기

파일: cross_validation.py 프로젝트: jamartinb/kaggle

    def __init__(
        self, y, n_iter=10, test_size=0.1, train_size=None, indices=True, random_state=None, n_iterations=None
    ):
        """Set up a stratified shuffle split over the labels ``y``.

        All sizing and iteration arguments are forwarded unchanged to the
        parent constructor. ``y`` is stored as an array together with its
        distinct classes and the class index of every sample.

        Raises
        ------
        ValueError
            If some class has fewer than 2 members, or if ``self.n_train``
            or ``self.n_test`` is smaller than the number of classes.
        """
        super(StratifiedShuffleSplit, self).__init__(
            len(y), n_iter, test_size, train_size, indices, random_state, n_iterations
        )
        self.y = np.array(y)
        self.classes, self.y_indices = unique(y, return_inverse=True)
        n_classes = self.classes.shape[0]

        # Size of the least populated class
        smallest_class = np.min(np.bincount(self.y_indices))
        if smallest_class < 2:
            too_few_msg = (
                "The least populated class in y has only 1"
                " member, which is too few. The minimum"
                " number of labels for any class cannot"
                " be less than 2."
            )
            raise ValueError(too_few_msg)

        # n_train / n_test are presumably set by the parent constructor
        # (not visible here) -- each side needs room for every class.
        if self.n_train < n_classes:
            train_msg = (
                "The train_size = %d should be greater or "
                "equal to the number of classes = %d" % (self.n_train, n_classes)
            )
            raise ValueError(train_msg)
        if self.n_test < n_classes:
            test_msg = (
                "The test_size = %d should be greater or "
                "equal to the number of classes = %d" % (self.n_test, n_classes)
            )
            raise ValueError(test_msg)

예제 #3

0

파일 보기

파일: cross_validation.py 프로젝트: jamartinb/kaggle

 def __init__(self, labels, p, indices=True):
     """Store the labels, their distinct values and the group size ``p``.

     ``len(labels)`` and ``indices`` are forwarded to the parent
     constructor.
     """
     super(LeavePLabelOut, self).__init__(len(labels), indices)
     distinct = unique(labels)
     # Keep a private copy of labels to avoid side-effects during iteration
     self.labels = np.array(labels, copy=True)
     self.unique_labels = distinct
     self.n_unique_labels = len(distinct)
     self.p = p

예제 #4

0

파일 보기

파일: test_svm.py 프로젝트: yiyinianhua/scikit-learn

def test_auto_weight():
    """Check that class_weight='auto' does not lower F1 on imbalanced data."""
    from sklearn.utils import compute_class_weight
    from sklearn.linear_model import LogisticRegression

    # Use only the first two iris features and shift the targets by one:
    # the shift acts as a non-regression test, since class_weight="auto"
    # used to work only when the labels were a range [0..K).
    X, y = iris.data[:, :2], iris.target + 1
    # Drop every other sample whose shifted target is greater than 2
    dropped = np.where(y > 2)[0][::2]
    unbalanced = np.delete(np.arange(y.size), dropped)

    classes, y_ind = unique(y[unbalanced], return_inverse=True)
    class_weights = compute_class_weight('auto', classes, y_ind)
    # The thinned-out class (position 2) must receive the largest weight
    heaviest = np.argmax(class_weights)
    assert_true(heaviest == 2)

    classifiers = (
        svm.SVC(kernel='linear'),
        svm.LinearSVC(random_state=0),
        LogisticRegression(),
    )
    for clf in classifiers:
        # Fit once with the default weights, then again with 'auto' weights,
        # and require that the weighted fit scores at least as well.
        y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        clf.set_params(class_weight='auto')
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        plain_f1 = metrics.f1_score(y, y_pred)
        balanced_f1 = metrics.f1_score(y, y_pred_balanced)
        assert_true(plain_f1 <= balanced_f1)

예제 #5

0

파일 보기

파일: test_class_weight.py 프로젝트: sarahcodes/scikit-learn

def test_compute_class_weight():
    """Exercise compute_class_weight("auto", ...) on a small imbalanced y.

    The weights must sum to the number of classes and grow as the classes
    get rarer (class 2 has three samples, class 3 two, class 4 one).
    """
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = unique(y)
    weights = compute_class_weight("auto", classes, y)
    total_weight = weights.sum()
    assert_almost_equal(total_weight, classes.shape)
    strictly_increasing = weights[0] < weights[1] < weights[2]
    assert_true(strictly_increasing)

예제 #6

0

파일 보기

    def fit(self, X, y, store_covariances=False, tol=1.0e-4):
        """
        Fit the QDA model according to the given training data and parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array, shape = [n_samples]
            Target values (integers)

        store_covariances : boolean
            If True the covariance matrices are computed and stored in the
            `self.covariances_` attribute.

        tol : float
            Singular values of the centered per-class data that are greater
            than `tol` count towards the rank. It is only used to warn about
            collinear variables.

        Returns
        -------
        self : object
            The fitted estimator.

        Raises
        ------
        ValueError
            If `y` holds fewer than 2 classes, or if a class has a single
            sample (its covariance would require a division by zero).
        """
        X, y = check_arrays(X, y)
        # Replace y by the index of each sample's class in self.classes_
        self.classes_, y = unique(y, return_inverse=True)
        n_samples, n_features = X.shape
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError('y has less than 2 classes')
        if self.priors is None:
            # Empirical class frequencies
            self.priors_ = np.bincount(y) / float(n_samples)
        else:
            self.priors_ = self.priors

        cov = None
        if store_covariances:
            cov = []
        means = []
        scalings = []
        rotations = []
        # range (not the Python-2-only xrange) keeps this runnable on Python 3
        for ind in range(n_classes):
            Xg = X[y == ind, :]
            if len(Xg) == 1:
                # (len(Xg) - 1) below would be zero and silently produce
                # NaN scalings; fail loudly instead.
                raise ValueError('y has only 1 sample in class %s, '
                                 'covariance is ill defined.'
                                 % str(self.classes_[ind]))
            meang = Xg.mean(0)
            means.append(meang)
            Xgc = Xg - meang
            # Xgc = U * S * V.T
            U, S, Vt = np.linalg.svd(Xgc, full_matrices=False)
            rank = np.sum(S > tol)
            if rank < n_features:
                warnings.warn("Variables are collinear")
            S2 = (S**2) / (len(Xg) - 1)
            if store_covariances:
                # cov = V * (S^2 / (n-1)) * V.T
                cov.append(np.dot(S2 * Vt.T, Vt))
            scalings.append(S2)
            rotations.append(Vt.T)
        if store_covariances:
            self.covariances_ = cov
        self.means_ = np.asarray(means)
        self.scalings_ = np.asarray(scalings)
        self.rotations_ = rotations
        return self

예제 #7

0

파일 보기

파일: safe_leave_p_out.py 프로젝트: IanTheEngineer/Penn-haptics-bolt

 def __iter__(self):
     """Yield ``(train_index, test_index)`` pairs for random label subsets.

     Each round asks ``random_combination`` for ``self.p`` positions out of
     ``range(self.n_unique_labels)`` and puts every sample carrying one of
     the selected labels into the test set. A combination that was already
     drawn is skipped; after ``self.max_cache_hits`` consecutive repeats a
     warning is printed and iteration stops. Splits whose training targets
     (``self.train_Y[train_index]``) contain a single class are skipped as
     well. At most ``self.max_iters`` pairs are produced.

     The pairs are integer index arrays when ``self.indices`` is true and
     boolean masks otherwise.
     """
     # We make a copy here to avoid side-effects during iteration
     labels = np.array(self.labels, copy=True)
     unique_labels = unique(labels)
     curr_iter = 0
     idx_cache = set()
     num_cache_hits = 0
     max_cache_hits = self.max_cache_hits

     while curr_iter < self.max_iters and num_cache_hits < max_cache_hits:

         idx = random_combination(range(self.n_unique_labels), self.p)
         if idx in idx_cache:
             num_cache_hits += 1
             if num_cache_hits >= max_cache_hits:
                 # Single parenthesised argument: prints the same text
                 # under Python 2 and Python 3.
                 print("WARNING LeavePLabelOut: number of consecutive cache hits too high, bailing out after %d samples" % curr_iter)
             continue
         # A fresh combination resets the consecutive-repeat counter
         num_cache_hits = 0
         idx_cache.add(idx)

         # Builtin bool: the np.bool alias was removed from NumPy
         test_index = np.zeros(labels.size, dtype=bool)
         for l in unique_labels[np.array(idx)]:
             test_index[labels == l] = True
         train_index = np.logical_not(test_index)
         if self.indices:
             ind = np.arange(labels.size)
             train_index = ind[train_index]
             test_index = ind[test_index]

         if len(unique(self.train_Y[train_index])) == 1:
             # Prevent training sets with only one class
             continue

         curr_iter += 1
         yield train_index, test_index

예제 #8

0

파일 보기

파일: test_cross_validation.py 프로젝트: jayjhan8/scikit-learn

def test_stratified_shuffle_split_iter():
    """Check the folds produced by ``cval.StratifiedShuffleSplit``.

    For several label vectors, every train/test pair must contain the same
    set of classes, keep approximately the same class proportions, add up
    to the full sample count and share no index.
    """
    ys = [
        np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
        np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
        np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
        np.array([-1] * 800 + [1] * 50),
    ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33, random_state=0)
        for train, test in sss:
            # Both sides of the split must see exactly the same classes
            assert_array_equal(unique(y[train]), unique(y[test]))
            # Checks if folds keep classes proportions
            p_train = np.bincount(unique(y[train], return_inverse=True)[1]) / float(len(y[train]))
            p_test = np.bincount(unique(y[test], return_inverse=True)[1]) / float(len(y[test]))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(y[train].size + y[test].size, y.size)
            # Use the public np.intersect1d; the np.lib.arraysetops module
            # path is a private location that recent NumPy no longer exposes.
            assert_array_equal(np.intersect1d(train, test), [])

예제 #9

0

파일 보기

파일: cross_validation.py 프로젝트: jamartinb/kaggle

def _shuffle(y, labels, random_state):
    """Return a shuffled copy of y, optionally shuffling within labels only.

    With ``labels`` set to None the whole of ``y`` is permuted. Otherwise
    positions are permuted only among samples sharing the same label.
    """
    if labels is None:
        return y[random_state.permutation(len(y))]

    order = np.arange(len(labels))
    for value in unique(labels):
        members = labels == value
        # Permute the positions of this label group among themselves
        order[members] = random_state.permutation(order[members])
    return y[order]

예제 #10

0

파일 보기

파일: safe_leave_p_out.py 프로젝트: IanTheEngineer/Penn-haptics-bolt

 def __init__(self, labels, p, max_iters, train_Y,
              max_cache_hits=100, indices=True):
     """Record the split configuration and the distinct values of ``labels``.

     Every argument is stored on the instance under its own name; the
     distinct labels and their count are stored alongside.
     """
     distinct = unique(labels)
     self.labels = labels
     self.unique_labels = distinct
     self.n_unique_labels = distinct.size
     # Plain configuration values, kept as given
     self.p = p
     self.max_iters = max_iters
     self.max_cache_hits = max_cache_hits
     self.train_Y = train_Y
     self.indices = indices

예제 #11

0

파일 보기

파일: safe_leave_p_out.py 프로젝트: zhesu/Penn-haptics-bolt

    def __iter__(self):
        """Yield ``(train_index, test_index)`` pairs for random label subsets.

        Each round asks ``random_combination`` for ``self.p`` positions out
        of ``range(self.n_unique_labels)`` and puts every sample carrying
        one of the selected labels into the test set. A combination that was
        already drawn is skipped; after ``self.max_cache_hits`` consecutive
        repeats a warning is printed and iteration stops. Splits whose
        training targets (``self.train_Y[train_index]``) contain a single
        class are skipped as well. At most ``self.max_iters`` pairs are
        produced.

        The pairs are integer index arrays when ``self.indices`` is true and
        boolean masks otherwise.
        """
        # We make a copy here to avoid side-effects during iteration
        labels = np.array(self.labels, copy=True)
        unique_labels = unique(labels)
        curr_iter = 0
        idx_cache = set()
        num_cache_hits = 0
        max_cache_hits = self.max_cache_hits

        while curr_iter < self.max_iters and num_cache_hits < max_cache_hits:

            idx = random_combination(range(self.n_unique_labels), self.p)
            if idx in idx_cache:
                num_cache_hits += 1
                if num_cache_hits >= max_cache_hits:
                    # Single parenthesised argument: prints the same text
                    # under Python 2 and Python 3.
                    print("WARNING LeavePLabelOut: number of consecutive cache hits too high, bailing out after %d samples" % curr_iter)
                continue
            # A fresh combination resets the consecutive-repeat counter
            num_cache_hits = 0
            idx_cache.add(idx)

            # Builtin bool: the np.bool alias was removed from NumPy
            test_index = np.zeros(labels.size, dtype=bool)
            for l in unique_labels[np.array(idx)]:
                test_index[labels == l] = True
            train_index = np.logical_not(test_index)
            if self.indices:
                ind = np.arange(labels.size)
                train_index = ind[train_index]
                test_index = ind[test_index]

            if len(unique(self.train_Y[train_index])) == 1:
                # Prevent training sets with only one class
                continue

            curr_iter += 1
            yield train_index, test_index

예제 #12

0

파일 보기

    def _validate_y(self, y):
        """Encode every output column of ``y`` as integer class indices.

        Works on a copy of ``y``. For each of the ``self.n_outputs_`` columns
        the distinct values are appended to ``self.classes_`` and their count
        to ``self.n_classes_``, and the column is replaced by the index of
        each value among those distinct values.

        Returns
        -------
        y : array
            The encoded copy of the input.
        """
        y = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        # range (not the Python-2-only xrange) keeps this runnable on Python 3
        for k in range(self.n_outputs_):
            classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])

        return y

예제 #13

0

파일 보기

파일: test_cross_validation.py 프로젝트: PepGardiola/scikit-learn

def test_stratified_shuffle_split():
    """Check argument validation and fold quality of StratifiedShuffleSplit.

    The first part expects ValueError for invalid label/size combinations;
    the second part checks that every train/test pair has the same classes,
    similar class proportions, the full sample count and no shared index.
    """
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)

    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)

    ys = [
        np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
        np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
        np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
        np.array([-1] * 800 + [1] * 50)
        ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0, indices=True)
        for train, test in sss:
            # Both sides of the split must see exactly the same classes
            assert_array_equal(unique(y[train]), unique(y[test]))
            # Checks if folds keep classes proportions
            p_train = np.bincount(
                unique(y[train], return_inverse=True)[1]
                ) / float(len(y[train]))
            p_test = np.bincount(
                unique(y[test], return_inverse=True)[1]
                ) / float(len(y[test]))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(y[train].size + y[test].size, y.size)
            # Use the public np.intersect1d; the np.lib.arraysetops module
            # path is a private location that recent NumPy no longer exposes.
            assert_array_equal(np.intersect1d(train, test), [])

예제 #14

0

파일 보기

파일: safe_leave_p_out.py 프로젝트: zhesu/Penn-haptics-bolt

 def __init__(self,
              labels,
              p,
              max_iters,
              train_Y,
              max_cache_hits=100,
              indices=True):
     """Record the split configuration and the distinct values of ``labels``.

     Every argument is stored on the instance under its own name; the
     distinct labels and their count are stored alongside.
     """
     distinct = unique(labels)
     self.labels = labels
     self.unique_labels = distinct
     self.n_unique_labels = distinct.size
     # Plain configuration values, kept as given
     self.p = p
     self.max_iters = max_iters
     self.max_cache_hits = max_cache_hits
     self.train_Y = train_Y
     self.indices = indices

예제 #15

0

파일 보기

파일: cross_validation.py 프로젝트: jamartinb/kaggle

 def __init__(self, y, n_folds=3, indices=True, k=None):
     """Set up stratified folds over the labels ``y``.

     ``len(y)``, ``n_folds``, ``indices`` and ``k`` are forwarded to the
     parent constructor. A warning is issued when the least populated class
     has fewer members than ``self.n_folds``. ``y`` is stored as an array.
     """
     super(StratifiedKFold, self).__init__(len(y), n_folds, indices, k)
     y = np.asarray(y)
     # Per-sample class index; only the counts per class are needed here
     class_indices = unique(y, return_inverse=True)[1]
     smallest_class = np.min(np.bincount(class_indices))
     if smallest_class < self.n_folds:
         message = (
             "The least populated class in y has only %d"
             " members, which is too few. The minimum"
             " number of labels for any class cannot"
             " be less than n_folds=%d." % (smallest_class, self.n_folds)
         )
         warnings.warn(message, Warning)
     self.y = y

예제 #16

0

파일 보기

파일: test_k_means.py 프로젝트: jdetras/parliament2

def test_k_means_new_centers():
    """Fit KMeans from deliberately bad initial centers, dense and sparse.

    Exercises the part of the code where a new center is reassigned and
    checks the resulting labels for both input formats.
    """
    X = np.array([
        [0, 0, 1, 1],
        [0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 1, 0, 0],
    ])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([
        [+0, 1, 0, 0],
        [.2, 0, .2, .2],
        [+0, 0, 0, 0],
    ])

    km = KMeans(n_clusters=3, init=bad_centers, n_init=1, max_iter=10,
                random_state=1)
    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        fitted = km.labels_
        # Replace each cluster id by the position of its first occurrence,
        # so the numbering no longer depends on the order of the centers.
        first_seen = unique(fitted, return_index=True)[1]
        relabelled = first_seen[fitted]
        np.testing.assert_array_equal(relabelled, labels)

예제 #17

0

파일 보기

파일: ensemble.py 프로젝트: etamponi/resilient-protocol

    def fit(self, inp, y):
        """Train the ensemble on ``inp`` with targets ``y`` and return self.

        The targets are encoded as indices into ``self.classes_``. When a
        pipeline is configured, ``inp`` is replaced by its
        ``fit_transform`` output. The weighting strategy is then prepared
        and the training strategy builds ``self.classifiers_``.
        """
        # Start from empty prediction caches
        self.precomputed_probs_ = self.precomputed_weights_ = None

        classes, encoded_y = unique(y, return_inverse=True)
        self.classes_ = classes
        self.n_classes_ = len(classes)
        self.random_state_ = check_random_state(self.random_state)

        pipeline = self.pipeline
        if pipeline is not None:
            inp = pipeline.fit_transform(inp)

        self.weighting_strategy.prepare(inp, encoded_y)
        self.classifiers_ = self.training_strategy.train_estimators(
            self.n_estimators,
            inp,
            encoded_y,
            self.weighting_strategy,
            self.random_state_,
        )

        # Clear the caches again: per the original author's note, the
        # training step above uses self.predict.
        self.precomputed_probs_ = self.precomputed_weights_ = None
        return self

예제 #18

0

파일 보기

파일: labeled_bootstraping.py 프로젝트: ruffsl/CS7616P1

 def fit(self, X, y, labels=None):
     """Fit a forest of trees from the training set X and y.

     ``y`` is copied, reshaped to a single column and encoded as integer
     class indices. The trees are grown by ``grow_forest``, which receives
     one random seed per estimator together with ``labels``.

     Raises
     ------
     ValueError
         If ``self.oob_score`` is set while ``self.bootstrap`` is not.

     Returns
     -------
     self : object
         The fitted estimator.
     """
     # Resolve the random state configured on the forest
     random_state = check_random_state(self.random_state)
     # Work on a private copy: np.reshape may return a view of the caller's
     # array, and the class encoding below writes into y in place.
     y = np.reshape(np.copy(y), (-1, 1))
     # Get the dimensions of X
     n_samples, self.n_features_ = X.shape
     # Number of output columns of y
     self.n_outputs_ = y.shape[1]
     # Distinct class labels of each output
     self.classes_ = []
     # Number of distinct classes of each output
     self.n_classes_ = []
     # range (not the Python-2-only xrange) keeps this runnable on Python 3
     for k in range(self.n_outputs_):
         # Distinct labels, and for every sample the index of its label
         classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
         self.classes_.append(classes_k)
         self.n_classes_.append(classes_k.shape[0])
     # OOB estimation is only possible when bootstrapping is enabled
     if not self.bootstrap and self.oob_score:
         raise ValueError("Can't use OOB estimation "
                          "if bootstraping is not enabled")
     # Precalculate the random seeds for all trees
     n_trees = self.n_estimators
     seeds = random_state.randint(MAX_INT, size=n_trees)
     # Grow the forest from the encoded targets
     self.estimators_ = grow_forest(self, X, y, seeds, labels)
     if self.oob_score:
         self.get_oob_score(X, y)
     # Unwrap the per-output lists when there is only one output
     # (classes_ was assigned above, so no hasattr check is needed)
     if self.n_outputs_ == 1:
         self.n_classes_ = self.n_classes_[0]
         self.classes_ = self.classes_[0]
     return self

예제 #19

0

파일 보기

파일: test_k_means.py 프로젝트: Big-Data/scikit-learn

def test_k_means_new_centers():
    """Fit KMeans from deliberately bad initial centers, dense and sparse.

    Exercises the part of the code where a new center is reassigned and
    checks the resulting labels for both input formats.
    """
    X = np.array([
        [0, 0, 1, 1],
        [0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 1, 0, 0],
    ])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([
        [+0, 1, 0, 0],
        [.2, 0, .2, .2],
        [+0, 0, 0, 0],
    ])

    km = KMeans(
        n_clusters=3,
        init=bad_centers,
        n_init=1,
        max_iter=10,
        random_state=1,
    )
    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        fitted = km.labels_
        # Replace each cluster id by the position of its first occurrence,
        # so the numbering no longer depends on the order of the centers.
        first_seen = unique(fitted, return_index=True)[1]
        relabelled = first_seen[fitted]
        np.testing.assert_array_equal(relabelled, labels)

예제 #20

0

파일 보기

파일: test_svm.py 프로젝트: dubourg/scikit-learn

def test_auto_weight():
    """Check that class_weight='auto' does not lower F1 on imbalanced data."""
    from sklearn.utils import compute_class_weight
    from sklearn.linear_model import LogisticRegression

    # Use only the first two iris features and shift the targets by one:
    # the shift acts as a non-regression test, since class_weight="auto"
    # used to work only when the labels were a range [0..K).
    X, y = iris.data[:, :2], iris.target + 1
    # Drop every other sample whose shifted target is greater than 2
    dropped = np.where(y > 2)[0][::2]
    unbalanced = np.delete(np.arange(y.size), dropped)

    classes, y_ind = unique(y[unbalanced], return_inverse=True)
    class_weights = compute_class_weight("auto", classes, y_ind)
    # The thinned-out class (position 2) must receive the largest weight
    heaviest = np.argmax(class_weights)
    assert_true(heaviest == 2)

    classifiers = (
        svm.SVC(kernel="linear"),
        svm.LinearSVC(random_state=0),
        LogisticRegression(),
    )
    for clf in classifiers:
        # Fit once with the default weights, then again with 'auto' weights,
        # and require that the weighted fit scores at least as well.
        y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        clf.set_params(class_weight="auto")
        y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
        plain_f1 = metrics.f1_score(y, y_pred)
        balanced_f1 = metrics.f1_score(y, y_pred_balanced)
        assert_true(plain_f1 <= balanced_f1)

예제 #21

0

파일 보기

    def _validate_y(self, y):
        """Encode ``y`` as class indices and record the classes on self.

        ``y`` is first passed through ``column_or_1d`` (with warnings
        enabled). The distinct values are stored in ``self.classes_``, their
        count in ``self.n_classes_``, and the per-sample indices into
        ``self.classes_`` are returned.
        """
        flat_y = column_or_1d(y, warn=True)
        classes, encoded = unique(flat_y, return_inverse=True)
        self.classes_ = classes
        self.n_classes_ = len(classes)
        return encoded

예제 #22

0

파일 보기

파일: bag.py 프로젝트: orazaro/kgml

    def _validate_y(self, y):
        """Encode ``y`` as class indices and record the classes on self.

        ``y`` is first passed through ``column_or_1d`` (with warnings
        enabled). The distinct values are stored in ``self.classes_``, their
        count in ``self.n_classes_``, and the per-sample indices into
        ``self.classes_`` are returned.
        """
        flat_y = column_or_1d(y, warn=True)
        classes, encoded = unique(flat_y, return_inverse=True)
        self.classes_ = classes
        self.n_classes_ = len(classes)
        return encoded

예제 #23

0

파일 보기

파일: labeled_bootstraping.py 프로젝트: ruffsl/CS7616P1

 def fit(self, X, y, check_input=True, sample_weight=None):
     """Build a tree from the training set (X, y) and return self.

     ``y`` is brought to two dimensions, copied and encoded column by
     column as class indices; the distinct classes and their counts are
     stored in ``self.classes_`` and ``self.n_classes_``. A criterion and a
     splitter are created when none are configured, then a ``Tree`` is
     built and stored in ``self.tree_``.

     Parameters
     ----------
     X : array with two dimensions (samples, features)
     y : array-like with one or two dimensions
     check_input : boolean, when true X is converted with ``check_arrays``
     sample_weight : array or None, converted to contiguous DOUBLE if given
     """
     # Resolve the random state configured on the tree
     random_state = check_random_state(self.random_state)
     # Convert X only when the caller has not already formatted it
     if check_input:
         X, = check_arrays(X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
     # Get the dimensions of X
     n_samples, self.n_features_ = X.shape
     # Make sure y has at least one dimension
     y = np.atleast_1d(y)
     if y.ndim == 1:
         # Turn a flat y into a single column
         y = np.reshape(y, (-1, 1))
     # Number of output columns
     self.n_outputs_ = y.shape[1]
     # Copy so that the in-place encoding below leaves the caller's y intact
     y = np.copy(y)
     # Distinct class labels of each output
     self.classes_ = []
     # Number of distinct classes of each output
     self.n_classes_ = []
     # range (not the Python-2-only xrange): the block already relies on
     # six for Python 2/3 compatibility.
     for k in range(self.n_outputs_):
         # Distinct labels, and for every sample the index of its label
         classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
         self.classes_.append(classes_k)
         self.n_classes_.append(classes_k.shape[0])
     # Store the class counts as a platform-integer array
     self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
     if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
         y = np.ascontiguousarray(y, dtype=DOUBLE)
     # No configured depth limit means a very large limit
     max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth
     if isinstance(self.max_features, six.string_types):
         # Any string value selects the square root of the feature count
         max_features = max(1, int(np.sqrt(self.n_features_)))
     elif self.max_features is None:
         # None selects all features
         max_features = self.n_features_
     else:
         # Otherwise use the configured value as given
         max_features = self.max_features
     if sample_weight is not None:
         # Sample weights must be contiguous and of DOUBLE dtype
         if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous):
             sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE)
     min_samples_split = self.min_samples_split
     criterion = self.criterion
     if criterion is None:
         # Default to the entropy criterion
         criterion = Entropy(self.n_outputs_, self.n_classes_)
     splitter = self.splitter
     if splitter is None:
         # Default to the best splitter
         splitter = BestSplitter(criterion, max_features, self.min_samples_leaf, random_state)
     # Keep both so they do not have to be created again when retraining
     self.criterion_ = criterion
     self.splitter_ = splitter
     # Create the tree and build it from the encoded data
     self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_, splitter, max_depth, min_samples_split, self.min_samples_leaf, random_state)
     self.tree_.build(X, y, sample_weight=sample_weight)
     # Unwrap the per-output containers when there is only one output
     if self.n_outputs_ == 1:
         self.n_classes_ = self.n_classes_[0]
         self.classes_ = self.classes_[0]
     return self

예제 #24

0

파일 보기

파일: plot_gradient_boosting_regularization.py 프로젝트: dhaba/scikit-docset

# Author: Peter Prettenhofer <*****@*****.**>
#
# License: BSD 3 clause

import numpy as np
import pylab as pl
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils.fixes import unique

# Generate the 12000-sample Hastie et al. 10.2 dataset with a fixed seed and
# store the features as float32.
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)

# map labels from {-1, 1} to {0, 1}
# (``y`` becomes the index of each label inside the sorted ``labels`` array)
labels, y = unique(y, return_inverse=True)

# First 2000 samples are used for training, the remaining ones for testing.
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

# Baseline hyper-parameters; presumably each setting in the loop that follows
# overrides or extends these -- the loop body is not visible in this excerpt.
original_params = {
    'n_estimators': 1000,
    'max_depth': 2,
    'random_state': 1,
    'min_samples_split': 5
}

# Open a new figure for the plots.
pl.figure()

for label, color, setting in [('No shrinkage', 'orange', {
        'learning_rate': 1.0,

예제 #25

0

파일 보기

파일: tree.py 프로젝트: rexshihaoren/MSPrediction-Python

    def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True, sample_weight=None):
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Use ``dtype=np.float32`` for maximum
            efficiency.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).
            Use ``dtype=np.float64`` and ``order='C'`` for maximum
            efficiency.

        sample_mask : deprecated
            Not used by this method. Passing anything other than None only
            emits a ``DeprecationWarning``.

        X_argsorted : deprecated
            Not used by this method. Passing anything other than None only
            emits a ``DeprecationWarning``.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        check_input : boolean, (default=True)
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``max_features`` is an unrecognised string or resolves to a
            value outside ``(0, n_features]``; if the number of labels or of
            sample weights differs from the number of samples; if
            ``sample_weight`` has more than one dimension; or if
            ``min_samples_split``, ``min_samples_leaf`` or ``max_depth`` is
            not greater than zero.
        """
        random_state = check_random_state(self.random_state)

        # Deprecations
        if sample_mask is not None:
            warn(
                "The sample_mask parameter is deprecated as of version 0.14 " "and will be removed in 0.16.",
                DeprecationWarning,
            )

        if X_argsorted is not None:
            warn(
                "The X_argsorted parameter is deprecated as of version 0.14 " "and will be removed in 0.16.",
                DeprecationWarning,
            )

        # Convert data
        if check_input:
            X, = check_arrays(X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            # Work on a copy so the caller's label array is not overwritten.
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            # Replace each output column by the index of its label within the
            # distinct labels of that column, and remember those labels.
            for k in xrange(self.n_outputs_):
                classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])

        else:
            # Regression: no class labels, one "class" per output.
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        # Make sure the targets are contiguous and of dtype DOUBLE.
        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        # max_depth=None is mapped to 2**31 - 1.
        max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth

        # Resolve max_features to an integer number of features.
        if isinstance(self.max_features, six.string_types):
            if self.max_features == "auto":
                # "auto" means sqrt(n_features) for classifiers and all
                # features otherwise.
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    "Invalid value for max_features. Allowed string " 'values are "auto", "sqrt" or "log2".'
                )
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            # Treated as a fraction of n_features_; truncation can yield 0,
            # which is rejected by the range check below.
            max_features = int(self.max_features * self.n_features_)

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if sample_weight is not None:
            # Same dtype/contiguity normalisation as for y, then shape checks.
            if getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous:
                sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError(
                    "Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)
                )

        # Set min_samples_split sensibly
        # (a split must leave at least min_samples_leaf samples on each side)
        min_samples_split = max(self.min_samples_split, 2 * self.min_samples_leaf)

        # Build tree
        # self.criterion may already be a Criterion instance; otherwise it is
        # used as a key into the classification/regression lookup tables.
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

        # Likewise, self.splitter is either a Splitter instance or a key into
        # SPLITTERS.
        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion, max_features, self.min_samples_leaf, random_state)

        self.criterion_ = criterion
        self.splitter_ = splitter
        self.tree_ = Tree(
            self.n_features_,
            self.n_classes_,
            self.n_outputs_,
            splitter,
            max_depth,
            min_samples_split,
            self.min_samples_leaf,
            random_state,
        )

        self.tree_.build(X, y, sample_weight=sample_weight)

        # With a single output, expose the first (only) entry instead of the
        # per-output containers.
        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self

예제 #26

0

파일 보기

파일: lr.py 프로젝트: ISCASDataTeam1/PU-learning

 def fit(self, X, y):
     """Fit the model with ``logistic.fast_logistic_gradient_descent``.

     Stores the distinct labels of ``y`` in ``self.classes_`` and the value
     returned by the optimiser in ``self.theta_``.

     Parameters
     ----------
     X : training samples, passed unchanged to the optimiser.
     y : training labels, passed unchanged to the optimiser.

     Returns
     -------
     self
     """
     # Only the distinct labels are stored, so the inverse indices are not
     # requested from ``unique``.
     self.classes_ = unique(y)
     self.theta_ = logistic.fast_logistic_gradient_descent(
         X, y,
         max_iter=self.n_iter,
         eta0=self.eta0,
         alpha=self.alpha,
         learning_rate=self.learning_rate)
     return self

예제 #27

0

파일 보기

파일: fisher.py 프로젝트: andrewjohnlowe/JetImages

    def fit_multiclass(self, X, y, use_total_scatter=False, solution_norm="N", sigma_sqrd=1e-8, tol=1.0e-3, print_timing=False):
        """
        Fit the Fisher Discriminant model according to the given training data and parameters.
        Based on (but depending on options not exactly the same as) "Algorithm 4" in
        Zhang, et. al. 'Regularized Discriminant Analysis, Ridge Regression and Beyond' Journal of Machine Learning Research 11 (2010) 2199-2228
        NOTE: setting norm_covariance=False and use_total_scatter=True, and solution_norm = 'A' or 'B' will give the algorithm from paper

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
        y : array, shape = [n_samples]
            Target values (integers)
        use_total_scatter : boolean
            If True then use total scatter matrix St = Sum_i (x_i - m)(x_i - m).T instead of Sw
            If False, use Sw = Sum_{c=1... n_classes} Sum_{i; x in class c} norm_c (x_i - m_c)(x_i - m_c).T
                      where norm_c = 1/N_samples_class_c if norm_covariance=True, else norm_c = 1
        solution_norm: boolean
            3 kinds of norms, "A", "B", or "N", were "N" means normalize to 1.  "A" and "B" (see paper reference) have normalizations
            that may be important when consitering n_classes > 2
        sigma_sqrd:  float
            smooth regularization parameter, which is size of singular value where smoothing becomes important.
            NOTE: is fraction in case norm_covariance=False, as a priori the scale of the singular values is not known in this case
        tol:  float
            used for truncated SVD of Sw.  Essentially a form of regularization.  Tol for SVD(R) is 1e-6, fixed right now
        print_timing: boolean
            print time for several matrix operations in the algorithm
        """
        X, y = check_arrays(X, y, sparse_format='dense')
        self.classes_, y = unique( y, return_inverse=True)
        n_samples, n_features = X.shape
        n_classes = len(self.classes_)
        n_samples_perclass = np.bincount(y)
        if n_classes < 2:
            raise ValueError('y has less than 2 classes')
        if self.priors is None:
            self.priors_ = np.bincount(y) / float(n_samples)
        else:
            self.priors_ = self.priors

        if not any( np.array(["A","B","N"])==solution_norm ):
             print 'WARNING: solution_norm must be one of ["A","B","N"]! Exiting'
             sys.exit(2)

        ts = time.time()
                    
        self.means_ = []
        for ind in xrange(n_classes):
            Xg = X[y == ind, :]
            meang = Xg.mean(0)
            self.means_.append(np.asarray(meang))
        if print_timing: print 'fit_multiclass: means took', time.time() - ts

        ts = time.time()
        PI_diag = np.diag( 1.0*n_samples_perclass )                                       # shape(PI_diag) = n_classes x n_classes
        PI_inv = np.diag( 1.0 / (1.0*n_samples_perclass) )                                # shape(PI_inv) = n_classes x n_classes
        PI_sqrt_inv = np.sqrt( PI_inv )                                                   # shape(PI_sqrt_inv) = n_classes x n_classes
        #H = np.identity(n_samples) - (1.0/(1.0*n_samples))*np.ones((n_samples,n_samples))
        E=np.zeros( (n_samples,n_classes) )
        E[[range(n_samples),y]]=1
        if print_timing: print 'fit_multiclass: matrices took', time.time() - ts


        ts = time.time()
        #note: computation of this is fast, can always do it inline, if memory consumption gets large
        Xt_H = X.T - (1.0/(1.0*n_samples))*np.repeat( np.array([X.T.sum(1)]).T, n_samples, axis=1)    # shape(Xt_H) = n_features x n_samples
        if print_timing: print 'fit_multiclass: Xt_H took', time.time() - ts

        ts = time.time()
        #####################################################################################################################
        #Sb = X.T * H * E * PI_inv * E.T * H * X = (X.T * H * E * PI_sqrt_inv) * (X.T * H * E * PI_sqrt_inv).T
        #if norm_covariance: Sb = X.T * H * E * PI_inv * PI_inv * E.T * H * X = (X.T * H * E * PI_inv) * (X.T * H * E * PI_inv).T
        #This norm actually doesn't matter in 2-class, I think it jsut becomes an overall scaling, which gets normalized away
        #I expect id doesn't matter for multiclass either... but not sure
        #to be clear, multi-class fisher does not norm! but then its harder to set the regularization factor for Sw
        #####################################################################################################################

        Xt_H_E_PIsi = None                                                      # shape(Xt_H_E_PIsi) = n_features x n_classes
        if self.norm_covariance:
           Xt_H_E_PIsi =  np.dot(Xt_H, np.dot(E, PI_inv) )
        else:
           Xt_H_E_PIsi = np.dot(Xt_H, np.dot(E, PI_sqrt_inv) )
        if print_timing: print 'fit_multiclass: Xt_H_E_PIsi took', time.time() - ts

        
        #St_reg = ( np.dot(X.T np.dot(H, X)) - (sigma*sigma)*np.identity(n_features))

        ts = time.time()
        #####################################################################################################################
        #Sw = X.T * [ 1 - E*PI_inv*E.T ] * X = X.T * X - M.T * PI * M
        # if norm_covariance: Sw = X.T * [ P - E*PI_inv*PI_inv*E.T ] * X = X.T *P * X - M.T * M
        #####################################################################################################################
        M = np.asarray(self.means_)                                              # shape(M) = n_classes x n_features
        #P = np.diag( np.dot(E, 1.0/(1.0*n_samples_perclass)) )
        P_vec = np.array([np.dot(E, 1.0/(1.0*n_samples_perclass))]).T            # shape(P_vec) = n_samples x 1
        Sw=None                                                                  # shape(Sw) = n_features x n_features 
        if not use_total_scatter:
            if self.norm_covariance:
                #Sw = np.inner( np.inner(X.T, P), X.T) - np.dot( M.T, M)
                Sw = np.inner( (P_vec*X).T, X.T) - np.dot( M.T, M)
            else:
                Sw = np.inner(X.T, X.T) - np.dot( M.T, np.dot(PI_diag, M))
                
            if print_timing: print 'fit_multiclass: Sw took', time.time() - ts

        #####################################################################################################################
        #assume (I think true) for condensed svd, where we only take vectors for non-zero singular values
        #that if M is symmetric, then Uc=Vc where condensed_svd(M) = Uc * Sc * Vc.T
        #this is because the singular values of a symmetric matrix are the abosolute values of the non-zero eigenvalues
        #so assuming the singular vectors of the non-zero singular values are the same as eigen vectors
        #and since condensed svd only keeps singular vectors for non-zero singular values, should have Uc==Vc
        #####################################################################################################################


        ts = time.time()
        Uc, Sc, Utc, Sc_norm = None, None, None, None
        if use_total_scatter:
            St_norm = (1.0/(1.0*n_samples)) if self.norm_covariance else 1.0
            Uc, Sc, Utc, Sc_norm = self.condensed_svd( St_norm * np.inner(Xt_H, X.T), tol, store_singular_vals=True )
        else:
            Uc, Sc, Utc, Sc_norm = self.condensed_svd( Sw, tol, store_singular_vals=True )
        if print_timing: print 'fit_multiclass: Uc, Sc, Utc took', time.time() - ts

        ts = time.time()
        #scale up sigma to appropriate range of singular values
        reg_factor = sigma_sqrd * Sc_norm 
        St_reg_inv = np.dot( Uc, np.dot(np.diag(1.0/(Sc + reg_factor)), Utc) )    # shape(St_reg_inv) = n_features x n_features
        if print_timing: print 'fit_multiclass: St_reg_inv took', time.time() - ts

        ts = time.time()
        G = np.dot(St_reg_inv, Xt_H_E_PIsi)                                       # shape(G) = n_features x n_classes
        if print_timing: print 'fit_multiclass: G took', time.time() - ts

        ts = time.time()
        R = np.dot( Xt_H_E_PIsi.T, G)                                             # shape(R) = n_classes x n_classes
        if print_timing: print 'fit_multiclass: R took', time.time() - ts

        ts = time.time()
        Vr, Lr, Vtr, Lr_norm =  self.condensed_svd( R, tol=1e-6 )                 # shape(Vr) = n_classes x rank_R
        if print_timing: print 'fit_multiclass: Vr, Lr, Vtr took', time.time() - ts
        
        ts = time.time()
        W = np.dot( G, Vr)                                                        # shape(W) = n_features x rank_R
        if print_timing: print 'fit_multiclass: B took', time.time() - ts
        
        if solution_norm=="A":
            W = np.dot(W, np.diag(1.0 / np.sqrt(Lr)) )

        elif solution_norm=="N":
            for i in range( W.shape[1] ):
                if linalg.norm(W[:,i]) != 0:
                    W[:,i] /= linalg.norm(W[:,i])
                else:
                    print "WARNING: Fisher discriminant line has norm=0 --> no discriminating curved found! Exiting"
                    sys.exit(2)

        
        self.w_ = W.T  #transpose here just because want to store the matrix where rows have length n_features, i.e. are discriminants 

        return self

예제 #28

0

파일 보기

파일: fisher.py 프로젝트: andrewjohnlowe/JetImages

    def fit(self, X, y, store_covariance=False, tol=1.0e-4,
            do_smooth_reg=False, cov_class=None, cov_power=1):
        """
        Fit the Fisher Discriminant model according to the given training data and parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
        y : array, shape = [n_samples]
            Target values (integers)
        store_covariance : boolean
            If True the covariance matrix of each class and each iteration is computed
            and stored in `self.covs_` attribute. has dimensions [n_iterations][2] where 2 is for nclasses = 2
        tol:  float
            used for regularization, either for svd series truncation or smoothing.
        do_smooth_reg: boolean
            If False, truncate SVD matrix inversion for singular values less then tol.
            If True, apply smooth regularization (filter factor) on inversion, such that 1/s_i --> s_i/(s_i^2 + tol^2), where s_i is singular value
        """
        X, y = check_arrays(X, y, sparse_format='dense')
        self.classes_, y = unique( (y>0), return_inverse=True)
        n_samples, n_features = X.shape
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError('y has less than 2 classes')
        if self.priors is None:
            self.priors_ = np.bincount(y) / float(n_samples)
        else:
            self.priors_ = self.priors

        self.means_ = []
        self.covs_  = []
        
        wvecs = []

        # Group means n_classes*n_features matrix

        means = []
        nevt = np.zeros(n_classes)
        Xc = []
        Xg = []
        covs = []
        cov = None
            
        for ind in xrange(n_classes):
            Xg = X[y == ind, :]
            meang = Xg.mean(0)
            means.append(meang)
            nevt[ind] = Xg.shape[0]
                
        # centered group data
            if cov_class is None or cov_class == ind:
                Xgc = Xg - meang
                covg = np.zeros((n_features, n_features))
                covg += np.dot(Xgc.T, Xgc)
                covs.append(covg)
             

        # check rank of Sb = m * m.T
        # if rank = 0, we are in null space of Sb, and can not calculate fisher component
        m = means[0] - means[1]
        if linalg.norm(m) ==0:
            print "WARNING: Inter-class matrix is zero, i.e. classes have same mean!"
            print "         Fisher can not discriminate in this case --> Exiting"
            sys.exit(2)
            
        Sb = np.outer( m, m )
        #svdvalsSb = linalg.svdvals( Sb )
        #rank = np.sum( svdvalsSb > tol )
        #print "rank Sb = ",rank            

        self.means_.append( np.asarray(means) )

        #covs_array = [ np.asarray(covs[0]) , np.asarray(covs[1]) ]
        covs_array = [np.asarray(cc) for cc in covs]
        if self.norm_covariance:
            for ii in range(len(covs_array)):
                covs_array[ii] /= ( (nevt[ii]-1.0) if nevt[ii] > 1 else 1 )
#            covs_array[0] /= ( (nevt[0]-1.0) if nevt[0] > 1 else 1 )
#            covs_array[1] /= ( (nevt[1]-1.0) if nevt[1] > 1 else 1 )

        if store_covariance:
            self.covs_.append( covs_array )

        #if norm_covariance:
        #    nevt[0] = nevt[0] if nevt[0] > 1 else 2
        #    nevt[1] = nevt[1] if nevt[1] > 1 else 2
        #    self.covs_.append( [ np.asarray(covs[0]) / (nevt[0]-1.0), np.asarray(covs[1]) / (nevt[1]-1.0) ] )
        #else:
        #    self.covs_.append( [ np.asarray(covs[0]), np.asarray(covs[1]) ] )

        #Sw = covs_array[0] + covs_array[1]
        Sw = sum(covs_array)

        #----------------------------
        # for 2 class system, need to solve for w in
        # Sb * w = lambda * Sw * w
        # where lambda is eigenvalue of this generalized eigenvalue problem
        # however, Sb * w = m mT * w = m * constant
        # implies we only need to solve m = Sw * w   
        # (overall constant wet later with ||w||=1 )
        # solution: Sw = U*S*Vh using svd ==> S.inv*U.T*m = Vh *w ==> w = Sum_i^rank(S) vh_i * (U.T * m)_i / S_i
        # where vh_i is a vector
        #----------------------------
        # step 1)  svd of Sw
        # step 2) calculate sum for all non singular components
        U, S, V = linalg.svd(Sw)        

        rank = np.sum(S > tol)
        #print "rank Sw = ", rank

        S = np.power(S, cov_power)
       
        UTm = np.inner(U.T, m)
        w = np.zeros(n_features)
        for i in range(len(S)):
            if do_smooth_reg==True:
                w += V[i,:] * UTm[i] * ( S[i] / (S[i]*S[i]+ tol**(2*cov_power)) )
                #w += V[i,:] * UTm[i] * ( S[i] / (S[i]*S[i] + tol*tol) )
            else:
                if S[i] < tol: 
                    continue
                w += V[i,:] * UTm[i] / S[i]

        if linalg.norm(w) != 0:
            w /= linalg.norm(w)
        else:
            print "WARNING: Fisher discriminant line has norm=0 --> no discriminating curved found! Exiting"
            sys.exit(2)
            
        #check if signal (1) projection smaller than bkg (0), if so, add minus sign
        if(np.inner(means[1],w) < np.inner(means[0],w)):
            w *= (-1.0)

        wvecs.append( w ) 

        
        self.w_ = np.asarray(wvecs)
        self.n_components_found_ = len(self.w_)
        self.singular_vals = S

        return self

예제 #29

0

파일 보기

파일: lr.py 프로젝트: ISCASDataTeam1/PU-learning

 def fit(self, X, y):
     """Fit the model with ``logistic.lbfgs_logistic_regression``.

     Stores the distinct labels of ``y`` in ``self.classes_`` and the value
     returned by the optimiser in ``self.theta_``.

     Parameters
     ----------
     X : training samples, passed unchanged to the optimiser.
     y : training labels, passed unchanged to the optimiser.

     Returns
     -------
     self
     """
     # Only the distinct labels are stored, so the inverse indices are not
     # requested from ``unique``.
     self.classes_ = unique(y)
     self.theta_ = logistic.lbfgs_logistic_regression(
         X, y,
         alpha=self.alpha,
         n_iter=self.n_iter)
     return self

예제 #30

0

파일 보기

파일: lr.py 프로젝트: ISCASDataTeam1/PU-learning

 def fit(self, X, y):
     """Fit with ``logistic.posonly_multinomial_logistic_gradient_descent``.

     Sets the following attributes:

     * ``classes_``  -- the distinct labels of ``y``;
     * ``minimumC_`` -- ``sum(y) / len(y)`` (the fraction of positives,
       assuming ``y`` is 0/1 encoded -- TODO confirm against callers);
     * ``q_``        -- ``1 / (1 - minimumC_) - 1``;
     * ``b_``, ``w_`` -- the pair returned by the optimiser.

     Parameters
     ----------
     X : training samples, passed unchanged to the optimiser.
     y : training labels, passed unchanged to the optimiser.

     Returns
     -------
     self

     Raises
     ------
     ZeroDivisionError
         If ``y`` is empty, or if ``minimumC_`` equals exactly 1.0.
     """
     # Only the distinct labels are stored, so the inverse indices are not
     # requested from ``unique``.
     self.classes_ = unique(y)
     self.minimumC_ = float(np.sum(y)) / len(y)
     self.q_ = (1.0 / (1.0 - self.minimumC_)) - 1.0
     self.b_, self.w_ = logistic.posonly_multinomial_logistic_gradient_descent(
         X, y,
         max_iter=self.n_iter,
         eta0=self.eta0,
         c=self.c)
     return self

예제 #31

0

파일 보기

파일: test_class_weight.py 프로젝트: pmnyc/Data_Engineering_Collections

def test_compute_class_weight():
    """Test (and demo) compute_class_weight."""
    # Three classes with 3, 2 and 1 samples; ``y`` holds the class indices.
    classes, y = unique(np.asarray([2, 2, 2, 3, 3, 4]), return_inverse=True)
    cw = compute_class_weight("auto", classes, y)
    # The weights are expected to sum to the number of classes.  Compare with
    # the scalar count rather than the shape tuple, so the check does not
    # depend on array broadcasting inside the assertion helper.
    assert_almost_equal(cw.sum(), classes.shape[0])
    # The less frequent a class, the larger its expected weight.
    assert_true(cw[0] < cw[1] < cw[2])

예제 #32

0

파일 보기

파일: fisher.py 프로젝트: andrewjohnlowe/JetImages

    def fit(self, X, y):
        """
        Fit the Kernelized Fisher Discriminant model according to the given training data and parameters.
        Based on "Algorithm 5" in
        Zhang, et. al. 'Regularized Discriminant Analysis, Ridge Regression and Beyond' Journal of Machine Learning Research 11 (2010) 2199-2228
        NOTE: setting norm_covariance=False and use_total_scatter=True, and solution_norm = 'A' or 'B' will give the algorithm from paper

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.

        y : array, shape = [n_samples]
            Target values (integers)
        
        """
        X, y = check_arrays(X, y, sparse_format='dense')
        self.classes_, y = unique( y, return_inverse=True)
        n_samples, n_features = X.shape
        n_classes = len(self.classes_)
        n_samples_perclass = np.bincount(y)
        if n_classes < 2:
            raise ValueError('y has less than 2 classes')
        if self.priors is None:
            self.priors_ = np.bincount(y) / float(n_samples)
        else:
            self.priors_ = self.priors

        ts = time.time()
                    
        self.means_ = []
        for ind in xrange(n_classes):
            Xg = X[y == ind, :]
            meang = Xg.mean(0)
            self.means_.append(np.asarray(meang))
        if self.print_timing: print 'KernelFisher.fit: means took', time.time() - ts


        ts = time.time()
        PI_diag = np.diag( 1.0*n_samples_perclass )                                        # shape(PI_diag) = n_classes x n_classes
        PI_inv = np.diag( 1.0 / (1.0*n_samples_perclass) )                                 # shape(PI_inv) = n_classes x n_classes
        PI_sqrt_inv = np.sqrt( PI_inv )                                                    # shape(PI_sqrt_inv) = n_classes x n_classes
        #H = np.identity(n_samples) - (1.0/(1.0*n_samples))*np.ones((n_samples,n_samples))
        E=np.zeros( (n_samples,n_classes) )                                                # shape(E) = n_samples x n_classes
        E[[range(n_samples),y]]=1
        E_PIsi = np.dot(E, PI_sqrt_inv)
        One_minus_E_Pi_Et = np.identity(n_samples) - np.inner( E, np.inner(PI_diag, E).T ) # shape(One_minus_E_Pi_Et) = n_samples x n_samples
        if self.print_timing: print 'KernelFisher.fit: matrices took', time.time() - ts


        #####################################################################################################################
        #C = HKH = (I - 1/n 1x1.T) K (I - 1/n 1x1.T) = (K -  1xK_mean.T) * (I - 1/n 1x1.T)
        #        = K - K_meanx1.T - 1xK_mean.T + K_allmean 1x1
        #  --> which is the same as what self._centerer.fit_transform(C) performs
        #
        # if use_total_scatter=False,
        #      then using Sw which is (1-E*Pi*E.T)K(1-E*Pi*E.T)
        #####################################################################################################################
        ts = time.time()
        C = self._get_kernel(X) 
        K_mean = np.sum(C, axis=1) / (1.0*C.shape[1])

        if self.use_total_scatter:
            C = self._centerer.fit_transform(C)
        else:
            C = np.inner( One_minus_E_Pi_Et, np.inner(C, One_minus_E_Pi_Et).T)
        if self.print_timing: print 'KernelFisher.fit: Kernel Calculation took', time.time() - ts


        ts = time.time()
        Uc, Sc, Utc, Sc_norm = self.condensed_svd( C, self.tol, store_singular_vals=True )
        if self.print_timing: print 'KernelFisher.fit: Uc, Sc, Utc took', time.time() - ts


        ts = time.time()
        #scale up sigma to appropriate range of singular values
        reg_factor = self.sigma_sqrd * Sc_norm 
        St_reg_inv = np.inner( Uc, np.inner(np.diag(1.0/(Sc + reg_factor)), Utc.T).T )   
        if self.print_timing: print 'KernelFisher.fit: St_reg_inv took', time.time() - ts

        ts = time.time()
        R = np.inner(E_PIsi.T, np.inner(C, np.inner( St_reg_inv, E_PIsi.T ).T ).T )
        if self.print_timing: print 'KernelFisher.fit: R took', time.time() - ts


        ts = time.time()
        Vr, Lr, Vtr, Lr_norm =  self.condensed_svd( R, tol=1e-6 )                
        if self.print_timing: print 'KernelFisher.fit: Vr, Lr, Vtr took', time.time() - ts


        ts = time.time()
        #####################################################################################################################
        #This capital Z is Upsilon.T * H from equation (22)
        #####################################################################################################################
        #Z = np.inner( np.diag(1.0 / np.sqrt(Lr)), np.inner(Vtr, np.inner(E_PIsi.T, np.inner(C, St_reg_inv.T ).T ).T ).T )
        Z = np.inner( np.inner( np.inner( np.inner( np.diag(1.0 / np.sqrt(Lr)), Vtr.T), E_PIsi), C.T), St_reg_inv)

        Z = (Z.T - (Z.sum(axis=1) / (1.0*Z.shape[1])) ).T
        if self.print_timing: print 'KernelFisher.fit: Z took', time.time() - ts

        self.Z = Z
        self.n_components_found_ = Z.shape[0]

        #####################################################################################################################
        #This K_mean is (1/n) K*1_n from equation (22)
        #####################################################################################################################
        self.K_mean = K_mean

        #print Z.shape, K_mean.shape, self.n_components_found_

        self.X_fit_ = X
        return self

예제 #33

0

파일 보기

파일: plot_gradient_boosting_regularization.py 프로젝트: 2011200799/scikit-learn

# Author: Peter Prettenhofer <*****@*****.**>
#
# License: BSD 3 clause

import numpy as np
import pylab as pl
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils.fixes import unique


# Generate the 12000-sample Hastie et al. 10.2 dataset with a fixed seed and
# store the features as float32.
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)

# map labels from {-1, 1} to {0, 1}
# (``y`` becomes the index of each label inside the sorted ``labels`` array)
labels, y = unique(y, return_inverse=True)

# First 2000 samples are used for training, the remaining ones for testing.
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

# Baseline hyper-parameters; presumably each setting in the loop that follows
# overrides or extends these -- the loop body is not visible in this excerpt.
original_params = {'n_estimators': 1000, 'max_depth': 2, 'random_state': 1,
                   'min_samples_split': 5}

# Open a new figure for the plots.
pl.figure()

for label, color, setting in [('No shrinkage', 'orange',
                               {'learning_rate': 1.0, 'subsample': 1.0}),
                              ('learning_rate=0.1', 'turquoise',
                               {'learning_rate': 0.1, 'subsample': 1.0}),
                              ('subsample=0.5', 'blue',
                               {'learning_rate': 1.0, 'subsample': 0.5}),

예제 #34

0

파일 보기

파일: lr.py 프로젝트: ISCASDataTeam1/PU-learning

 def fit(self, X, y):
     """Fit with ``logistic.fast_modified_logistic_gradient_descent``.

     Stores the distinct labels of ``y`` in ``self.classes_`` and the pair
     returned by the optimiser in ``self.theta_`` and ``self.b_``.

     Parameters
     ----------
     X : training samples, passed unchanged to the optimiser.
     y : training labels, passed unchanged to the optimiser.

     Returns
     -------
     self
     """
     # Only the distinct labels are stored, so the inverse indices are not
     # requested from ``unique``.
     self.classes_ = unique(y)
     self.theta_, self.b_ = logistic.fast_modified_logistic_gradient_descent(
         X, y,
         max_iter=self.n_iter,
         eta0=self.eta0,
         b=self.b)
     return self