Example #1
def pairwise_distances_no_broadcast(X, Y):
    """Utility function to calculate row-wise euclidean distance of two matrix.
    Different from pair-wise calculation, this function would not broadcast.

    For instance, X and Y are both (4,3) matrices, the function would return
    a distance vector with shape (4,), instead of (4,4).

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        First input samples

    Y : array of shape (n_samples, n_features)
        Second input samples

    Returns
    -------
    distance : array of shape (n_samples,)
        Row-wise euclidean distance of X and Y
    """
    X = check_array(X)
    Y = check_array(Y)

    if X.shape[0] != Y.shape[0] or X.shape[1] != Y.shape[1]:
        raise ValueError("pairwise_distances_no_broadcast function receive"
                         "matrix with different shapes {0} and {1}".format(
            X.shape, Y.shape))
    return _pairwise_distances_no_broadcast_helper(X, Y)
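A minimal usage sketch of the row-wise behaviour described in the docstring, written with plain NumPy instead of the private helper (the array values are illustrative):

import numpy as np

X = np.arange(12, dtype=float).reshape(4, 3)
Y = X + 1.0
# One distance per row, shape (4,), rather than a (4, 4) pairwise matrix.
row_dist = np.sqrt(((X - Y) ** 2).sum(axis=1))
print(row_dist)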
Example #2
    def predict(self, X, categorical=None):
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            New data to predict.
        categorical : int, list or tuple, optional
            Indices of columns that contain categorical data.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted."

        if categorical is not None:
            assert isinstance(categorical, (int, list, tuple)), "The 'categorical' \
                argument needs to be an integer with the index of the categorical \
                column in your data, or a list or tuple of several of them, \
                but it is a {}.".format(type(categorical))

        X = pandas_to_numpy(X)
        Xnum, Xcat = _split_num_cat(X, categorical)
        Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
        Xcat, _ = encode_features(Xcat, enc_map=self._enc_map)
        return _labels_cost(Xnum, Xcat, self._enc_cluster_centroids,
                            self.num_dissim, self.cat_dissim, self.gamma)[0]
Example #3
def check_array_with_weights(X, weights, **kwargs):
    """Utility to validate data and weights.

    This calls check_array on X and weights, making sure results match.
    """
    if weights is None:
        return check_array(X, **kwargs), weights

    # Always use copy=False for weights
    kwargs_weights = dict(kwargs)
    kwargs_weights.update(copy=False)
    weights = check_array(weights, **kwargs_weights)

    # Always use force_all_finite=False for X
    kwargs_X = dict(kwargs)
    kwargs_X.update(force_all_finite=False)
    X = check_array(X, **kwargs_X)

    # Make sure shapes match and missing data has weights=0
    if X.shape != weights.shape:
        raise ValueError("Shape of `X` and `weights` should match")

    Wzero = (weights == 0)
    X[Wzero] = 0

    if not np.all(np.isfinite(X)):
        raise ValueError("Input contains NaN or infinity without "
                         "a corresponding zero in `weights`.")
    return X, weights
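A small sketch of the weight/missing-data convention enforced above: entries with zero weight may be NaN, and they are zeroed out before the finiteness check (toy arrays assumed):

import numpy as np

X = np.array([[1.0, np.nan], [3.0, 4.0]])
weights = np.array([[1.0, 0.0], [1.0, 1.0]])
X[weights == 0] = 0          # missing entries must carry zero weight
assert np.all(np.isfinite(X))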
Example #4
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    # Clipping
    Y = np.clip(y_pred, eps, 1 - eps)

    # This happens in cases when elements in y_pred have type "str".
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # If y_pred is of single dimension, assume y_true to be binary
    # and then check.
    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Check if dimensions are consistent.
    val.check_consistent_length(T, Y)
    T = val.check_array(T)
    Y = val.check_array(Y)
    if T.shape[1] != Y.shape[1]:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (T.shape[1], Y.shape[1]))

    # Renormalize
    Y /= Y.sum(axis=1)[:, np.newaxis]
    loss = -(T * np.log(Y)).sum(axis=1)

    return _weighted_sum(loss, sample_weight, normalize)
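A numeric sketch of the clipping and renormalisation steps above, on a hand-made binary example (the values are illustrative, not taken from any dataset):

import numpy as np

eps = 1e-15
T = np.array([[0, 1], [1, 0]])                    # one-hot true labels
Y = np.clip(np.array([[0.2, 0.8], [0.6, 0.4]]), eps, 1 - eps)
Y = Y / Y.sum(axis=1)[:, np.newaxis]              # renormalise rows to sum to 1
loss = -(T * np.log(Y)).sum(axis=1)               # per-sample log loss
print(loss.mean())                                # ~0.367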
Example #5
  def _process_inputs(self, X, constraints):

    self.X_ = X = check_array(X)

    # check to make sure that no two constrained vectors are identical
    a,b,c,d = constraints
    no_ident = vector_norm(X[a] - X[b]) > 1e-9
    a, b = a[no_ident], b[no_ident]
    no_ident = vector_norm(X[c] - X[d]) > 1e-9
    c, d = c[no_ident], d[no_ident]
    if len(a) == 0:
      raise ValueError('No non-trivial similarity constraints given for MMC.')
    if len(c) == 0:
      raise ValueError('No non-trivial dissimilarity constraints given for MMC.')

    # init metric
    if self.A0 is None:
      self.A_ = np.identity(X.shape[1])
      if not self.diagonal:
        # Don't know why division by 10... it's in the original code
        # and seems to affect the overall scale of the learned metric.
        self.A_ /= 10.0
    else:
      self.A_ = check_array(self.A0)

    return a,b,c,d
Example #6
def _impose_f_order(X):
    """Helper Function"""
    # important to access flags instead of calling np.isfortran,
    # this catches corner cases.
    if X.flags.c_contiguous:
        return check_array(X.T, copy=False, order='F'), True
    else:
        return check_array(X, copy=False, order='F'), False
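A short sketch of the contiguity flags the helper above relies on (toy array assumed):

import numpy as np

X = np.ones((3, 2))                              # C-contiguous by default
print(X.flags.c_contiguous)                      # True -> the helper would transpose
print(np.asfortranarray(X).flags.f_contiguous)   # True after conversion to F order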
Example #7
 def _prepare_inputs(self, X, W):
   self.X_ = X = check_array(X)
   W = check_array(W, accept_sparse=True)
   # set up prior M
   if self.use_cov:
     self.M_ = pinvh(np.cov(X, rowvar = False))
   else:
     self.M_ = np.identity(X.shape[1])
   L = laplacian(W, normed=False)
   return X.T.dot(L.dot(X))
Example #8
    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)
        n_samples, n_features = X.shape

        # check parameters
        # number of clusters are default to 8
        self._validate_estimator(default=MiniBatchKMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state))

        self.clustering_estimator_.fit(X=X, y=y)
        # Get the labels of the clustering results
        # labels_ is consistent across sklearn clustering algorithms
        self.cluster_labels_ = self.clustering_estimator_.labels_
        self.cluster_sizes_ = np.bincount(self.cluster_labels_)
        self._set_cluster_centers(X, n_features)
        self._set_small_large_clusters(n_samples)

        self.decision_scores_ = self._decision_function(X,
                                                        self.cluster_labels_)

        self._process_decision_scores()
        return self
Example #9
File: hdda.py Project: mfauvel/HDDA
    def predict_proba(self, X):
        """
        Predict the membership probabilities for the data samples
        in X using trained model.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        Returns
        -------
        proba : array, shape (n_samples, n_clusters)
        """
        X = check_array(X, copy=False, order='C', dtype=sp.float64)
        K = self.score_samples(X)
        T = sp.empty_like(K)

        # Compute the log-likelihood
        K *= 0.5

        # Compute the posterior
        with sp.errstate(over='ignore'):
            for c in xrange(self.C):
                T[:, c] = 1 / sp.exp(K-K[:, c][:, sp.newaxis]).sum(axis=1)

        return T
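The normalisation used above is an overflow-safe softmax: 1 / sum_j exp(K_j - K_c) equals exp(K_c) / sum_j exp(K_j). A tiny numeric sketch with NumPy (toy K assumed):

import numpy as np

K = np.array([[2.0, 0.0]])
posterior_0 = 1.0 / np.exp(K - K[:, 0][:, np.newaxis]).sum(axis=1)
softmax_0 = np.exp(K[:, 0]) / np.exp(K).sum(axis=1)
print(posterior_0, softmax_0)   # both ~0.8808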
Example #10
File: hdda.py Project: mfauvel/HDDA
    def score_samples(self, X, y=None):
        """Compute the negative weighted log probabilities for each sample.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        Returns
        -------
        log_prob : array, shape (n_samples, n_clusters)
            Log probabilities of each data point in X.
        """
        X = check_array(X, copy=False, order='C', dtype=sp.float64)
        nt, d = X.shape
        K = sp.empty((nt, self.C))

        # Start the prediction for each class
        for c in xrange(self.C):
            # Compute the constant term
            K[:, c] = self.logdet[c] - 2*sp.log(self.prop[c]) + self.cst

            # Remove the mean
            Xc = X - self.mean[c]

            # Do the projection
            Px = sp.dot(Xc,
                        sp.dot(self.Q[c], self.Q[c].T))
            temp = sp.dot(Px, self.Q[c]/sp.sqrt(self.a[c]))
            K[:, c] += sp.sum(temp**2, axis=1)
            K[:, c] += sp.sum((Xc - Px)**2, axis=1)/self.b[c]

        return -K
Example #11
File: hdda.py Project: mfauvel/HDDA
    def score(self, X, y=None):
        """Compute the per-sample log-likelihood of the given data X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        Returns
        -------
        log_likelihood : float
            Log likelihood of the Gaussian mixture given X.

        """

        X = check_array(X, copy=False, order='C', dtype=sp.float64)

        # Get some parameters
        n = X.shape[0]

        # Compute the membership function
        K = self.score_samples(X)

        # Compute the log-likelihood
        K *= 0.5
        Km = K.max(axis=1)
        Km.shape = (n, 1)

        # Logsumexp trick
        LL = (sp.log(sp.exp(K-Km).sum(axis=1))[:, sp.newaxis]+Km).sum()

        return LL
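A minimal sketch of the log-sum-exp trick used above: subtracting the row maximum keeps exp() from overflowing without changing the result (toy values assumed):

import numpy as np

K = np.array([[1000.0, 999.0]])          # naive np.exp(K) would overflow
Km = K.max(axis=1)[:, np.newaxis]
LL = (np.log(np.exp(K - Km).sum(axis=1))[:, np.newaxis] + Km).sum()
print(LL)                                # ~1000.313, finite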
Example #12
File: pca.py Project: jakevdp/wpca
    def fit_transform(self, X, y=None):
        """Fit the model with X and apply the dimensionality reduction on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            New data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
        """
        X = check_array(X)
        if self.n_components is None:
            n_components = X.shape[1]
        else:
            n_components = self.n_components

        self.mean_ = X.mean(0)
        U, s, VT = np.linalg.svd(X - self.mean_)
        self.components_ = VT[:n_components]
        var = s ** 2 / X.shape[0]
        self.explained_variance_ = var[:n_components]
        self.explained_variance_ratio_ = var[:n_components] / var.sum()
        return s[:n_components] * U[:, :n_components]
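A sketch of the SVD identity behind the return value above: for centred data, U * s equals the projection of X onto the principal axes (random toy data assumed):

import numpy as np

rng = np.random.RandomState(0)
Xc = rng.randn(10, 3)
Xc -= Xc.mean(0)
U, s, VT = np.linalg.svd(Xc, full_matrices=False)
np.testing.assert_allclose(U * s, Xc @ VT.T, atol=1e-10)   # scores match the projection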
Example #13
    def predict(self, X):
        """Predict class for X.

        Parameters
        ----------
        X : Array-like of shape [n_samples, n_features]
            The input to classify.

        Returns
        -------
        y : array of shape = [n_samples]
            The predicted classes.
        """

        X = check_array(X)

        if self.trees_ is None:
            raise Exception("Pattern trees not initialized. Perform a fit first.")

        y_classes = np.zeros((X.shape[0], len(self.classes_)))
        for i, c in enumerate(self.classes_):
            y_classes[:, i] = self.trees_[i](X)

        # predict the maximum value
        return self.classes_.take(np.argmax(y_classes, -1))
Example #14
  def fit(self, X, y, random_state=np.random):
    """Create constraints from labels and learn the SDML model.

    Parameters
    ----------
    X : array-like, shape (n, d)
        data matrix, where each row corresponds to a single instance
    y : array-like, shape (n,)
        data labels, one for each instance
    random_state : {numpy.random.RandomState, int}, optional
        Random number generator or random seed. If not given, the singleton
        numpy.random will be used.

    Returns
    -------
    self : object
        Returns the instance.
    """
    y = check_array(y, ensure_2d=False)
    num_constraints = self.num_constraints
    if num_constraints is None:
      num_classes = len(np.unique(y))
      num_constraints = 20 * num_classes**2

    c = Constraints.random_subset(y, self.num_labeled,
                                  random_state=random_state)
    adj = c.adjacency_matrix(num_constraints, random_state=random_state)
    return SDML.fit(self, X, adj)
Example #15
    def predict(self, X):
        """Predict multi-output variable using a model
         trained for each target variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        Returns
        -------
        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets predicted across multiple predictors.
            Note: Separate models are generated for each predictor.
        """
        check_is_fitted(self, 'estimators_')
        if not hasattr(self.estimator, "predict"):
            raise ValueError("The base estimator should implement a predict method")

        X = check_array(X, accept_sparse=True)

        y = Parallel(n_jobs=self.n_jobs)(delayed(parallel_helper)(e, 'predict', X)
                                         for e in self.estimators_)

        return np.asarray(y).T
Example #16
    def predict_learned(self, X, return_std=False):
        X = check_array(X)

        # Predict based on GP posterior
        K_trans = self.kernel_(X, self.X_train_learned)
        y_mean = K_trans.dot(self.alpha_learned)  # Line 4 (y_mean = f_star)
        y_mean = self.y_train_mean_learned + y_mean  # undo normalization
        if return_std:
            # compute inverse K_inv of K based on its Cholesky
            # decomposition L and its inverse L_inv
            L_inv = solve_triangular(self.L_learned.T, np.eye(self.L_learned.shape[0]))
            K_inv = L_inv.dot(L_inv.T)
            # Compute variance of predictive distribution
            y_var = self.kernel_.diag(X)
            y_var -= np.einsum("ki,kj,ij->k", K_trans, K_trans, K_inv)

            # Check if any of the variances is negative because of
            # numerical issues. If yes: set the variance to 0.
            y_var_negative = y_var < 0
            if np.any(y_var_negative):
                warnings.warn("Predicted variances smaller than 0. "
                        "Setting those variances to 0.")
                y_var[y_var_negative] = 0.0
            return y_mean, np.sqrt(y_var)
        else:
            return y_mean
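A small sketch of how the kernel inverse is recovered from a Cholesky factor, as in the return_std branch above (the positive-definite matrix below is a toy assumption):

import numpy as np
from scipy.linalg import cholesky, solve_triangular

K = np.array([[2.0, 0.5], [0.5, 1.0]])
L = cholesky(K, lower=True)
L_inv = solve_triangular(L.T, np.eye(2))       # inverse of the upper factor
K_inv = L_inv @ L_inv.T
np.testing.assert_allclose(K_inv, np.linalg.inv(K), atol=1e-12)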
Example #17
def chi2(X, y):
    X = check_array(X, accept_sparse='csr')
    if np.any((X.data if issparse(X) else X) < 0):
        raise ValueError("Input X must be non-negative.")

    Y = MultiLabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    observed = safe_sparse_dot(Y.T, X)          # n_classes * n_features

    feature_count = check_array(X.sum(axis=0))
    class_prob = check_array(Y.mean(axis=0))
    expected = np.dot(class_prob.T, feature_count)

    return _chisquare(observed, expected)
def chi2_contingency_matrix(X_train, y_train):
    X = X_train.copy()
    X.data = np.ones_like(X.data)

    X = check_array(X, accept_sparse='csr')
    if np.any((X.data if issparse(X) else X) < 0):
        raise ValueError("Input X must be non-negative.")

    Y = LabelBinarizer().fit_transform(y_train)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    observed = safe_sparse_dot(Y.T, X)  # n_classes * n_features

    # feature_count = check_array(X.sum(axis=0))
    # class_prob = check_array(Y.mean(axis=0))
    feature_count = X.sum(axis=0).reshape(1, -1)
    class_prob = Y.mean(axis=0).reshape(1, -1)
    expected = np.dot(class_prob.T, feature_count)

    observed = np.asarray(observed, dtype=np.float64)

    k = len(observed)
    # Reuse observed for chi-squared statistics
    contingency_matrix = observed
    contingency_matrix -= expected
    contingency_matrix **= 2

    expected[expected == 0.0] = 1.0

    contingency_matrix /= expected

    # weights = contingency_matrix.max(axis=0)

    return contingency_matrix
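A toy numeric sketch of the observed/expected construction shared by both functions above (the counts below are made up):

import numpy as np

Y = np.array([[1, 0], [0, 1], [1, 0]])          # binarised labels, 2 classes
X = np.array([[2., 0.], [1., 1.], [0., 3.]])    # non-negative feature counts
observed = Y.T @ X                              # n_classes x n_features
expected = np.outer(Y.mean(axis=0), X.sum(axis=0))
print(observed)
print(expected)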
Example #19
    def fit(self, X, y=None):
        """Don't trust the documentation of this module!

        Compute the mean and std to be used for later scaling.

        Parameters
        ----------
        X : array-like or CSR matrix with shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.
        """
        X = check_array(X, copy=self.copy, accept_sparse="csc",
                         ensure_2d=False)
        if warn_if_not_float(X, estimator=self):
            # Costly conversion, but otherwise the pipeline will break:
            # https://github.com/scikit-learn/scikit-learn/issues/1709
            X = X.astype(np.float32)
        if sparse.issparse(X):
            if self.center_sparse:
                means = []
                vars = []

                # This only works for csc matrices...
                for i in range(X.shape[1]):
                    if X.indptr[i] == X.indptr[i + 1]:
                        means.append(0)
                        vars.append(1)
                    else:
                        vars.append(
                            X.data[X.indptr[i]:X.indptr[i + 1]].var())
                        # If the variance is 0, set all occurrences of this
                        # feature to 1
                        means.append(
                            X.data[X.indptr[i]:X.indptr[i + 1]].mean())
                        if 0.0000001 >= vars[-1] >= -0.0000001:
                            means[-1] -= 1

                self.std_ = np.sqrt(np.array(vars))
                self.std_[np.array(vars) == 0.0] = 1.0
                self.mean_ = np.array(means)

                return self
            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            else:
                self.mean_ = None

            if self.with_std:
                var = mean_variance_axis(X, axis=0)[1]
                self.std_ = np.sqrt(var)
                self.std_[var == 0.0] = 1.0
            else:
                self.std_ = None
            return self
        else:
            self.mean_, self.std_ = _mean_and_std(
                X, axis=0, with_mean=self.with_mean, with_std=self.with_std)
            return self
Example #20
 def predict_presence_absence_evidences(self, X):
     
     X = check_array(X, accept_sparse="csr")
     
     absence_log_prob_ = np.log(1 - np.exp(self.feature_log_prob_))
     
     presence_log_ratios = self.feature_log_prob_[1] - self.feature_log_prob_[0]
     absence_log_ratios = absence_log_prob_[1] - absence_log_prob_[0]
     
     presence_neg_log_ratios = presence_log_ratios * (presence_log_ratios<0)
     presence_pos_log_ratios = presence_log_ratios * (presence_log_ratios>0)
     if issparse(X):
         p_neg_evi = X * presence_neg_log_ratios
         p_pos_evi = X * presence_pos_log_ratios
     else:
         p_neg_evi = np.dot(X, presence_neg_log_ratios)
         p_pos_evi = np.dot(X, presence_pos_log_ratios)
     
     absence_neg_log_ratios = absence_log_ratios * (absence_log_ratios<0)
     absence_pos_log_ratios = absence_log_ratios * (absence_log_ratios>0)
     default_a_neg_evi = absence_neg_log_ratios.sum()
     default_a_pos_evi = absence_pos_log_ratios.sum()
     if issparse(X):
         a_neg_evi = -(X * absence_neg_log_ratios) + default_a_neg_evi
         a_pos_evi = -(X * absence_pos_log_ratios) + default_a_pos_evi
     else:
         a_neg_evi = -np.dot(X, absence_neg_log_ratios) + default_a_neg_evi
         a_pos_evi = -np.dot(X, absence_pos_log_ratios) + default_a_pos_evi
     
     return p_neg_evi, p_pos_evi, a_neg_evi, a_pos_evi
Example #21
    def transform(self, X):
        """ A reference implementation of a transform function.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_transformed : array of int of shape = [n_samples, n_features]
            The array containing the element-wise square roots of the values
            in `X`
        """
        # Check if fit has been called
        check_is_fitted(self, ['input_shape_'])

        # Input validation
        X = check_array(X)

        # Check that the input is of the same shape as the one passed
        # during fit.
        if X.shape != self.input_shape_:
            raise ValueError('Shape of input is different from what was seen '
                             'in `fit`')
        return np.sqrt(X)
Example #22
def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None):

    y = np.asarray(y)
    if y.ndim != 1:
        raise ValueError("expected y of shape (n_samples,), got %r"
                         % (y.shape,))

    Xval = check_array(X, accept_sparse='csr')
    if Xval.shape[0] != y.shape[0]:
        raise ValueError("X.shape[0] and y.shape[0] should be the same, got"
                         " %r and %r instead." % (Xval.shape[0], y.shape[0]))

    # We had some issues with CSR matrices with unsorted indices (e.g. #1501),
    # so sort them here, but first make sure we don't modify the user's X.
    # TODO We can do this cheaper; sorted_indices copies the whole matrix.
    if Xval is X and hasattr(Xval, "sorted_indices"):
        X = Xval.sorted_indices()
    else:
        X = Xval
        if hasattr(X, "sort_indices"):
            X.sort_indices()

    if query_id is not None:
        query_id = np.asarray(query_id)
        if query_id.shape[0] != y.shape[0]:
            raise ValueError("expected query_id of shape (n_samples,), got %r"
                             % (query_id.shape,))

    one_based = not zero_based

    if hasattr(f, "write"):
        _dump_svmlight(X, y, f, one_based, comment, query_id)
    else:
        with open(f, "wb") as f:
            _dump_svmlight(X, y, f, one_based, comment, query_id)
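For reference, scikit-learn ships a public function with the same name and a compatible signature; a hedged usage sketch (file name and data are illustrative):

import numpy as np
from sklearn.datasets import dump_svmlight_file

X = np.array([[1.0, 0.0], [0.0, 2.0]])
y = np.array([0, 1])
dump_svmlight_file(X, y, "toy.svmlight", zero_based=True)   # writes the sparse text format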
Example #23
    def transform(self, X, y=None, copy=None):
        """Perform standardization by centering and scaling

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        check_is_fitted(self, 'std_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, accept_sparse="csc", ensure_2d=False)
        if warn_if_not_float(X, estimator=self):
            X = X.astype(np.float)
        if sparse.issparse(X):
            if self.center_sparse:
                for i in range(X.shape[1]):
                    X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]

            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")

            else:
                pass

            if self.std_ is not None:
                inplace_column_scale(X, 1 / self.std_)
        else:
            if self.with_mean:
                X -= self.mean_
            if self.with_std:
                X /= self.std_
        return X
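A dense-path sketch of the centring and scaling applied above, written with plain NumPy (toy statistics assumed; the fitted mean_ and std_ would normally come from fit):

import numpy as np

X = np.array([[1.0, 10.0], [3.0, 30.0]])
mean_, std_ = X.mean(axis=0), X.std(axis=0)
X_scaled = (X - mean_) / std_
print(X_scaled)    # each column now has zero mean and unit variance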
Example #24
def ttest(X, y):
    X = check_array(X, accept_sparse='csr')
    if np.any((X.data if issparse(X) else X) < 0):
        raise ValueError("Input X must be non-negative.")

    Y = MultiLabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)
    negY = 1 - Y
    labelNum = Y.shape[1]
#     sampleNum = Y.shape[0]
    featureNum = X.shape[1]
    t = []
    prob = []
    for i in range(featureNum):
        values = X[:,i].T.toarray()[0]
        ti = 0
        probi = 0
        for j in range(labelNum):
            observed = values * Y[:,j]
            notObserved = values * negY[:,j]
            (res0, res1) = scipy.stats.ttest_ind(observed, notObserved)
            ti = ti + res0
            probi = probi + res1
        t.append(ti)
        prob.append(probi)
    t = np.asarray(t)
    prob = np.asarray(prob)
    return t, prob
Example #25
    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                            algorithm=self.algorithm,
                                            leaf_size=self.leaf_size,
                                            metric=self.metric,
                                            p=self.p,
                                            metric_params=self.metric_params,
                                            contamination=self.contamination,
                                            n_jobs=self.n_jobs)
        self.detector_.fit(X=X, y=y)

        # Invert decision_scores_. Outliers come with higher outlier scores
        self.decision_scores_ = invert_order(
            self.detector_.negative_outlier_factor_)
        self._process_decision_scores()
        return self
    def predict_proba(self, X):
        """Predict probability for each possible outcome.

        Compute the probability estimates for each single sample in X
        and each possible outcome seen during training (categorical
        distribution).

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]

        Returns
        -------
        probabilities : array, shape = [n_samples, n_classes]
            Normalized probability distributions across
            class labels
        """
        check_is_fitted(self, 'X_')

        X_2d = check_array(X, accept_sparse=['csc', 'csr', 'coo', 'dok',
                                             'bsr', 'lil', 'dia'])
        weight_matrices = self._get_kernel(self.X_, X_2d)
        if self.kernel == 'knn':
            probabilities = []
            for weight_matrix in weight_matrices:
                ine = np.sum(self.label_distributions_[weight_matrix], axis=0)
                probabilities.append(ine)
            probabilities = np.array(probabilities)
        else:
            weight_matrices = weight_matrices.T
            probabilities = np.dot(weight_matrices, self.label_distributions_)
        normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T
        probabilities /= normalizer
        return probabilities
Example #27
    def fit(self, X, y=None):
        """Fit the model with ``X``.
        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        Returns
        -------
        self : object
            Returns the instance itself.
        """

        X = check_array(X, dtype=np.float)
        L, S, (U, s, Vt), self.n_iter_ = rpca(X, self.lam, self.mu,
                                              self.max_iter, self.eps_primal,
                                              self.eps_dual, self.rho,
                                              self.initial_sv, self.max_mu,
                                              self.verbose)
        self.low_rank_ = L
        r = np.count_nonzero(s)
        self.n_components_ = r
        self.components_ = Vt[:r]

        return self
Example #28
File: slm.py Project: NICTA/revrand
    def predict_moments(self, X):
        """
        Full predictive distribution from Bayesian linear regression.

        Parameters
        ----------
        X : ndarray
            (N*,d) array query input dataset (N* samples, d dimensions).

        Returns
        -------
        Ey : ndarray
            The expected value of y* for the query inputs, X* of shape (N*,).
        Vy : ndarray
            The expected variance of y* for the query inputs, X* of shape
            (N*,).
        """
        check_is_fitted(self, ['var_', 'regularizer_', 'weights_',
                               'covariance_', 'hypers_'])
        X = check_array(X)

        Phi = self.basis.transform(X, *atleast_list(self.hypers_))
        Ey = Phi.dot(self.weights_)
        Vf = (Phi.dot(self.covariance_) * Phi).sum(axis=1)

        return Ey, Vf + self.var_
Example #29
    def fit(self, X, y):
        X = check_array(X)

        random_state = check_random_state(self.random_state)

        self.classes_, y_reverse = np.unique(y, return_inverse=True)

        if np.nan in self.classes_:
            raise ValueError("NaN class not supported.")

        # build models
        models = {}
        for c_idx, c_value in enumerate(self.classes_):
            X_class = X[y == c_value]
            a_sample_size = min(len(X_class), self.sample_size)
            c_models = []
            for i in range(self.n_models):
                # resample
                X_sample = X_class[random_state.choice(len(X_class), a_sample_size)]
                c_models.append(self.build_for_class(random_state, X_sample))
            models[c_value] = np.array(c_models)

        weights = self.fit_weights(random_state, models, X, y_reverse)

        self.models_ = models
        self.weights_ = weights

        return self
Example #30
    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['components_', 'w_components_'])

        X = check_array(X)
        if self.standardization:
            X = self.scaler_.transform(X)

        return np.sum(
            cdist(X, self.selected_components_) / self.selected_w_components_,
            axis=1).ravel()
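A toy sketch of the distance-based score computed above (the components and weights are made up):

import numpy as np
from scipy.spatial.distance import cdist

X = np.array([[0.0, 0.0], [3.0, 4.0]])
components = np.array([[0.0, 0.0], [1.0, 1.0]])
w = np.array([1.0, 2.0])
scores = np.sum(cdist(X, components) / w, axis=1)
print(scores)    # one anomaly score per sample; larger means more anomalous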
Example #31
 def preprocess_data(self, X):
     X = check_array(X,
                     dtype=[np.float64, np.float32],
                     ensure_min_samples=1)
     X2 = row_norms(X, squared=True)
     return X, X2
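row_norms(X, squared=True) above is equivalent to summing squared entries per row; a one-line sketch (toy data assumed):

import numpy as np

X = np.array([[3.0, 4.0], [1.0, 2.0]])
print((X ** 2).sum(axis=1))    # [25.  5.]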
Example #32
 def fit(self, X, y=None):
     self.X_shape_ = check_array(X).shape
     return self
Example #33
def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
                 gamma, init, n_init, verbose):
    """k-prototypes algorithm"""

    if sparse.issparse(X):
        raise TypeError("k-prototypes does not support sparse data.")

    if categorical is None or not categorical:
        raise NotImplementedError(
            "No categorical data selected, effectively doing k-means. "
            "Present a list of categorical columns, or use scikit-learn's "
            "KMeans instead."
        )
    if isinstance(categorical, int):
        categorical = [categorical]
    assert len(categorical) != X.shape[1], \
        "All columns are categorical, use k-modes instead of k-prototypes."
    assert max(categorical) < X.shape[1], \
        "Categorical index larger than number of columns."

    ncatattrs = len(categorical)
    nnumattrs = X.shape[1] - ncatattrs
    npoints = X.shape[0]
    assert n_clusters <= npoints, "More clusters than data points?"

    Xnum, Xcat = _split_num_cat(X, categorical)
    Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)

    # Convert the categorical values in Xcat to integers for speed.
    # Based on the unique values in Xcat, we can make a mapping to achieve this.
    Xcat, enc_map = encode_features(Xcat)

    # Are there more n_clusters than unique rows? Then set the unique
    # rows as initial values and skip iteration.
    unique = get_unique_rows(X)
    n_unique = unique.shape[0]
    if n_unique <= n_clusters:
        max_iter = 0
        n_init = 1
        n_clusters = n_unique
        init = list(_split_num_cat(unique, categorical))
        init[1], _ = encode_features(init[1], enc_map)

    # Estimate a good value for gamma, which determines the weighting of
    # categorical values in clusters (see Huang [1997]).
    if gamma is None:
        gamma = 0.5 * Xnum.std()

    all_centroids = []
    all_labels = []
    all_costs = []
    all_n_iters = []
    for init_no in range(n_init):

        # For numerical part of initialization, we don't have a guarantee
        # that there is not an empty cluster, so we need to retry until
        # there is none.
        init_tries = 0
        while True:
            init_tries += 1
            # _____ INIT _____
            if verbose:
                print("Init: initializing centroids")
            if isinstance(init, str) and init == 'Huang':
                centroids = kmodes.init_huang(Xcat, n_clusters, cat_dissim)
            elif isinstance(init, str) and init == 'Cao':
                centroids = kmodes.init_cao(Xcat, n_clusters, cat_dissim)
            elif isinstance(init, str) and init == 'random':
                seeds = np.random.choice(range(npoints), n_clusters)
                centroids = Xcat[seeds]
            elif isinstance(init, list):
                # Make sure inits are 2D arrays.
                init = [np.atleast_2d(cur_init).T if len(cur_init.shape) == 1
                        else cur_init
                        for cur_init in init]
                assert init[0].shape[0] == n_clusters, \
                    "Wrong number of initial numerical centroids in init " \
                    "({}, should be {}).".format(init[0].shape[0], n_clusters)
                assert init[0].shape[1] == nnumattrs, \
                    "Wrong number of numerical attributes in init ({}, should be {})."\
                    .format(init[0].shape[1], nnumattrs)
                assert init[1].shape[0] == n_clusters, \
                    "Wrong number of initial categorical centroids in init ({}, " \
                    "should be {}).".format(init[1].shape[0], n_clusters)
                assert init[1].shape[1] == ncatattrs, \
                    "Wrong number of categorical attributes in init ({}, should be {})."\
                    .format(init[1].shape[1], ncatattrs)
                centroids = [np.asarray(init[0], dtype=np.float64),
                             np.asarray(init[1], dtype=np.uint8)]
            else:
                raise NotImplementedError("Initialization method not supported.")

            if not isinstance(init, list):
                # Numerical is initialized by drawing from normal distribution,
                # categorical following the k-modes methods.
                meanx = np.mean(Xnum, axis=0)
                stdx = np.std(Xnum, axis=0)
                centroids = [
                    meanx + np.random.randn(n_clusters, nnumattrs) * stdx,
                    centroids
                ]

            if verbose:
                print("Init: initializing clusters")
            membship = np.zeros((n_clusters, npoints), dtype=np.uint8)
            # Keep track of the sum of attribute values per cluster so that we
            # can do k-means on the numerical attributes.
            cl_attr_sum = np.zeros((n_clusters, nnumattrs), dtype=np.float64)
            # cl_attr_freq is a list of lists with dictionaries that contain
            # the frequencies of values per cluster and attribute.
            cl_attr_freq = [[defaultdict(int) for _ in range(ncatattrs)]
                            for _ in range(n_clusters)]
            for ipoint in range(npoints):
                # Initial assignment to clusters
                clust = np.argmin(
                    num_dissim(centroids[0], Xnum[ipoint]) +
                    gamma * cat_dissim(centroids[1], Xcat[ipoint])
                )
                membship[clust, ipoint] = 1
                # Count attribute values per cluster.
                for iattr, curattr in enumerate(Xnum[ipoint]):
                    cl_attr_sum[clust, iattr] += curattr
                for iattr, curattr in enumerate(Xcat[ipoint]):
                    cl_attr_freq[clust][iattr][curattr] += 1

            # If no empty clusters, then consider initialization finalized.
            if membship.sum(axis=1).min() > 0:
                break

            if init_tries == MAX_INIT_TRIES:
                # Could not get rid of empty clusters. Randomly
                # initialize instead.
                init = 'random'
            elif init_tries == RAISE_INIT_TRIES:
                raise ValueError(
                    "Clustering algorithm could not initialize. "
                    "Consider assigning the initial clusters manually."
                )

        # Perform an initial centroid update.
        for ik in range(n_clusters):
            for iattr in range(nnumattrs):
                centroids[0][ik, iattr] = \
                    cl_attr_sum[ik, iattr] / sum(membship[ik, :])
            for iattr in range(ncatattrs):
                centroids[1][ik, iattr] = \
                    get_max_value_key(cl_attr_freq[ik][iattr])

        # _____ ITERATION _____
        if verbose:
            print("Starting iterations...")
        itr = 0
        converged = False
        cost = np.Inf
        while itr <= max_iter and not converged:
            itr += 1
            centroids, moves = _k_prototypes_iter(Xnum, Xcat, centroids,
                                                  cl_attr_sum, cl_attr_freq,
                                                  membship, num_dissim, cat_dissim, gamma)

            # All points seen in this iteration
            labels, ncost = _labels_cost(Xnum, Xcat, centroids,
                                         num_dissim, cat_dissim, gamma)
            converged = (moves == 0) or (ncost >= cost)
            cost = ncost
            if verbose:
                print("Run: {}, iteration: {}/{}, moves: {}, ncost: {}"
                      .format(init_no + 1, itr, max_iter, moves, ncost))

        # Store results of current run.
        all_centroids.append(centroids)
        all_labels.append(labels)
        all_costs.append(cost)
        all_n_iters.append(itr)

    best = np.argmin(all_costs)
    if n_init > 1 and verbose:
        print("Best run was number {}".format(best + 1))

    # Note: return gamma in case it was automatically determined.
    return all_centroids[best], enc_map, all_labels[best], \
        all_costs[best], all_n_iters[best], gamma
Example #34
 def predict(self, X, threshold=0.5):
     check_is_fitted(self)
     X = check_array(X)
     return self.predict_proba(X) >= threshold
Example #35
    def is_stationary(self, x):
        """Test whether the time series is stationary.

        Parameters
        ----------
        x : array-like, shape=(n_samples,)
            The time series vector.
        """
        if not self._base_case(x):
            return np.nan, False

        # ensure vector
        x = column_or_1d(
            check_array(x, ensure_2d=False, dtype=DTYPE,
                        force_all_finite=True))  # type: np.ndarray

        # embed the vector. This is some funkiness that goes on in the R
        # code... basically, make a matrix where the column (rows if not T)
        # are lagged windows of x
        z = self._embed(x, 2)
        yt = z[0, :]
        yt1 = z[1, :]  # type: np.ndarray

        # fit a linear model to a predictor matrix
        n = yt.shape[0]
        tt = (np.arange(n) + 1) - (n / 2.0)
        X = np.array([np.ones(n), tt, yt1]).T
        res = LinearRegression().fit(X, yt)  # lm(yt ~ 1 + tt + yt1)
        coef = res.coef_

        # check for singularities - do we want to do this??? in the R code,
        # it happens. but the very same lm in the R code is rank 3, and here
        # it is rank 2. Should we just ignore?...
        # if res.rank_ < 3:
        #     raise ValueError('singularities in regression')

        u = yt - res.predict(X)  # residuals
        ssqru = (u * u).sum() / float(n)

        scalar = 12 if not self.lshort else 4
        l = int(np.trunc(scalar * np.power(n / 100.0, 0.25)))
        ssqrtl = C_tseries_pp_sum(u, n, l, ssqru)

        # define trm vals
        n2 = n * n
        syt11n = (yt1 * (np.arange(n) + 1)).sum()  # sum(yt1*(1:n))
        trm1 = n2 * (n2 - 1) * (yt1**2).sum() / 12.0

        # R code: # n*sum(yt1*(1:n))^2
        trm2 = n * (syt11n**2)

        # R code: n*(n+1)*sum(yt1*(1:n))*sum(yt1)
        trm3 = n * (n + 1) * syt11n * yt1.sum()
        trm4 = (n * (n + 1) * (2 * n + 1) * (yt1.sum()**2)) / 6.0
        dx = trm1 - trm2 + trm3 - trm4

        # if self.typ == 'alpha':
        alpha = coef[2]  # it's the last col...
        STAT = n * (alpha - 1) - (n**6) / (24.0 * dx) * (ssqrtl - ssqru)

        table = -np.array([
            c(22.5, 25.7, 27.4, 28.4, 28.9, 29.5),
            c(19.9, 22.4, 23.6, 24.4, 24.8, 25.1),
            c(17.9, 19.8, 20.7, 21.3, 21.5, 21.8),
            c(15.6, 16.8, 17.5, 18.0, 18.1, 18.3),
            c(3.66, 3.71, 3.74, 3.75, 3.76, 3.77),
            c(2.51, 2.60, 2.62, 2.64, 2.65, 2.66),
            c(1.53, 1.66, 1.73, 1.78, 1.78, 1.79),
            c(0.43, 0.65, 0.75, 0.82, 0.84, 0.87)
        ]).T

        tablen = table.shape[1]
        tableT = c(25, 50, 100, 250, 500, 100000).astype(DTYPE)
        tablep = c(0.01, 0.025, 0.05, 0.10, 0.90, 0.95, 0.975, 0.99)
        tableipl = np.zeros(tablen)

        for i in range(tablen):
            _, pval = approx(tableT, table[:, i], xout=n, rule=2)
            tableipl[i] = pval

        # make sure to do 1 - x...
        _, interpol = approx(tableipl, tablep, xout=STAT, rule=2)
        pval = 1 - interpol[0]

        # in the R code, here is where the P value warning is tested again...
        return pval, pval < self.alpha
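The approx(..., rule=2) calls above behave like linear interpolation clipped at the table boundaries; a rough NumPy analogue (the values are the first column of the critical-value table, used only for illustration):

import numpy as np

xs = np.array([25., 50., 100., 250., 500., 100000.])
ys = np.array([22.5, 25.7, 27.4, 28.4, 28.9, 29.5])
print(np.interp(120.0, xs, ys))    # interpolated critical value at n=120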
Example #36
    def _validate_train_parms(self, train_set, train_lab, classes=None):
        random_state = validation.check_random_state(self.random_state)
        train_set, train_lab = validation.check_X_y(train_set,
                                                    train_lab.ravel())

        if (self.initial_fit):
            if (classes):
                self.classes_ = np.asarray(classes)
                self.protos_initialized = np.zeros(self.classes_.size)
            else:
                self.classes_ = unique_labels(train_lab)
                self.protos_initialized = np.zeros(self.classes_.size)

        nb_classes = len(self.classes_)
        nb_samples, nb_features = train_set.shape  # nb_samples unused

        # set prototypes per class
        if isinstance(self.prototypes_per_class, int):
            if self.prototypes_per_class < 0 or not isinstance(
                    self.prototypes_per_class, int):
                raise ValueError("prototypes_per_class must be a positive int")
            # nb_ppc = number of protos per class
            nb_ppc = np.ones([nb_classes],
                             dtype='int') * self.prototypes_per_class
        else:
            nb_ppc = validation.column_or_1d(
                validation.check_array(self.prototypes_per_class,
                                       ensure_2d=False,
                                       dtype='int'))
            if nb_ppc.min() <= 0:
                raise ValueError(
                    "values in prototypes_per_class must be positive")
            if nb_ppc.size != nb_classes:
                raise ValueError("length of prototypes per class"
                                 " does not fit the number of classes"
                                 "classes=%d"
                                 "length=%d" % (nb_classes, nb_ppc.size))

        # initialize prototypes
        if self.initial_prototypes is None:
            if self.initial_fit:
                self.w_ = np.empty([np.sum(nb_ppc), nb_features],
                                   dtype=np.double)
                self.c_w_ = np.empty([nb_ppc.sum()], dtype=self.classes_.dtype)
            pos = 0
            for actClassIdx in range(len(self.classes_)):
                actClass = self.classes_[actClassIdx]
                nb_prot = nb_ppc[actClassIdx]  # nb_ppc: prototypes per class
                if (self.protos_initialized[actClassIdx] == 0
                        and actClass in unique_labels(train_lab)):
                    mean = np.mean(train_set[train_lab == actClass, :], 0)
                    self.w_[pos:pos + nb_prot] = mean + (
                        random_state.rand(nb_prot, nb_features) * 2 - 1)
                    if math.isnan(self.w_[pos, 0]):
                        print('Prototype is NaN: ', actClass)
                        self.protos_initialized[actClassIdx] = 0
                    else:
                        self.protos_initialized[actClassIdx] = 1

                    self.c_w_[pos:pos + nb_prot] = actClass
                pos += nb_prot
        else:
            x = validation.check_array(self.initial_prototypes)
            self.w_ = x[:, :-1]
            self.c_w_ = x[:, -1]
            if self.w_.shape != (np.sum(nb_ppc), nb_features):
                raise ValueError("the initial prototypes have wrong shape\n"
                                 "found=(%d,%d)\n"
                                 "expected=(%d,%d)" %
                                 (self.w_.shape[0], self.w_.shape[1],
                                  nb_ppc.sum(), nb_features))
            if set(self.c_w_) != set(self.classes_):
                raise ValueError(
                    "prototype labels and test data classes do not match\n"
                    "classes={}\n"
                    "prototype labels={}\n".format(self.classes_, self.c_w_))
        if self.initial_fit:
            # Next two lines are Init for Adadelta/RMSprop
            self.squared_mean_gradient = np.zeros_like(self.w_)
            self.squared_mean_step = np.zeros_like(self.w_)
            self.initial_fit = False

        return train_set, train_lab, random_state
Example #37
    def kneighbors(
            self,
            X=None,
            n_candidates=None,
            return_distance=True
    ) -> Union[Tuple[np.array, np.array], np.array]:
        """ Retrieve k nearest neighbors.

        Parameters
        ----------
        X: np.array or None, optional, default = None
            Query objects. If None, search among the indexed objects.
        n_candidates: int or None, optional, default = None
            Number of neighbors to retrieve.
            If None, use the value passed during construction.
        return_distance: bool, default = True
            If return_distance, will return distances and indices to neighbors.
            Else, only return the indices.
        """
        check_is_fitted(self, 'index_')
        if X is not None:
            X = check_array(X)

        n_test = self.n_samples_fit_ if X is None else X.shape[0]
        dtype = self.X_dtype_ if X is None else X.dtype

        if n_candidates is None:
            n_candidates = self.n_candidates
        n_candidates = check_n_candidates(n_candidates)

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        if X is None:
            n_neighbors = n_candidates + 1
            start = 1
        else:
            n_neighbors = n_candidates
            start = 0

        # If fewer candidates than required are found for a query,
        # we save index=-1 and distance=NaN
        neigh_ind = -np.ones((n_test, n_candidates), dtype=np.int32)
        if return_distance:
            neigh_dist = np.empty_like(neigh_ind, dtype=dtype) * np.nan

        if isinstance(self.index_, str):
            index = ngtpy.Index(self.index_)
        else:
            index = self.index_

        disable_tqdm = not self.verbose
        if X is None:
            for i in tqdm(
                    range(n_test),
                    desc='Query NNG',
                    disable=disable_tqdm,
            ):
                query = index.get_object(i)
                response = index.search(
                    query=query,
                    size=n_neighbors,
                    with_distance=return_distance,
                    epsilon=self.epsilon,
                )
                if return_distance:
                    ind, dist = [np.array(arr) for arr in zip(*response)]
                else:
                    ind = response
                ind = ind[start:]
                neigh_ind[i, :len(ind)] = ind
                if return_distance:
                    dist = dist[start:]
                    neigh_dist[i, :len(dist)] = dist
        else:  # if X was provided
            for i, x in tqdm(
                    enumerate(X),
                    desc='Query NNG',
                    disable=disable_tqdm,
            ):
                response = index.search(
                    query=x,
                    size=n_neighbors,
                    with_distance=return_distance,
                    epsilon=self.epsilon,
                )
                if return_distance:
                    ind, dist = [np.array(arr) for arr in zip(*response)]
                else:
                    ind = response
                ind = ind[start:]
                neigh_ind[i, :len(ind)] = ind
                if return_distance:
                    dist = dist[start:]
                    neigh_dist[i, :len(dist)] = dist

        if return_distance and self.metric == 'sqeuclidean':
            neigh_dist **= 2

        if return_distance:
            return neigh_dist, neigh_ind
        else:
            return neigh_ind
Example #38
 def fit(self, X, y=None):
     X = check_array(X)
     return self
 def transform(self, X):
     X = check_array(X)
     return X
 def predict(self, X):
     X = check_array(X)
     return np.ones(X.shape[0])
 def predict(self, X):
     check_is_fitted(self)
     X = check_array(X)
     return np.ones(X.shape[0])
Example #42
 def transform(self, X, y=None):
     check_is_fitted(self)
     X = check_array(X)
     return X
Example #43
 def predict(self, X):
     check_is_fitted(self)
     X = check_array(X)
     return np.ones(shape=(X.shape[0],)) * self._mean
Example #44
def plot_partial_corrcoef(partial_corrcoef,
                          ax=None,
                          cbar=True,
                          figsize=None,
                          filename=None,
                          title='Partial correlation',
                          **kwargs):
    """Plot the partial correlation coefficient matrix.

    Parameters
    ----------
    partial_corrcoef : array-like of shape (n_features, n_features)
        Partial correlation coefficient matrix.

    ax : matplotlib Axes, default None
        Target axes instance.

    cbar : bool, default True.
        If True, draw a colorbar.

    figsize : tuple, default None
        Tuple denoting figure size of the plot.

    filename : str, default None
        If provided, save the current figure.

    title : string, default 'Partial correlation'
        Axes title. To disable, pass None.

    **kwargs : dict
        Other keywords passed to ``ax.pcolormesh``.

    Returns
    -------
    ax : matplotlib Axes
        Axes on which the plot was drawn.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from kenchi.plotting import plot_partial_corrcoef
    >>> from sklearn.datasets import make_sparse_spd_matrix
    >>> A = make_sparse_spd_matrix(dim=20, norm_diag=True, random_state=0)
    >>> plot_partial_corrcoef(A) # doctest: +ELLIPSIS
    <matplotlib.axes._subplots.AxesSubplot object at 0x...>
    >>> plt.show() # doctest: +SKIP

    .. figure:: images/plot_partial_corrcoef.png
    """

    import matplotlib.pyplot as plt
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    partial_corrcoef = check_array(partial_corrcoef)
    partial_corrcoef = check_symmetric(partial_corrcoef, raise_exception=True)

    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    if title is not None:
        ax.set_title(title)

    # Add the pcolormesh kwargs here
    kwargs.setdefault('cmap', 'RdBu')
    kwargs.setdefault('edgecolors', 'white')
    kwargs.setdefault('vmin', -1.)
    kwargs.setdefault('vmax', 1.)

    # Draw the heatmap
    mesh = ax.pcolormesh(np.ma.masked_equal(partial_corrcoef, 0.), **kwargs)

    ax.set_aspect('equal')
    ax.set_facecolor('grey')

    # Invert the y axis to show the plot in matrix form
    ax.invert_yaxis()

    if cbar:
        # Create an axes on the right side of ax
        divider = make_axes_locatable(ax)
        cax = divider.append_axes('right', '5%', pad=0.1)

        ax.get_figure().colorbar(mesh, cax=cax)

    if filename is not None:
        ax.get_figure().savefig(filename)

    return ax
Example #45
 def predict(self, X):
     if not hasattr(self, 'coef_'):
         raise CorrectNotFittedError("estimator is not fitted yet")
     X = check_array(X)
     return np.ones(X.shape[0])
Example #46
    def fit(self, X, y=None) -> NNG:
        """ Build the ngtpy.Index and insert data from X.

        Parameters
        ----------
        X: np.array
            Data to be indexed
        y: any
            Ignored

        Returns
        -------
        self: NNG
            An instance of NNG with a built index
        """
        if y is None:
            X = check_array(X)
        else:
            X, y = check_X_y(X, y)
            self.y_train_ = y

        self.n_samples_fit_ = X.shape[0]
        self.n_features_ = X.shape[1]
        self.X_dtype_ = X.dtype

        # Map common distance names to names used by ngt
        try:
            self.effective_metric_ = NNG.internal_distance_type[self.metric]
        except KeyError:
            self.effective_metric_ = self.metric
        if self.effective_metric_ not in NNG.valid_metrics:
            raise ValueError(
                f'Unknown distance/similarity measure: {self.effective_metric_}. '
                f'Please use one of: {NNG.valid_metrics}.')

        # Set up a directory to save the index to
        prefix = 'skhubness_'
        suffix = '.anng'
        if self.index_dir in ['auto']:
            index_path = create_tempfile_preferably_in_dir(
                prefix=prefix, suffix=suffix, directory='/dev/shm')
            logging.warning(
                f'The index will be stored in {index_path}. '
                f'It will NOT be deleted automatically, when this instance is destructed.'
            )
        elif isinstance(self.index_dir, str):
            index_path = create_tempfile_preferably_in_dir(
                prefix=prefix, suffix=suffix, directory=self.index_dir)
        elif self.index_dir is None:
            index_path = create_tempfile_preferably_in_dir(prefix=prefix,
                                                           suffix=suffix)
        else:
            raise TypeError(
                f'NNG requires to write an index to the filesystem. '
                f'Please provide a valid path with parameter `index_dir`.')

        # Create the ANNG index, insert data
        ngtpy.create(
            path=index_path,
            dimension=self.n_features_,
            edge_size_for_creation=self.edge_size_for_creation,
            edge_size_for_search=self.edge_size_for_search,
            distance_type=self.effective_metric_,
        )
        index_obj = ngtpy.Index(index_path)
        index_obj.batch_insert(X, num_threads=self.n_jobs)
        index_obj.save()

        # Convert ANNG to ONNG
        if self.optimize:
            optimizer = ngtpy.Optimizer()
            optimizer.set(num_of_outgoings=self.num_outgoing,
                          num_of_incomings=self.num_incoming)
            index_path_onng = str(
                pathlib.Path(index_path).with_suffix('.onng'))
            optimizer.execute(index_path, index_path_onng)
            index_path = index_path_onng

        # Keep index in memory or store in path
        if self.index_dir is None:
            self.index_ = index_obj
        else:
            # index_obj.save()
            self.index_ = index_path

        return self
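A minimal usage sketch for the fit method above. The import path and constructor arguments are assumptions inferred from the attributes the method reads (metric, index_dir, n_jobs, ...), not a documented signature.

# from skhubness.neighbors import NNG  # assumed import path
import numpy as np

X = np.random.RandomState(0).rand(200, 16).astype(np.float32)
ann = NNG(metric='euclidean', index_dir=None)  # index_dir=None keeps the index in memory
ann.fit(X)
print(ann.n_samples_fit_, ann.n_features_)     # 200 16
# neigh_dist, neigh_ind = ann.kneighbors(X[:5])  # hypothetical query call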
Exemplo n.º 47
0
    def fit(self, X, y=None):

        # Covariance does not make sense for a single feature
        x = check_array(X, ensure_min_features=2, estimator=self)
        n, p = x.shape

        kf = KFold(n_splits=self.folds,
                   random_state=self.random_state,
                   shuffle=self.shuffle)
        lam1n, lam2n, lam1type, lam2type = self._candidate()

        self.res = []

        for i in range(lam1n):
            if lam1type < 3:
                lam1 = self.lam1s
                self.lam1 = lam1
            else:
                lam1 = self.lam1s[i]
            lam1val = []

            for j in range(lam2n):
                if lam2type < 3:
                    lam2 = self.lam2s
                    self.lam2 = lam2
                else:
                    lam2 = self.lam2s[j]
                lam2val = []

                for train_index, test_index in kf.split(x):
                    sam_cov = np.cov(x[train_index], rowvar=False)
                    omega = graphical_concord(sam_cov=sam_cov,
                                              lam1=lam1,
                                              lam2=lam2,
                                              method=self.method,
                                              tol=self.tol,
                                              maxit=self.maxit,
                                              steptype=self.steptype,
                                              assume_scaled=self.assume_scaled)
                    cost = self._predrisk(omega, x[test_index])
                    lam2val.append(cost)

                lam1val.append(np.mean(lam2val))

            self.res.append(lam1val)

        idx = np.argwhere(self.res == np.min(self.res))
        if self.lam1 is None:
            self.lam1 = self.lam1s[idx[0][0]]
        if self.lam2 is None:
            self.lam2 = self.lam2s[idx[0][1]]

        sam_cov = np.cov(x, rowvar=False)
        self.omega = graphical_concord(sam_cov,
                                       lam1=self.lam1,
                                       lam2=self.lam2,
                                       method=self.method,
                                       tol=self.tol,
                                       maxit=self.maxit,
                                       steptype=self.steptype,
                                       assume_scaled=self.assume_scaled)

        return self
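The grid search above stores one mean CV risk per (lam1, lam2) pair and then reads the best pair off the minimum. A small standalone illustration of that selection step (numbers invented):

import numpy as np

lam1s = np.array([0.01, 0.1, 1.0])
lam2s = np.array([0.0, 0.5])
res = np.array([[0.9, 0.7],   # res[i, j]: mean CV risk for (lam1s[i], lam2s[j])
                [0.4, 0.6],
                [0.8, 1.1]])

idx = np.argwhere(res == np.min(res))   # positions attaining the minimum
lam1, lam2 = lam1s[idx[0][0]], lam2s[idx[0][1]]
print(lam1, lam2)                       # 0.1 0.0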
Exemplo n.º 48
0
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        sample_weight : array-like, shape (n_samples,), optional
            Array of weights assigned to individual samples, typically the
            transaction amount in the case of transaction data. Used to grow
            regression trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        if n_classes < 2:
            raise ValueError("This method needs samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % self.classes_[0])

        if not isinstance(self.max_depth_duplication, int) \
                and self.max_depth_duplication is not None:
            raise ValueError("max_depth_duplication should be an integer"
                             )
        if not set(self.classes_) == set([0, 1]):
            warn("Found labels %s. This method assumes target class to be"
                 " labeled as 1 and normal data to be labeled as 0. Any label"
                 " different from 0 will be considered as being from the"
                 " target class."
                 % set(self.classes_))
            y = (y > 0)

        # ensure that max_samples is in [1, n_samples]:
        n_samples = X.shape[0]

        if isinstance(self.max_samples, six.string_types):
            raise ValueError('max_samples (%s) is not supported. '
                             'Valid choices are: "auto", int or '
                             'float' % self.max_samples)

        elif isinstance(self.max_samples, INTEGER_TYPES):
            if self.max_samples > n_samples:
                warn("max_samples (%s) is greater than the "
                     "total number of samples (%s). max_samples "
                     "will be set to n_samples for estimation."
                     % (self.max_samples, n_samples))
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # float
            if not (0. < self.max_samples <= 1.):
                raise ValueError("max_samples must be in (0, 1], got %r"
                                 % self.max_samples)
            max_samples = int(self.max_samples * X.shape[0])

        self.max_samples_ = max_samples

        self.rules_ = {}
        self.estimators_ = []
        self.estimators_samples_ = []
        self.estimators_features_ = []

        # default column names:
        feature_names_ = [BASE_FEATURE_NAME + x for x in
                          np.arange(X.shape[1]).astype(str)]
        if self.feature_names is not None:
            self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                                  for i, feat in enumerate(self.feature_names)}
        else:
            self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                                  for i, feat in enumerate(feature_names_)}
        self.feature_names_ = feature_names_

        clfs = []
        regs = []

        self._max_depths = self.max_depth \
            if isinstance(self.max_depth, Iterable) else [self.max_depth]

        for max_depth in self._max_depths:
            bagging_clf = BaggingClassifier(
                base_estimator=DecisionTreeClassifier(
                    max_depth=max_depth,
                    max_features=self.max_features,
                    min_samples_split=self.min_samples_split),
                n_estimators=self.n_estimators,
                max_samples=self.max_samples_,
                max_features=self.max_samples_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                # oob_score=... XXX may be added
                # if selection on tree perf needed.
                # warm_start=... XXX may be added to increase computation perf.
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            bagging_reg = BaggingRegressor(
                base_estimator=DecisionTreeRegressor(
                    max_depth=max_depth,
                    max_features=self.max_features,
                    min_samples_split=self.min_samples_split),
                n_estimators=self.n_estimators,
                max_samples=self.max_samples_,
                max_features=self.max_samples_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                # oob_score=... XXX may be added
                # if selection on tree perf needed.
                # warm_start=... XXX may be added to increase computation perf.
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            clfs.append(bagging_clf)
            regs.append(bagging_reg)

        # define regression target:
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            weights = sample_weight - sample_weight.min()
            contamination = float(sum(y)) / len(y)
            y_reg = (
                    pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
                    pow((weights).mean(), 0.5) * (y == 0))
            y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
        else:
            y_reg = y  # same as any other classification bagging

        for clf in clfs:
            clf.fit(X, y)
            self.estimators_ += clf.estimators_
            self.estimators_samples_ += clf.estimators_samples_
            self.estimators_features_ += clf.estimators_features_

        for reg in regs:
            reg.fit(X, y_reg)
            self.estimators_ += reg.estimators_
            self.estimators_samples_ += reg.estimators_samples_
            self.estimators_features_ += reg.estimators_features_

        rules_ = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):

            # Create mask for OOB samples
            mask = ~samples
            if sum(mask) == 0:
                warn("OOB evaluation not possible: doing it in-bag."
                     " Performance evaluation is likely to be wrong"
                     " (overfitting) and selected rules are likely to"
                     " not perform well! Please use max_samples < 1.")
                mask = samples
            rules_from_tree = self._tree_to_rules(
                estimator, np.array(self.feature_names_)[features])

            # XXX todo: idem without dataframe
            X_oob = pandas.DataFrame((X[mask, :])[:, features],
                                     columns=np.array(
                                         self.feature_names_)[features])

            if X_oob.shape[1] > 1:  # otherwise pandas bug (cf. issue #16363)
                y_oob = y[mask]
                y_oob = np.array((y_oob != 0))

                # Add OOB performances to rules:
                rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                                   for r in set(rules_from_tree)]
                rules_ += rules_from_tree

        # Factorize rules before semantic tree filtering
        rules_ = [
            tuple(rule)
            for rule in
            [Rule(r, args=args) for r, args in rules_]]

        # keep only rules verifying precision_min and recall_min:
        for rule, score in rules_:
            if score[0] >= self.precision_min and score[1] >= self.recall_min:
                if rule in self.rules_:
                    # update the score to the new mean
                    c = self.rules_[rule][2] + 1
                    b = self.rules_[rule][1] + 1. / c * (
                            score[1] - self.rules_[rule][1])
                    a = self.rules_[rule][0] + 1. / c * (
                            score[0] - self.rules_[rule][0])

                    self.rules_[rule] = (a, b, c)
                else:
                    self.rules_[rule] = (score[0], score[1], 1)

        self.rules_ = sorted(self.rules_.items(),
                             key=lambda x: (x[1][0], x[1][1]), reverse=True)

        # Deduplicate the rules using the semantic tree
        if self.max_depth_duplication is not None:
            self.rules_ = self.deduplicate(self.rules_)

        self.rules_ = sorted(self.rules_, key=lambda x: - self.f1_score(x))
        self.rules_without_feature_names_ = self.rules_

        # Replace generic feature names by real feature names
        self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                       for rule, perf in self.rules_]

        return self
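The rule bookkeeping near the end of fit is an incremental mean: each time a rule reappears, its stored precision and recall move toward the new observation by 1/count, so the stored pair always equals the mean of all sightings. A tiny standalone check of that update (values invented):

scores = [(0.8, 0.4), (0.6, 0.6), (0.7, 0.5)]  # (precision, recall) per sighting

prec, rec, count = scores[0][0], scores[0][1], 1
for p, r in scores[1:]:
    count += 1
    rec = rec + (r - rec) / count
    prec = prec + (p - prec) / count

print(round(prec, 3), round(rec, 3))  # 0.7 0.5, the plain column means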
Exemplo n.º 49
0
    def _predict_proba(self, X):
        """Return probability estimates for the test data X.

        Parameters
        ----------
        X : 3D numpy array of shape (n, d, m), or (n_query, n_indexed) if
            metric == 'precomputed'
            Test samples.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            of such arrays if n_outputs > 1.
            The class probabilities of the input samples. Classes are ordered
            by lexicographic order.
        """
        self.check_is_fitted()

        # Temporarily swap check_array's code object for a time-series-aware
        # variant (_check_array_ts) so 3D input passes validation; the original
        # code object is restored after prediction below.
        if hasattr(check_array, "__wrapped__"):
            temp = check_array.__wrapped__.__code__
            check_array.__wrapped__.__code__ = _check_array_ts.__code__
        else:
            temp = check_array.__code__
            check_array.__code__ = _check_array_ts.__code__

        X = check_array(X, accept_sparse="csr")

        neigh_dist, neigh_ind = self.kneighbors(X)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_samples = X.shape[0]

        weights = _get_weights(neigh_dist, self.weights)
        if weights is None:
            weights = np.ones_like(neigh_ind)

        all_rows = np.arange(X.shape[0])
        probabilities = []
        for k, classes_k in enumerate(classes_):
            pred_labels = _y[:, k][neigh_ind]
            proba_k = np.zeros((n_samples, classes_k.size))

            # a simple ':' index doesn't work right
            for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
                proba_k[all_rows, idx] += weights[:, i]

            # normalize 'votes' into real [0,1] probabilities
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            normalizer[normalizer == 0.0] = 1.0
            proba_k /= normalizer

            probabilities.append(proba_k)

        if not self.outputs_2d_:
            probabilities = probabilities[0]

        if hasattr(check_array, "__wrapped__"):
            check_array.__wrapped__.__code__ = temp
        else:
            check_array.__code__ = temp
        return probabilities
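The probability loop above turns neighbor labels and per-neighbor weights into class probabilities by fancy-indexed accumulation. A compact numpy illustration of that accumulation and normalization (labels and weights invented):

import numpy as np

neigh_labels = np.array([[0, 2, 0],    # two queries, three neighbors each
                         [1, 1, 2]])
weights = np.array([[1.0, 0.5, 1.0],
                    [1.0, 1.0, 0.5]])

n_samples, n_classes = neigh_labels.shape[0], 3
proba = np.zeros((n_samples, n_classes))
all_rows = np.arange(n_samples)
for i, idx in enumerate(neigh_labels.T):   # loop over neighbor positions
    proba[all_rows, idx] += weights[:, i]

proba /= proba.sum(axis=1, keepdims=True)  # normalize votes to [0, 1]
print(proba)                               # [[0.8 0.  0.2] [0.  0.8 0.2]]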
Exemplo n.º 50
0
 def predict(self, X):
     check_is_fitted(self, 'coeffs_')
     X = check_array(X)
     recoded_X = self._recode(X)
     return np.dot(recoded_X, self.coeffs_)
Exemplo n.º 51
0
 def predict(self, X):
     # return ones if X has more than one sample, otherwise zeros
     X = check_array(X)
     if X.shape[0] > 1:
         return np.ones(X.shape[0])
     return np.zeros(X.shape[0])
Exemplo n.º 52
0
    def kneighbors(self, X, n_neighbors=None, return_distance=True):
        """Find the K-neighbors of a point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : sktime-format pandas DataFrame with shape (n_cases, n_dimensions),
            or numpy ndarray with shape (n_cases, n_readings, n_dimensions)
        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).
        return_distance : boolean, optional. Defaults to True.
            If False, distances will not be returned

        Returns
        -------
        dist : array
            Array representing the lengths to points, only present if
            return_distance=True
        ind : array
            Indices of the nearest points in the population matrix.
        """
        self.check_is_fitted()
        # Transpose to work correctly with distance functions
        X = X.transpose((0, 2, 1))

        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        elif n_neighbors <= 0:
            raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors)
        else:
            if not np.issubdtype(type(n_neighbors), np.integer):
                raise TypeError("n_neighbors does not take %s value, "
                                "enter integer value" % type(n_neighbors))

        if X is not None:
            query_is_train = False
            X = check_array(X, accept_sparse="csr", allow_nd=True)
        else:
            query_is_train = True
            X = self._fit_X
            # Include an extra neighbor to account for the sample itself being
            # returned, which is removed later
            n_neighbors += 1

        train_size = self._fit_X.shape[0]
        if n_neighbors > train_size:
            raise ValueError("Expected n_neighbors <= n_samples, "
                             " but n_samples = %d, n_neighbors = %d" %
                             (train_size, n_neighbors))
        n_samples = X.shape[0]
        sample_range = np.arange(n_samples)[:, None]

        n_jobs = effective_n_jobs(self.n_jobs)
        if self._fit_method == "brute":

            reduce_func = partial(
                self._kneighbors_reduce_func,
                n_neighbors=n_neighbors,
                return_distance=return_distance,
            )

            # for efficiency, use squared euclidean distances
            kwds = ({
                "squared": True
            } if self.effective_metric_ == "euclidean" else
                    self.effective_metric_params_)

            result = pairwise_distances_chunked(X,
                                                self._fit_X,
                                                reduce_func=reduce_func,
                                                metric=self.effective_metric_,
                                                n_jobs=n_jobs,
                                                **kwds)
        else:
            raise ValueError("internal: _fit_method not recognized")

        if return_distance:
            dist, neigh_ind = zip(*result)
            result = np.vstack(dist), np.vstack(neigh_ind)
        else:
            result = np.vstack(result)

        if not query_is_train:
            return result
        else:
            # If the query data is the same as the indexed data, we would like
            # to ignore the first nearest neighbor of every sample, i.e
            # the sample itself.
            if return_distance:
                dist, neigh_ind = result
            else:
                neigh_ind = result

            sample_mask = neigh_ind != sample_range

            # Corner case: When the number of duplicates are more
            # than the number of neighbors, the first NN will not
            # be the sample, but a duplicate.
            # In that case mask the first duplicate.
            dup_gr_nbrs = np.all(sample_mask, axis=1)
            sample_mask[:, 0][dup_gr_nbrs] = False

            neigh_ind = np.reshape(neigh_ind[sample_mask],
                                   (n_samples, n_neighbors - 1))

            if return_distance:
                dist = np.reshape(dist[sample_mask],
                                  (n_samples, n_neighbors - 1))
                return dist, neigh_ind
            return neigh_ind
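When the query set is the training set itself, the code above drops each sample from its own neighbor list; if a row contains only duplicates of other points, it drops the first column instead so every row loses exactly one entry. A standalone numpy sketch of that masking (indices invented):

import numpy as np

neigh_ind = np.array([[0, 2, 1],
                      [1, 0, 2],
                      [0, 1, 0]])   # row 2 never contains its own index 2
sample_range = np.arange(neigh_ind.shape[0])[:, None]

sample_mask = neigh_ind != sample_range
dup_gr_nbrs = np.all(sample_mask, axis=1)   # rows made of duplicates only
sample_mask[:, 0][dup_gr_nbrs] = False      # mask their first neighbor instead

trimmed = neigh_ind[sample_mask].reshape(neigh_ind.shape[0], -1)
print(trimmed)   # [[2 1] [0 2] [1 0]]: n_neighbors - 1 entries per row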
Exemplo n.º 53
0
    def _optimize(self, x, y, random_state):
        if not isinstance(self.regularization,
                          float) or self.regularization < 0:
            raise ValueError("regularization must be a positive float ")
        nb_prototypes, nb_features = self.w_.shape
        if self.initialdim is None:
            self.dim_ = nb_features
        elif not isinstance(self.initialdim, int) or self.initialdim <= 0:
            raise ValueError("dim must be an positive int")
        else:
            self.dim_ = self.initialdim

        if self.initial_matrix is None:
            if self.dim_ == nb_features:
                self.omega_ = np.eye(nb_features)
            else:
                self.omega_ = random_state.rand(self.dim_, nb_features) * 2 - 1
        else:
            self.omega_ = validation.check_array(self.initial_matrix)
            if self.omega_.shape[1] != nb_features:
                raise ValueError(
                    "initial matrix has wrong number of features\n"
                    "found=%d\n"
                    "expected=%d" % (self.omega_.shape[1], nb_features))

        variables = np.append(self.w_, self.omega_, axis=0)
        label_equals_prototype = y
        method = 'bfgs'
        res = minimize(
            fun=lambda vs:
            self._optfun(vs, x, label_equals_prototype=y),
            jac=lambda vs:
            self._optgrad(vs, x, label_equals_prototype=y,
                          random_state=random_state,
                          lr_prototypes=1, lr_relevances=0),
            method=method, x0=variables,
            options={'disp': self.display, 'gtol': self.gtol,
                     'maxiter': self.max_iter})
        n_iter = res.nit
        res = minimize(
            fun=lambda vs:
            self._optfun(vs, x, label_equals_prototype=label_equals_prototype),
            jac=lambda vs:
            self._optgrad(vs, x, label_equals_prototype=label_equals_prototype,
                          random_state=random_state,
                          lr_prototypes=0, lr_relevances=1),
            method=method, x0=res.x,
            options={'disp': self.display, 'gtol': self.gtol,
                     'maxiter': self.max_iter})
        n_iter = max(n_iter, res.nit)
        res = minimize(
            fun=lambda vs:
            self._optfun(vs, x, label_equals_prototype=label_equals_prototype),
            jac=lambda vs:
            self._optgrad(vs, x, label_equals_prototype=label_equals_prototype,
                          random_state=random_state,
                          lr_prototypes=1, lr_relevances=1),
            method=method, x0=res.x,
            options={'disp': self.display, 'gtol': self.gtol,
                     'maxiter': self.max_iter})
        n_iter = max(n_iter, res.nit)
        out = res.x.reshape(res.x.size // nb_features, nb_features)
        self.w_ = out[:nb_prototypes]
        self.omega_ = out[nb_prototypes:]
        self.omega_ /= np.math.sqrt(
            np.sum(np.diag(self.omega_.T.dot(self.omega_))))
        self.n_iter_ = n_iter
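The final step of _optimize rescales the relevance matrix so that trace(omega.T @ omega) equals 1. A short numpy check of that normalization (random omega, not actual learned values):

import numpy as np

rng = np.random.RandomState(0)
omega = rng.rand(4, 6) * 2 - 1

omega /= np.sqrt(np.sum(np.diag(omega.T.dot(omega))))
print(np.trace(omega.T.dot(omega)))   # ~1.0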
Exemplo n.º 54
0
    def transform(self, X, is_train_set=None):
        """Transform (predict) given data set.
        If ``X`` is train set:
            for each estimator return out-of-fold predictions (OOF).
        If ``X`` is any other set:
            variant A: for each estimator return mean (mode) of predictions
                made in each fold
            variant B: for each estimator return single prediction

        Parameters
        ----------
        X : 2d numpy array or sparse matrix of shape [n_samples, n_features]
            Input data

        is_train_set : boolean, default None
            Fallback parameter. In the general case it should not be used
                (leave as None).
            Allows explicitly specifying whether the given dataset is the
                train set or another set.

        Returns
        -------
        X_transformed : 2d numpy array of shape [n_samples, n_estimators] or
                        [n_samples, n_estimators * n_classes]
            Out-of-fold predictions (OOF) for train set.
            Regular or bagged predictions for any other set.
            This is stacked features for next level.
        """
        # Check if fitted
        check_is_fitted(self, ['models_A_'])

        # Input validation
        # ``check_estimator`` does not allow ``force_all_finite=False``
        X = check_array(X, accept_sparse=['csr'], force_all_finite=True)

        # *********************************************************************
        # A fitted StackingTransformer instance is bound to the train set used
        # for fitting, so transformation proceeds differently for the train set
        # and for any other set
        # *********************************************************************

        if is_train_set is None:
            is_train_set = self._check_identity(X)

        # Print
        if self.verbose > 0:
            if is_train_set:
                print('Train set was detected.')
            print('Transforming...\n')

        # *********************************************************************
        # Transform train set
        # *********************************************************************
        if is_train_set:

            # In case the user explicitly indicates this is the train set but the shape differs
            if self.train_shape_ != X.shape:
                raise ValueError('Train set must have the same shape '
                                 'in order to be transformed.')

            # Create empty numpy array for train predictions (OOF)
            S_train = np.zeros((X.shape[0], self.n_estimators_ * self.n_classes_implicit_))

            # -----------------------------------------------------------------
            # MAIN TRANSFORM (PREDICT) PROCEDURE for train set
            # -----------------------------------------------------------------
            # Loop across estimators
            # -----------------------------------------------------------------
            for estimator_counter, (name, estimator) in enumerate(self.estimators_):
                if self.verbose > 0:
                    estimator_str = 'estimator %2d: [%s: %s]' % (estimator_counter, name, estimator.__class__.__name__)
                    print(estimator_str)

                # -------------------------------------------------------------
                # Loop across folds
                # -------------------------------------------------------------
                for fold_counter, (tr_index, te_index) in enumerate(self.kf_.split(X, self._y_)):
                    # Split data
                    # X_tr = X[tr_index]
                    X_te = X[te_index]

                    # Predict out-of-fold part of train set
                    if 'predict_proba' == self.action_:
                        col_slice_estimator = slice(estimator_counter * self.n_classes_implicit_,
                                                    estimator_counter * self.n_classes_implicit_ + self.n_classes_implicit_)
                    else:
                        col_slice_estimator = estimator_counter
                    S_train[te_index, col_slice_estimator] = self._estimator_action(self.models_A_[estimator_counter][fold_counter],
                                                                                    None, None,
                                                                                    X_te, action=self.action_,
                                                                                    transform=self.transform_pred)
                    # Print
                    if self.verbose > 1:
                        fold_str = '    model from fold %2d: done' % fold_counter
                        print(fold_str)

                if self.verbose > 1:
                    sep_str = '    ----'
                    print(sep_str)

                if self.verbose > 0:
                    done_str = '    DONE\n'
                    print(done_str)

            # -----------------------------------------------------------------
            # Cast class labels to int
            # -----------------------------------------------------------------
            if not self.regression and not self.needs_proba:
                S_train = S_train.astype(int)

            # Return transformed data (OOF)
            return S_train  # X_transformed

        # *********************************************************************
        # Transform any other set
        # *********************************************************************
        else:
            # Check n_features
            if X.shape[1] != self.n_features_:
                raise ValueError('Inconsistent number of features.')

            # Create empty numpy array for test predictions
            S_test = np.zeros((X.shape[0], self.n_estimators_ * self.n_classes_implicit_))

            # ---------------------------------------------------------------------
            # MAIN TRANSFORM (PREDICT) PROCEDURE for any other set
            # -----------------------------------------------------------------
            # Loop across estimators
            # -----------------------------------------------------------------
            for estimator_counter, (name, estimator) in enumerate(self.estimators_):
                if self.verbose > 0:
                    estimator_str = 'estimator %2d: [%s: %s]' % (estimator_counter, name, estimator.__class__.__name__)
                    print(estimator_str)
                # -------------------------------------------------------------
                # Variant A
                # -------------------------------------------------------------
                if self.variant in ['A']:
                    # Create empty numpy array, which will contain temporary predictions
                    # for test set made in each fold
                    S_test_temp = np.zeros((X.shape[0], self.n_folds * self.n_classes_implicit_))
                    # ---------------------------------------------------------
                    # Loop across fitted models (it is the same as loop across folds)
                    # ---------------------------------------------------------
                    for fold_counter, model in enumerate(self.models_A_[estimator_counter]):
                        # Predict test set in each fold
                        if 'predict_proba' == self.action_:
                            col_slice_fold = slice(fold_counter * self.n_classes_implicit_,
                                                   fold_counter * self.n_classes_implicit_ + self.n_classes_implicit_)
                        else:
                            col_slice_fold = fold_counter
                        S_test_temp[:, col_slice_fold] = self._estimator_action(model, None, None, X,
                                                                                action=self.action_,
                                                                                transform=self.transform_pred)
                        # Print
                        if self.verbose > 1:
                            fold_str = '    model from fold %2d: done' % fold_counter
                            print(fold_str)

                    if self.verbose > 1:
                        sep_str = '    ----'
                        print(sep_str)

                    # ---------------------------------------------------------
                    # Compute mean or mode (majority voting) of predictions for test set
                    # ---------------------------------------------------------
                    if 'predict_proba' == self.action_:
                        # Here we compute the mean of the probabilities for each class
                        for class_id in range(self.n_classes_implicit_):
                            S_test[:, estimator_counter * self.n_classes_implicit_ + class_id] = np.mean(S_test_temp[:, class_id::self.n_classes_implicit_], axis=1)
                    else:
                        if self.regression:
                            S_test[:, estimator_counter] = np.mean(S_test_temp, axis=1)
                        else:
                            S_test[:, estimator_counter] = st.mode(S_test_temp, axis=1)[0].ravel()

                    if self.verbose > 0:
                        done_str = '    DONE\n'
                        print(done_str)

                # -------------------------------------------------------------
                # Variant B
                # -------------------------------------------------------------
                else:
                    if 'predict_proba' == self.action_:
                        col_slice_estimator = slice(estimator_counter * self.n_classes_implicit_,
                                                    estimator_counter * self.n_classes_implicit_ + self.n_classes_implicit_)
                    else:
                        col_slice_estimator = estimator_counter
                    S_test[:, col_slice_estimator] = self._estimator_action(self.models_B_[estimator_counter],
                                                                            None, None, X,
                                                                            action=self.action_,
                                                                            transform=self.transform_pred)

                    if self.verbose > 0:
                        done_str = '    DONE\n'
                        print(done_str)

            # ---------------------------------------------------------------------
            # Cast class labels to int
            # ---------------------------------------------------------------------
            if not self.regression and not self.needs_proba:
                S_test = S_test.astype(int)

            return S_test  # X_transformed
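For variant A the per-fold test predictions are laid out side by side (n_folds * n_classes columns per estimator) and then averaged class-wise, as in the loop above. A minimal numpy illustration of that class-wise averaging with made-up probabilities for three folds and two classes:

import numpy as np

# Columns: [fold0_class0, fold0_class1, fold1_class0, fold1_class1, ...]
S_test_temp = np.array([[0.9, 0.1, 0.8, 0.2, 0.7, 0.3],
                        [0.2, 0.8, 0.4, 0.6, 0.3, 0.7]])
n_classes = 2

S_est = np.zeros((S_test_temp.shape[0], n_classes))
for class_id in range(n_classes):
    # every n_classes-th column starting at class_id belongs to this class
    S_est[:, class_id] = np.mean(S_test_temp[:, class_id::n_classes], axis=1)

print(S_est)   # [[0.8 0.2] [0.3 0.7]]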
Exemplo n.º 55
0
 def transform(self, X):
     X = check_array(X)
     if X.shape[1] != self.X_shape_[1]:
         raise ValueError('Bad number of features')
     return sp.csr_matrix(X)
Exemplo n.º 56
0
def check_array(array, *args, **kwargs):
    """Validate inputs

    Parameters
    ----------
    accept_dask_array : bool, default True
    accept_dask_dataframe : bool, default False
    preserve_pandas_dataframe : bool, default False
        Whether to return a pandas DataFrame unchanged rather than converting
        it to an ndarray
    accept_unknown_chunks : bool, default False
        For dask Arrays, whether to allow the `.chunks` attribute to contain
        any unknown values
    accept_multiple_blocks : bool, default False
        For dask Arrays, whether to allow multiple blocks along the second
        axis.
    *args, **kwargs : tuple, dict
        Passed through to scikit-learn

    Returns
    -------
    array : obj
        Same type as the input

    Notes
    -----
    For dask.array, a small numpy array emulating ``array`` is created
    and passed to scikit-learn's ``check_array`` with all the additional
    arguments.
    """
    accept_dask_array = kwargs.pop("accept_dask_array", True)
    preserve_pandas_dataframe = kwargs.pop("preserve_pandas_dataframe", False)
    accept_dask_dataframe = kwargs.pop("accept_dask_dataframe", False)
    accept_unknown_chunks = kwargs.pop("accept_unknown_chunks", False)
    accept_multiple_blocks = kwargs.pop("accept_multiple_blocks", False)

    if isinstance(array, da.Array):
        if not accept_dask_array:
            raise TypeError
        if not accept_unknown_chunks:
            if np.isnan(array.shape[0]):
                raise TypeError(
                    "Cannot operate on Dask array with unknown chunk sizes."
                )
        if not accept_multiple_blocks and array.ndim > 1:
            if len(array.chunks[1]) > 1:
                msg = (
                    "Chunking is only allowed on the first axis. "
                    "Use 'array.rechunk({1: array.shape[1]})' to "
                    "rechunk to a single block along the second axis."
                )
                raise TypeError(msg)

        # hmmm, we want to catch things like shape errors.
        # I'd like to make a small sample somehow
        shape = array.shape
        if len(shape) == 2:
            shape = (min(10, shape[0]), shape[1])
        elif len(shape) == 1:
            shape = min(10, shape[0])

        sample = np.ones(shape=shape, dtype=array.dtype)
        sk_validation.check_array(sample, *args, **kwargs)
        return array

    elif isinstance(array, dd.DataFrame):
        if not accept_dask_dataframe:
            raise TypeError("This estimator does not support dask dataframes.")
        # TODO: sample?
        return array
    elif isinstance(array, pd.DataFrame) and preserve_pandas_dataframe:
        # TODO: validation?
        return array
    else:
        return sk_validation.check_array(array, *args, **kwargs)
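A minimal sketch exercising the dask-aware check_array above, assuming dask.array is installed. Chunking along the second axis is rejected by default; the rechunk call mirrors the hint in the error message.

import dask.array as da

arr = da.ones((100, 5), chunks=(10, 5))   # chunked only along the first axis
checked = check_array(arr)                # validated on a small numpy sample, returned as-is

bad = da.ones((100, 5), chunks=(10, 3))   # chunked along the second axis too
# check_array(bad) would raise TypeError; rechunk to a single block first:
ok = check_array(bad.rechunk({1: bad.shape[1]}))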
Exemplo n.º 57
0
 def predict(self, X):
     X = check_array(X)
     return np.array([self.value_] * X.shape[0])
Exemplo n.º 58
0
 def predict(self, X):
     X = check_array(X)
     self.key = 1000
     return np.ones(X.shape[0])
Exemplo n.º 59
0
 def fit(self, X, y=None):
     check_array(X)
     self.is_fitted_ = True
     return self
Exemplo n.º 60
0
    def is_stationary(self, x):
        """Test whether the time series is stationary.

        Parameters
        ----------
        x : array-like, shape=(n_samples,)
            The time series vector.
        """
        if not self._base_case(x):
            return np.nan, False

        # ensure vector
        x = column_or_1d(
            check_array(x, ensure_2d=False, dtype=DTYPE,
                        force_all_finite=True))  # type: np.ndarray

        # if k is none...
        k = self.k
        if k is None:
            k = np.trunc(np.power(x.shape[0] - 1, 1 / 3.0))

        k = int(k) + 1
        y = diff(x)
        n = y.shape[0]
        z = self._embed(y, k)
        yt = z[0, :]
        tt = np.arange(k - 1, n)

        # R does [k:n].. but that's 1-based indexing and inclusive on the tail
        xt1 = x[tt]

        # make tt inclusive again (it was used as a mask before)
        tt += 1

        # the array that will create the LM:
        _n = xt1.shape[0]
        X = np.hstack([
            xt1.reshape((_n, 1)),
            np.ones(_n).reshape((_n, 1)),
            tt.reshape((_n, 1))
        ])

        if k > 1:
            yt1 = z[1:k, :]  # R had 2:k
            X = np.hstack([X, yt1.T])

        # fit the linear regression - this one is a bit strange in that we
        # are using OLS from statsmodels rather than LR from sklearn. This is
        # because we need the std errors, and sklearn does not have a way to
        # store them.
        res = sm.OLS(yt, X).fit()
        STAT = res.params[0] / res.HC0_se[0]  # FIXME: is the denom correct?...
        table = -np.array([
            c(4.38, 4.15, 4.04, 3.99, 3.98, 3.96),
            c(3.95, 3.80, 3.73, 3.69, 3.68, 3.66),
            c(3.60, 3.50, 3.45, 3.43, 3.42, 3.41),
            c(3.24, 3.18, 3.15, 3.13, 3.13, 3.12),
            c(1.14, 1.19, 1.22, 1.23, 1.24, 1.25),
            c(0.80, 0.87, 0.90, 0.92, 0.93, 0.94),
            c(0.50, 0.58, 0.62, 0.64, 0.65, 0.66),
            c(0.15, 0.24, 0.28, 0.31, 0.32, 0.33)
        ]).T

        tablen = table.shape[1]
        tableT = c(25, 50, 100, 250, 500, 100000)
        tablep = c(0.01, 0.025, 0.05, 0.10, 0.90, 0.95, 0.975, 0.99)

        tableipl = np.zeros(tablen)
        for i in range(tablen):
            _, pval = approx(tableT, table[:, i], xout=n, rule=2)
            tableipl[i] = pval

        # make sure to do 1 - x...
        _, interpol = approx(tableipl, tablep, xout=STAT, rule=2)
        pval = 1 - interpol[0]

        # in the R code, here is where the P value warning is tested again...
        return pval, pval < self.alpha
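The two approx calls above mimic R's linear interpolation with constant extrapolation at the boundaries (rule=2): the critical-value table is first interpolated to the sample size n, and the statistic is then mapped onto the tabulated probabilities. A rough standalone illustration with np.interp, which also clamps at the endpoints; the numbers are invented, not the real table:

import numpy as np

tableT = np.array([25, 50, 100, 250, 500, 100000], dtype=float)
col = np.array([-4.38, -4.15, -4.04, -3.99, -3.98, -3.96])  # one table column

n = 80
crit_at_n = np.interp(n, tableT, col)       # stage 1: interpolate over sample size

tableipl = np.array([-4.30, -3.90, -3.55, -3.20, -1.20, -0.90, -0.60, -0.30])
tablep = np.array([0.01, 0.025, 0.05, 0.10, 0.90, 0.95, 0.975, 0.99])
stat = -3.7
pval = 1 - np.interp(stat, tableipl, tablep)  # stage 2: statistic -> p-value
print(crit_at_n, pval)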