Пример #1
0
    def fit(self, Z, classes=None):
        """
        Fit Multinomial Naive Bayes on the blocks of a distributed dataset.

        Every block of Z is fitted on its own through ``partial_fit``; the
        per-block models are then summed and the state of the combined model
        is copied onto this estimator.

        Parameters
        ----------
        Z : distributed dataset exposing ``columns``, with an 'X' column
            [array-like, shape (m_samples, n_features)] and a 'y' column
            [array-like, shape (m_samples,)]
            Training vectors and target values, where m_samples is the number
            of samples in a block and n_features is the number of features.
            When a 'w' column is present it is forwarded to ``partial_fit``
            as the fourth positional argument (presumably the per-sample
            weights -- confirm against ``partial_fit``).
        classes : optional, default None
            Forwarded unchanged to ``partial_fit`` for every block.

        Returns
        -------
        self : object
            Returns self.
        """
        accepted = (sp.spmatrix, np.ndarray)
        check_rdd(Z, {'X': accepted, 'y': accepted})

        # Pick the column selection and the matching per-block fitter once,
        # so the map/sum pipeline below is shared by both cases.
        if 'w' in Z.columns:
            columns = ['X', 'y', 'w']

            def fit_block(block):
                return self.partial_fit(block[0], block[1], classes, block[2])
        else:
            columns = ['X', 'y']

            def fit_block(block):
                return self.partial_fit(block[0], block[1], classes)

        combined = Z[:, columns].map(fit_block).sum()
        # Adopt the merged model's attributes as this estimator's state.
        self.__dict__.update(combined.__dict__)
        return self
Пример #2
0
    def fit(self, Z, classes=None):
        """
        Fit Multinomial Naive Bayes from the (X, y) blocks stored in Z.

        A model is fitted per block with ``partial_fit``, the per-block
        models are added together with ``sum()``, and the attributes of the
        resulting model replace those of this estimator.

        Parameters
        ----------
        Z : distributed dataset exposing ``columns``, with an 'X' column
            [array-like, shape (m_samples, n_features)] and a 'y' column
            [array-like, shape (m_samples,)]
            Training vectors, where m_samples is the number of samples in the
            block and n_features is the number of features, and y contains
            the target values. An optional 'w' column, if present, is passed
            to ``partial_fit`` after ``classes`` (presumably sample
            weights -- confirm against ``partial_fit``).
        classes : optional, default None
            Handed to ``partial_fit`` as-is for each block.

        Returns
        -------
        self : object
            Returns self.
        """
        block_types = (sp.spmatrix, np.ndarray)
        check_rdd(Z, {'X': block_types, 'y': block_types})

        def fit_plain(block):
            # block is indexed as (X, y)
            return self.partial_fit(block[0], block[1], classes)

        def fit_weighted(block):
            # block is indexed as (X, y, w)
            return self.partial_fit(block[0], block[1], classes, block[2])

        if 'w' in Z.columns:
            partial_models = Z[:, ['X', 'y', 'w']].map(fit_weighted)
        else:
            partial_models = Z[:, ['X', 'y']].map(fit_plain)

        total = partial_models.sum()
        # Take over the state of the summed model.
        self.__dict__.update(total.__dict__)
        return self
Пример #3
0
    def fit(self, Z, classes=None):
        """Fit Gaussian Naive Bayes from the (X, y) blocks stored in Z.

        Each block is fitted separately with ``partial_fit``; the per-block
        models are merged pairwise with ``+`` and the merged model's
        attributes are copied onto this estimator.

        Parameters
        ----------
        Z : distributed dataset with an 'X' column
            [array-like, shape (n_samples, n_features)] and a 'y' column
            [array-like, shape (n_samples,)]
            Training vectors and target values, where n_samples is the
            number of samples and n_features is the number of features.
        classes : optional, default None
            Forwarded unchanged to ``partial_fit`` for every block.

        Returns
        -------
        self : object
            Returns self.
        """
        accepted = (sp.spmatrix, np.ndarray)
        check_rdd(Z, {'X': accepted, 'y': accepted})

        def fit_block(block):
            # block is indexed as (X, y)
            return self.partial_fit(block[0], block[1], classes)

        per_block = Z[:, ['X', 'y']].map(fit_block)
        merged = per_block.reduce(operator.add)
        # Adopt the merged model's attributes as this estimator's state.
        self.__dict__.update(merged.__dict__)
        return self
Пример #4
0
    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vectors in X.

        Parameters
        ----------
        X : BlockRDD containing array-like items,
            shape = [m_samples, n_features]
            Any input that is not a BlockRDD is handed directly to the
            parent-class implementation.

        Returns
        -------
        C : RDD with array-like items, shape = [n_samples, n_classes]
            The log-probability of the samples for each class in the model,
            computed per RDD block. The columns correspond to the classes
            in sorted order, as they appear in the attribute `classes_`.
            For non-BlockRDD input, whatever the parent-class
            ``predict_log_proba`` returns.
        """
        if isinstance(X, BlockRDD):
            check_rdd(X, (sp.spmatrix, np.ndarray))
            return X.map(
                lambda block:
                super(SparkBaseNB, self).predict_log_proba(block))

        # scikit-learn's predict_proba calls self.predict_log_proba(X), which
        # lands here with a plain (non-RDD) input; in that case this method
        # has to behave exactly like the parent implementation.
        return super(SparkBaseNB, self).predict_log_proba(X)
Пример #5
0
    def predict(self, X):
        """
        Classify the test vectors held in each block of an RDD.

        X is first validated with ``check_rdd`` against scipy sparse
        matrices and numpy arrays; the parent-class ``predict`` is then
        mapped over its blocks.

        Parameters
        ----------
        X : RDD containing array-like items, shape = [m_samples, n_features]

        Returns
        -------
        C : RDD with arrays, shape = [n_samples]
            Predicted target values for X
        """
        check_rdd(X, (sp.spmatrix, np.ndarray))

        def predict_block(block):
            return super(SparkBaseNB, self).predict(block)

        return X.map(predict_block)
Пример #6
0
    def predict(self, X):
        """
        Perform classification block by block on an RDD of test vectors.

        After ``check_rdd`` has validated X against scipy sparse matrices
        and numpy arrays, each block is passed to the parent-class
        ``predict``.

        Parameters
        ----------
        X : RDD containing array-like items, shape = [m_samples, n_features]

        Returns
        -------
        C : RDD with arrays, shape = [n_samples]
            Predicted target values for X
        """
        allowed_block_types = (sp.spmatrix, np.ndarray)
        check_rdd(X, allowed_block_types)
        predictions = X.map(
            lambda block: super(SparkBaseNB, self).predict(block))
        return predictions
Пример #7
0
    def predict_proba(self, X):
        """
        Return probability estimates for the test vectors in each RDD block.

        X is validated with ``check_rdd`` against scipy sparse matrices and
        numpy arrays, then the parent-class ``predict_proba`` is mapped over
        its blocks.

        Parameters
        ----------
        X : RDD containing array-like items, shape = [m_samples, n_features]

        Returns
        -------
        C : RDD with array-like items, shape = [n_samples, n_classes]
            The probability of the samples for each class in the model,
            computed per RDD block. The columns correspond to the classes
            in sorted order, as they appear in the attribute `classes_`.
        """
        check_rdd(X, (sp.spmatrix, np.ndarray))

        def proba_for_block(block):
            return super(SparkBaseNB, self).predict_proba(block)

        return X.map(proba_for_block)
Пример #8
0
    def predict_proba(self, X):
        """
        Compute per-block class probability estimates for an RDD of test
        vectors.

        ``check_rdd`` validates X against scipy sparse matrices and numpy
        arrays before the parent-class ``predict_proba`` is applied to every
        block.

        Parameters
        ----------
        X : RDD containing array-like items, shape = [m_samples, n_features]

        Returns
        -------
        C : RDD with array-like items, shape = [n_samples, n_classes]
            The probability of the samples for each class in the model,
            for each RDD block. The columns correspond to the classes in
            sorted order, as they appear in the attribute `classes_`.
        """
        allowed_block_types = (sp.spmatrix, np.ndarray)
        check_rdd(X, allowed_block_types)
        probabilities = X.map(
            lambda block: super(SparkBaseNB, self).predict_proba(block))
        return probabilities
Пример #9
0
    def fit(self, Z, classes=None):
        """Fit Gaussian Naive Bayes on the (X, y) blocks of Z.

        A model is fitted per block through ``partial_fit``; the per-block
        models are combined with ``sum()`` and the combined model's
        attributes are copied onto this estimator.

        Parameters
        ----------
        Z : distributed dataset with an 'X' column
            [array-like, shape (n_samples, n_features)] and a 'y' column
            [array-like, shape (n_samples,)]
            Training vectors and target values, where n_samples is the
            number of samples and n_features is the number of features.
        classes : optional, default None
            Handed to ``partial_fit`` as-is for each block.

        Returns
        -------
        self : object
            Returns self.
        """
        block_types = (sp.spmatrix, np.ndarray)
        check_rdd(Z, {'X': block_types, 'y': block_types})

        def fit_block(block):
            # block is indexed as (X, y)
            return self.partial_fit(block[0], block[1], classes)

        combined = Z[:, ['X', 'y']].map(fit_block).sum()
        # Take over the state of the summed model.
        self.__dict__.update(combined.__dict__)
        return self
Пример #10
0
    def predict_log_proba(self, X):
        """
        Compute log-probability estimates for the test vectors in X.

        Parameters
        ----------
        X : BlockRDD containing array-like items,
            shape = [m_samples, n_features]
            Input that is not a BlockRDD is delegated unchanged to the
            parent-class implementation.

        Returns
        -------
        C : RDD with array-like items, shape = [n_samples, n_classes]
            The log-probability of the samples for each class in the model,
            for each RDD block. The columns correspond to the classes in
            sorted order, as they appear in the attribute `classes_`.
            For non-BlockRDD input, the parent-class result is returned
            directly.
        """
        parent = super(SparkBaseNB, self)

        # scikit-learn's predict_proba calls self.predict_log_proba(X)
        # internally, so this method is also reached with plain (non-RDD)
        # input and must then behave exactly like the parent implementation.
        is_distributed = isinstance(X, BlockRDD)
        if not is_distributed:
            return parent.predict_log_proba(X)

        check_rdd(X, (sp.spmatrix, np.ndarray))

        def log_proba_for_block(block):
            # Resolve the parent method per block, inside the mapped function.
            return super(SparkBaseNB, self).predict_log_proba(block)

        return X.map(log_proba_for_block)