def fit(self, Z, classes=None):
    """Fit Multinomial Naive Bayes from an (X, y) TupleRDD ``Z``.

    Parameters
    ----------
    Z : TupleRDD
        Holds X [array-like, shape (m_samples, n_features)] and
        y [array-like, shape (m_samples,)], where m_samples is the
        number of samples per block and n_features the number of
        features. An optional 'w' column carries per-sample weights.
    classes : array-like, optional
        Forwarded to ``partial_fit`` for each block.

    Returns
    -------
    self : object
        Returns self.
    """
    check_rdd(Z, {'X': (sp.spmatrix, np.ndarray),
                  'y': (sp.spmatrix, np.ndarray)})
    # Fit one model per block, then merge them; the weighted path
    # forwards the 'w' column as sample weights to partial_fit.
    if 'w' in Z.columns:
        models = Z[:, ['X', 'y', 'w']].map(
            lambda block: self.partial_fit(block[0], block[1], classes,
                                           block[2]))
    else:
        models = Z[:, ['X', 'y']].map(
            lambda block: self.partial_fit(block[0], block[1], classes))
    merged = models.sum()
    # Adopt the merged model's fitted state wholesale.
    self.__dict__.update(merged.__dict__)
    return self
def fit(self, Z, classes=None):
    """Fit Multinomial Naive Bayes from an (X, y) TupleRDD ``Z``.

    Parameters
    ----------
    Z : TupleRDD
        Holds X [array-like, shape (m_samples, n_features)] and
        y [array-like, shape (m_samples,)], where m_samples is the
        number of samples per block and n_features the number of
        features. An optional 'w' column carries per-sample weights.
    classes : array-like, optional
        Forwarded to ``partial_fit`` for each block.

    Returns
    -------
    self : object
        Returns self.
    """
    check_rdd(Z, {'X': (sp.spmatrix, np.ndarray),
                  'y': (sp.spmatrix, np.ndarray)})
    # Train a per-block model; use sample weights when 'w' is present.
    if 'w' in Z.columns:
        blocks = Z[:, ['X', 'y', 'w']]
        trained = blocks.map(
            lambda item: self.partial_fit(item[0], item[1], classes,
                                          item[2]))
    else:
        blocks = Z[:, ['X', 'y']]
        trained = blocks.map(
            lambda item: self.partial_fit(item[0], item[1], classes))
    combined = trained.sum()
    # Copy the combined model's fitted attributes onto this estimator.
    self.__dict__.update(combined.__dict__)
    return self
def fit(self, Z, classes=None):
    """Fit Gaussian Naive Bayes from an (X, y) TupleRDD ``Z``.

    Parameters
    ----------
    Z : TupleRDD
        Holds X [array-like, shape (n_samples, n_features)] training
        vectors and y [array-like, shape (n_samples,)] target values.
    classes : array-like, optional
        Forwarded to ``partial_fit`` for each block.

    Returns
    -------
    self : object
        Returns self.
    """
    check_rdd(Z, {'X': (sp.spmatrix, np.ndarray),
                  'y': (sp.spmatrix, np.ndarray)})
    # Fit one model per (X, y) block, then fold them together.
    fitted = Z[:, ['X', 'y']].map(
        lambda pair: self.partial_fit(pair[0], pair[1], classes))
    combined = fitted.reduce(operator.add)
    # Adopt the combined model's fitted state.
    self.__dict__.update(combined.__dict__)
    return self
def predict_log_proba(self, X):
    """Return log-probability estimates for the RDD of test vectors X.

    Parameters
    ----------
    X : RDD containing array-like items, shape = [m_samples, n_features]

    Returns
    -------
    C : RDD with array-like items, shape = [n_samples, n_classes]
        Log-probability of the samples for each class in the model,
        per RDD block. Columns correspond to the classes in sorted
        order, as they appear in the attribute ``classes_``.
    """
    # scikit-learn calls self.predict_log_proba(X) from predict_proba,
    # so when handed plain (non-RDD) data this method must behave
    # exactly like the parent implementation.
    if not isinstance(X, BlockRDD):
        return super(SparkBaseNB, self).predict_log_proba(X)
    check_rdd(X, (sp.spmatrix, np.ndarray))
    return X.map(
        lambda blk: super(SparkBaseNB, self).predict_log_proba(blk))
def predict(self, X):
    """Classify each block of test vectors in the RDD X.

    Parameters
    ----------
    X : RDD containing array-like items, shape = [m_samples, n_features]

    Returns
    -------
    C : RDD with arrays, shape = [n_samples]
        Predicted target values for X.
    """
    check_rdd(X, (sp.spmatrix, np.ndarray))
    # Delegate per-block prediction to the parent estimator.
    return X.map(lambda blk: super(SparkBaseNB, self).predict(blk))
def predict(self, X):
    """Classify each block of test vectors in the RDD X.

    Parameters
    ----------
    X : RDD containing array-like items, shape = [m_samples, n_features]

    Returns
    -------
    C : RDD with arrays, shape = [n_samples]
        Predicted target values for X.
    """
    check_rdd(X, (sp.spmatrix, np.ndarray))
    # Each block is handed to the parent estimator's predict.
    return X.map(
        lambda block: super(SparkBaseNB, self).predict(block))
def predict_proba(self, X):
    """Return probability estimates for the RDD of test vectors X.

    Parameters
    ----------
    X : RDD containing array-like items, shape = [m_samples, n_features]

    Returns
    -------
    C : RDD with array-like items, shape = [n_samples, n_classes]
        Probability of the samples for each class in the model, per
        RDD block. Columns correspond to the classes in sorted order,
        as they appear in the attribute ``classes_``.
    """
    check_rdd(X, (sp.spmatrix, np.ndarray))
    # Per-block probabilities come from the parent estimator.
    return X.map(lambda blk: super(SparkBaseNB, self).predict_proba(blk))
def predict_proba(self, X):
    """Return probability estimates for the RDD of test vectors X.

    Parameters
    ----------
    X : RDD containing array-like items, shape = [m_samples, n_features]

    Returns
    -------
    C : RDD with array-like items, shape = [n_samples, n_classes]
        Probability of the samples for each class in the model, per
        RDD block. Columns correspond to the classes in sorted order,
        as they appear in the attribute ``classes_``.
    """
    check_rdd(X, (sp.spmatrix, np.ndarray))
    # Map the parent estimator's predict_proba over every block.
    return X.map(
        lambda block: super(SparkBaseNB, self).predict_proba(block))
def fit(self, Z, classes=None):
    """Fit Gaussian Naive Bayes from an (X, y) TupleRDD ``Z``.

    Parameters
    ----------
    Z : TupleRDD
        Holds X [array-like, shape (n_samples, n_features)] training
        vectors and y [array-like, shape (n_samples,)] target values.
    classes : array-like, optional
        Forwarded to ``partial_fit`` for each block.

    Returns
    -------
    self : object
        Returns self.
    """
    check_rdd(Z, {'X': (sp.spmatrix, np.ndarray),
                  'y': (sp.spmatrix, np.ndarray)})
    # One model per block; summing merges them into a single model.
    per_block = Z[:, ['X', 'y']].map(
        lambda pair: self.partial_fit(pair[0], pair[1], classes))
    merged = per_block.sum()
    # Take over the merged model's fitted attributes.
    self.__dict__.update(merged.__dict__)
    return self
def predict_log_proba(self, X):
    """Return log-probability estimates for the RDD of test vectors X.

    Parameters
    ----------
    X : RDD containing array-like items, shape = [m_samples, n_features]

    Returns
    -------
    C : RDD with array-like items, shape = [n_samples, n_classes]
        Log-probability of the samples for each class in the model,
        per RDD block. Columns correspond to the classes in sorted
        order, as they appear in the attribute ``classes_``.
    """
    # scikit-learn's predict_proba delegates to predict_log_proba, so
    # this override must fall back to the parent behavior whenever it
    # receives plain (non-RDD) input.
    if not isinstance(X, BlockRDD):
        return super(SparkBaseNB, self).predict_log_proba(X)
    check_rdd(X, (sp.spmatrix, np.ndarray))
    return X.map(
        lambda block: super(SparkBaseNB, self).predict_log_proba(block))