def test_csc_row_median():
    # Test csc_row_median actually calculates the median.

    # Test that it gives the same output when X is dense.
    rng = np.random.RandomState(0)
    X = rng.rand(100, 50)
    dense_median = np.median(X, axis=0)
    csc = sp.csc_matrix(X)
    sparse_median = csc_median_axis_0(csc)
    assert_array_equal(sparse_median, dense_median)

    # Test that it gives the same output when X is sparse
    X = rng.rand(51, 100)
    X[X < 0.7] = 0.0
    ind = rng.randint(0, 50, 10)
    X[ind] = -X[ind]
    csc = sp.csc_matrix(X)
    dense_median = np.median(X, axis=0)
    sparse_median = csc_median_axis_0(csc)
    assert_array_equal(sparse_median, dense_median)

    # Test for toy data.
    X = [[0, -2], [-1, -1], [1, 0], [2, 1]]
    csc = sp.csc_matrix(X)
    assert_array_equal(csc_median_axis_0(csc), np.array([0.5, -0.5]))

    X = [[0, -2], [-1, -5], [1, -3]]
    csc = sp.csc_matrix(X)
    assert_array_equal(csc_median_axis_0(csc), np.array([0., -3]))

    # Test that it raises an Error for non-csc matrices.
    assert_raises(TypeError, csc_median_axis_0, sp.csr_matrix(X))
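# A small hedged illustration (not part of the test above): csc_median_axis_0
# from sklearn.utils.sparsefuncs includes the implicitly-stored zeros of each
# column, so it agrees with np.median on the dense equivalent.
import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import csc_median_axis_0

X = sp.csc_matrix([[0, -2], [-1, -1], [1, 0], [2, 1]])
print(csc_median_axis_0(X))             # -> [ 0.5 -0.5]
print(np.median(X.toarray(), axis=0))   # same values from the dense array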
def test_kernel_shap_with_a1a_sparse_nonzero_background():
    np.set_printoptions(threshold=100000)
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.utils.sparsefuncs import csc_median_axis_0
    import shap
    np.random.seed(0)
    X, y = shap.datasets.a1a()  # pylint: disable=unbalanced-tuple-unpacking
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=0)
    linear_model = LinearRegression()
    linear_model.fit(x_train, y_train)
    # Calculate median of background data
    median_dense = csc_median_axis_0(x_train.tocsc())
    median = sp.sparse.csr_matrix(median_dense)
    explainer = shap.KernelExplainer(linear_model.predict, median)
    shap_values = explainer.shap_values(x_test)

    def dense_to_sparse_predict(data):
        sparse_data = sp.sparse.csr_matrix(data)
        return linear_model.predict(sparse_data)

    explainer_dense = shap.KernelExplainer(
        dense_to_sparse_predict, median_dense.reshape((1, len(median_dense))))
    x_test_dense = x_test.toarray()
    shap_values_dense = explainer_dense.shap_values(x_test_dense)
    # Validate that the sparse and dense results are the same
    assert np.allclose(shap_values, shap_values_dense, rtol=1e-02, atol=1e-01)
def _summarize_data(X, k=10, to_round_values=True):
    """Summarize a dataset.

    For a dense dataset, use k mean samples weighted by the number of data points
    they each represent.
    For a sparse dataset, use a sparse row for the background with the calculated
    median for dense columns.

    :param X: Matrix of data samples to summarize (# samples x # features).
    :type X: numpy.array or pandas.DataFrame or scipy.sparse.csr_matrix
    :param k: Number of cluster centroids to use for approximation.
    :type k: int
    :param to_round_values: When using kmeans, round each element of every cluster
        centroid to the nearest value from X in the corresponding dimension.
        This ensures discrete features always get a valid value.
        Ignored for sparse data samples.
    :type to_round_values: bool
    :return: DenseData or SparseData object.
    :rtype: iml.datatypes.DenseData or iml.datatypes.SparseData
    """
    is_sparse = issparse(X)
    if not isinstance(X, DenseData):
        if is_sparse:
            module_logger.debug('Creating sparse data summary as csr matrix')
            # calculate median of sparse background data
            median_dense = csc_median_axis_0(X.tocsc())
            return csr_matrix(median_dense)
        elif len(X) > 10 * k:
            module_logger.debug('Create dense data summary with k-means')
            # use kmeans to summarize the examples for initialization
            # if there are more than 10 x k of them
            return shap.kmeans(X, k, to_round_values)
    return X
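# Minimal sketch of the sparse branch of _summarize_data above: the background
# becomes a single csr row holding the column-wise medians of the data. Only
# scipy and scikit-learn are assumed here; DenseData and shap.kmeans are not
# needed for this path.
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import csc_median_axis_0

X_sparse = sp.random(200, 30, density=0.1, format='csr', random_state=0)
median_dense = csc_median_axis_0(X_sparse.tocsc())  # 1-D array of column medians
background = sp.csr_matrix(median_dense)             # one-row sparse background
assert background.shape == (1, X_sparse.shape[1])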
def test_kernel_shap_with_a1a_sparse_nonzero_background():
    np.set_printoptions(threshold=100000)
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.utils.sparsefuncs import csc_median_axis_0
    import shap
    X, y = shap.datasets.a1a()  # pylint: disable=unbalanced-tuple-unpacking
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=0)
    linear_model = LinearRegression()
    linear_model.fit(x_train, y_train)
    # Calculate median of background data
    median_dense = csc_median_axis_0(x_train.tocsc())
    median = sp.sparse.csr_matrix(median_dense)
    explainer = shap.KernelExplainer(linear_model.predict, median)
    shap_values = explainer.shap_values(x_test)
    # Compare to dense results
    x_train_dense = x_train.toarray()

    def dense_to_sparse_predict(data):
        sparse_data = sp.sparse.csr_matrix(data)
        return linear_model.predict(sparse_data)

    explainer_dense = shap.KernelExplainer(
        linear_model.predict, median_dense.reshape((1, len(median_dense))))
    x_test_dense = x_test.toarray()
    shap_values_dense = explainer_dense.shap_values(x_test_dense)
    # Validate that the sparse and dense results are the same
    # Note: the default tolerance is almost always fine, but about one run in
    # twenty fails with it, so the tolerance is loosened here relative to the default.
    assert np.allclose(shap_values, shap_values_dense, rtol=1e-02, atol=1e-04)
def _(x: Union[np.ndarray, spmatrix], **_) -> np.ndarray:  # unused
    # log 0 -> -inf -> masked out anyway, so we select only genes detected in
    # every cell; this is an exact replica of the computation in edgeR
    mask = np.array((x > 0).sum(0)).squeeze() == x.shape[0]
    tmp = x[:, mask]
    if issparse(tmp):
        tmp = tmp.A
    gm = np.array(np.exp(np.mean(np.log(tmp), axis=0))).squeeze()
    gm_mask = (gm > 0) & np.isfinite(gm)

    if not issparse(x):
        return np.median(x[:, mask][:, gm_mask] / gm[gm_mask], axis=1)

    return csc_median_axis_0(
        x.tocsr()[:, mask][:, gm_mask].multiply(1.0 / gm[gm_mask]).tocsr().T)
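# Worked dense-branch sketch of the size-factor computation above: the median
# of each cell's ratios to the gene-wise geometric mean, computed only over
# genes detected in every cell. Toy numbers, for illustration only.
import numpy as np

x = np.array([[1., 2., 4.],
              [2., 4., 8.],
              [4., 8., 16.]])
mask = (x > 0).all(axis=0)                     # genes detected in every cell
gm = np.exp(np.log(x[:, mask]).mean(axis=0))   # gene-wise geometric mean
size_factors = np.median(x[:, mask] / gm, axis=1)
print(size_factors)                            # -> [0.5 1.  2. ]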
def fit(self, X, y):
    """
    Fit the NearestCentroid model according to the given training data.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
        Note that centroid shrinking cannot be used with sparse matrices.
    y : array, shape = [n_samples]
        Target values (integers)
    """
    if self.metric == 'precomputed':
        raise ValueError("Precomputed is not supported.")
    # If X is sparse and the metric is "manhattan", store it in csc
    # format to make it easier to calculate the median.
    if self.metric == 'manhattan':
        X, y = check_X_y(X, y, ['csc'])
    else:
        X, y = check_X_y(X, y, ['csr', 'csc'])
    is_X_sparse = sp.issparse(X)
    if is_X_sparse and self.shrink_threshold:
        raise ValueError("threshold shrinking not supported"
                         " for sparse input")
    check_classification_targets(y)

    n_samples, n_features = X.shape
    le = LabelEncoder()
    y_ind = le.fit_transform(y)
    self.classes_ = classes = le.classes_
    n_classes = classes.size
    if n_classes < 2:
        raise ValueError('The number of classes has to be greater than'
                         ' one; got %d class' % (n_classes))

    # Mask mapping each class to its members.
    self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)
    # Number of clusters in each class.
    nk = np.zeros(n_classes)

    for cur_class in range(n_classes):
        center_mask = y_ind == cur_class
        nk[cur_class] = np.sum(center_mask)
        if is_X_sparse:
            center_mask = np.where(center_mask)[0]

        # XXX: Update other averaging methods according to the metrics.
        if self.metric == "manhattan":
            # NumPy does not calculate median of sparse matrices.
            if not is_X_sparse:
                self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
            else:
                self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])
        else:
            if self.metric != 'euclidean':
                warnings.warn("Averaging for metrics other than "
                              "euclidean and manhattan not supported. "
                              "The average is set to be the mean.")
            self.centroids_[cur_class] = X[center_mask].mean(axis=0)

    if self.shrink_threshold:
        dataset_centroid_ = np.mean(X, axis=0)

        # m parameter for determining deviation
        m = np.sqrt((1. / nk) - (1. / n_samples))
        # Calculate deviation using the standard deviation of centroids.
        variance = (X - self.centroids_[y_ind]) ** 2
        variance = variance.sum(axis=0)
        s = np.sqrt(variance / (n_samples - n_classes))
        s += np.median(s)  # To deter outliers from affecting the results.
        mm = m.reshape(len(m), 1)  # Reshape to allow broadcasting.
        ms = mm * s
        deviation = ((self.centroids_ - dataset_centroid_) / ms)
        # Soft thresholding: if the deviation crosses 0 during shrinking,
        # it becomes zero.
        signs = np.sign(deviation)
        deviation = (np.abs(deviation) - self.shrink_threshold)
        np.clip(deviation, 0, None, out=deviation)
        deviation *= signs
        # Now adjust the centroids using the deviation
        msd = ms * deviation
        self.centroids_ = dataset_centroid_[np.newaxis, :] + msd
    return self
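# Usage sketch for the manhattan / sparse path of fit() above, assuming a
# scikit-learn version where NearestCentroid still accepts metric='manhattan'
# (as the code above does). Each class centroid is the per-feature median,
# which is why sparse input is converted to csc.
import numpy as np
import scipy.sparse as sp
from sklearn.neighbors import NearestCentroid

X = sp.csc_matrix(np.array([[0., 1.], [1., 1.], [4., 5.], [5., 5.]]))
y = np.array([0, 0, 1, 1])

clf = NearestCentroid(metric='manhattan')
clf.fit(X, y)
print(clf.centroids_)                          # per-class medians: [[0.5 1. ] [4.5 5. ]]
print(clf.predict(sp.csr_matrix([[0., 0.]])))  # -> [0]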
def get_ranked_phrases(nlp,
                       raw_documents,
                       timestamps=None,
                       *,
                       include_verb_phrases=False,
                       minlen=1,
                       maxlen=8,
                       n_jobs=_default_n_jobs,
                       batch_size=_default_batch_size,
                       stop_phrases=[],
                       vectorizer='bngram',
                       aggfunc='sum',
                       **vectorizer_kws):
    """
    Get phrases ranked by either TF-IDF (importance) score or BNgram (novelty) score.

    Parameters
    ----------
    nlp : spacy.language.Language
        Spacy language model.
    raw_documents : Iterable[str]
        An iterable which yields str objects.
    timestamps : Iterable[datetime]
        Timestamps of the documents. An iterable which yields datetime objects.
        Only used when `vectorizer='bngram'`.
    include_verb_phrases : bool, default=False
        Whether to also include verb phrases.
    minlen : int, default=1
        Minimum length of extracted multi-word phrases. Used for tokenizing the text.
    maxlen : int, default=8
        Maximum length of extracted multi-word phrases. Used for tokenizing the text.
    n_jobs : int, default=-1
        Number of processes used to extract noun phrases from documents in parallel.

        * -1: Use one process per available CPU core
        * >0: Use `n_jobs` processes
    batch_size : int, default=1000
        Batch size for tokenizing, tagging and extracting noun phrases.
        Use smaller batch sizes for a large number of large texts, and vice versa.
    stop_phrases : List[str], default=[]
        List of phrases to remove.
    vectorizer : str, default='bngram'
        One of ('bngram', 'tfidf').
    aggfunc : Union[str, callable, NoneType], default='sum'
        Function used to aggregate the per-document scores of a phrase for ranking.
        One of ('sum', 'mean', 'max', 'median', 'median_ignore_0',
        a callable that accepts a sparse matrix, None).
        If None, this function returns the vectorized documents and the
        vectorizer directly.
    vectorizer_kws : dict
        Keyword arguments for TfidfVectorizer.

    Returns
    -------
    ranked_phrases : Union[pandas.DataFrame, Tuple[array[N, M], vectorizer]]
        If `aggfunc` is not None, returns a dataframe with the extracted
        n-grams / phrases sorted descending by the aggregated BNgram / TF-IDF
        scores; otherwise returns the vectorized documents (where
        N=len(raw_documents) and M=len(phrases)) and the vectorizer object.
    """
    assert vectorizer in ('bngram', 'tfidf')
    stop_phrases = set(stop_phrases)

    # get candidate phrases
    nlp.add_pipe(
        NounPhraseMatcher(lowercase=True,
                          lemmatize=True,
                          include_verb_phrases=include_verb_phrases,
                          minlen=minlen,
                          maxlen=maxlen))

    # extract phrases
    def process_chunk(texts):
        return list(nlp.pipe(texts))

    logger.info('Tokenizing, tagging and extracting noun phrases '
                'per documents with spacy')
    n_jobs = psutil.cpu_count(logical=False) if n_jobs == -1 else n_jobs
    raw_documents = list(
        nlp.pipe(raw_documents, batch_size=batch_size, n_process=n_jobs))

    # vectorize the texts
    if 'norm' in vectorizer_kws and aggfunc is not None:
        warnings.warn(
            "'vectorizer_kws' should not contain 'norm'. "
            "'vectorizer_kws['norm']' will be replaced.", UserWarning)
    vectorizer_kws['norm'] = None
    if 'analyzer' in vectorizer_kws:
        warnings.warn(
            "'vectorizer_kws' should not contain 'analyzer'. "
            "'vectorizer_kws['analyzer']' will be replaced.", UserWarning)
    vectorizer_kws['analyzer'] = lambda doc: [
        p for p in doc._.noun_phrases if p not in stop_phrases
    ]

    if vectorizer == 'bngram':
        if timestamps is None:
            raise ValueError(
                'Parameter `timestamps` cannot be None if `vectorizer=bngram`.')
        vectorizer = BngramsVectorizer(**vectorizer_kws)
        logger.info('Vectorizing documents with BNgrams')
        X = vectorizer.fit_transform(raw_documents, timestamps)
    elif vectorizer == 'tfidf':
        vectorizer = TfidfVectorizer(**vectorizer_kws)
        logger.info('Vectorizing documents with TF-IDF')
        X = vectorizer.fit_transform(raw_documents)
    else:
        raise ValueError(f'Unknown vectorizer={vectorizer} given.')

    logger.info('Scoring phrases')
    if aggfunc == 'sum':
        scores = np.array(X.tocsc().sum(0))[0]
    elif aggfunc == 'mean':
        scores = np.array(X.tocsc().mean(0))[0]
    elif aggfunc == 'max':
        scores = min_max_axis(X.tocsc(), axis=0, ignore_nan=True)[1]
    elif aggfunc == 'median':
        scores = csc_median_axis_0(X.tocsc())
    elif aggfunc == 'median_ignore_0':
        scores = _get_median(X.tocsc(), 0)
    elif callable(aggfunc):
        scores = aggfunc(X.tocsc())
    elif aggfunc is None:
        return X, vectorizer
    else:
        raise ValueError(f'Unknown method: {aggfunc}')

    logger.info('Rank phrases based on score')
    ranked_phrases = pd.DataFrame(list(
        zip(vectorizer.get_feature_names(), scores)),
                                  columns=['phrase', 'score'])
    ranked_phrases = ranked_phrases\
        .sort_values('score', ascending=False)\
        .reset_index(drop=True)
    return ranked_phrases
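# Minimal sketch of the aggfunc='median' path above, using a plain
# TfidfVectorizer in place of the spacy pipeline and BngramsVectorizer
# (which are project-specific and omitted here).
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.sparsefuncs import csc_median_axis_0

docs = ["sparse median matrix", "sparse matrix", "median of a sparse matrix"]
vec = TfidfVectorizer(norm=None)
X = vec.fit_transform(docs)

scores = csc_median_axis_0(X.tocsc())  # per-term median score over documents
ranked = (pd.DataFrame({'phrase': vec.get_feature_names(),  # get_feature_names_out in newer scikit-learn
                        'score': scores})
          .sort_values('score', ascending=False)
          .reset_index(drop=True))
print(ranked)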