Example #1
def test_csc_row_median():
    # Test csc_row_median actually calculates the median.

    # Test that it gives the same output when X is dense.
    rng = np.random.RandomState(0)
    X = rng.rand(100, 50)
    dense_median = np.median(X, axis=0)
    csc = sp.csc_matrix(X)
    sparse_median = csc_median_axis_0(csc)
    assert_array_equal(sparse_median, dense_median)

    # Test that it gives the same output when X is sparse
    X = rng.rand(51, 100)
    X[X < 0.7] = 0.0
    ind = rng.randint(0, 50, 10)
    X[ind] = -X[ind]
    csc = sp.csc_matrix(X)
    dense_median = np.median(X, axis=0)
    sparse_median = csc_median_axis_0(csc)
    assert_array_equal(sparse_median, dense_median)

    # Test for toy data.
    X = [[0, -2], [-1, -1], [1, 0], [2, 1]]
    csc = sp.csc_matrix(X)
    assert_array_equal(csc_median_axis_0(csc), np.array([0.5, -0.5]))
    X = [[0, -2], [-1, -5], [1, -3]]
    csc = sp.csc_matrix(X)
    assert_array_equal(csc_median_axis_0(csc), np.array([0., -3]))

    # Test that it raises a TypeError for non-csc matrices.
    assert_raises(TypeError, csc_median_axis_0, sp.csr_matrix(X))
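
All of the examples on this page revolve around sklearn.utils.sparsefuncs.csc_median_axis_0, which returns the per-column medians of a CSC matrix, counting implicit zeros. A minimal standalone sketch of that behaviour, reusing the toy data from the test above (assumes NumPy, SciPy and scikit-learn are installed):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import csc_median_axis_0

# Toy matrix from the test above; zeros become implicit once converted to CSC.
X = np.array([[0.0, -2.0],
              [-1.0, -1.0],
              [1.0, 0.0],
              [2.0, 1.0]])

# One median per column, with implicit zeros treated like any other value.
sparse_median = csc_median_axis_0(sp.csc_matrix(X))
print(sparse_median)                                   # [ 0.5 -0.5]
assert np.array_equal(sparse_median, np.median(X, axis=0))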
Example #3
def test_kernel_shap_with_a1a_sparse_nonzero_background():
    np.set_printoptions(threshold=100000)
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.utils.sparsefuncs import csc_median_axis_0
    import shap
    np.random.seed(0)

    X, y = shap.datasets.a1a() # pylint: disable=unbalanced-tuple-unpacking
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=0)
    linear_model = LinearRegression()
    linear_model.fit(x_train, y_train)
    # Calculate median of background data
    median_dense = csc_median_axis_0(x_train.tocsc())
    median = sp.sparse.csr_matrix(median_dense)
    explainer = shap.KernelExplainer(linear_model.predict, median)
    shap_values = explainer.shap_values(x_test)

    def dense_to_sparse_predict(data):
        sparse_data = sp.sparse.csr_matrix(data)
        return linear_model.predict(sparse_data)

    explainer_dense = shap.KernelExplainer(dense_to_sparse_predict, median_dense.reshape((1, len(median_dense))))
    x_test_dense = x_test.toarray()
    shap_values_dense = explainer_dense.shap_values(x_test_dense)
    # Validate that the sparse and dense results are the same
    assert(np.allclose(shap_values, shap_values_dense, rtol=1e-02, atol=1e-01))
Example #4
def _summarize_data(X, k=10, to_round_values=True):
    """Summarize a dataset.

    For dense dataset, use k mean samples weighted by the number of data points they
    each represent.
    For sparse dataset, use a sparse row for the background with calculated
    median for dense columns.

    :param X: Matrix of data samples to summarize (# samples x # features).
    :type X: numpy.array or pandas.DataFrame or scipy.sparse.csr_matrix
    :param k: Number of cluster centroids to use for approximation.
    :type k: int
    :param to_round_values: When using kmeans, round each element of every cluster centroid to match the nearest value
        from X in the corresponding dimension. This ensures discrete features
        always get a valid value. Ignored for sparse data samples.
    :type to_round_values: bool
    :return: DenseData or SparseData object.
    :rtype: iml.datatypes.DenseData or iml.datatypes.SparseData
    """
    is_sparse = issparse(X)
    if not isinstance(X, DenseData):
        if is_sparse:
            module_logger.debug('Creating sparse data summary as csr matrix')
            # calculate median of sparse background data
            median_dense = csc_median_axis_0(X.tocsc())
            return csr_matrix(median_dense)
        elif len(X) > 10 * k:
            module_logger.debug('Create dense data summary with k-means')
            # use kmeans to summarize the examples for initialization
            # if there are more than 10 x k of them
            return shap.kmeans(X, k, to_round_values)
    return X
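
The docstring above describes two summarization paths: k-means centroids for dense data, and a single median row for sparse data. Below is a minimal sketch of just the sparse branch, with summarize_sparse_background as a hypothetical stand-in for the sparse path of _summarize_data (assumes SciPy and scikit-learn):

import scipy.sparse as sp
from sklearn.utils.sparsefuncs import csc_median_axis_0

def summarize_sparse_background(X):
    # Hypothetical helper: collapse an (n_samples x n_features) sparse matrix
    # into a single 1 x n_features CSR row of per-column medians.
    median_dense = csc_median_axis_0(X.tocsc())
    return sp.csr_matrix(median_dense)

X = sp.random(200, 30, density=0.1, format='csr', random_state=0)
background = summarize_sparse_background(X)
print(background.shape)  # (1, 30)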
Example #5
def test_kernel_shap_with_a1a_sparse_nonzero_background():
    np.set_printoptions(threshold=100000)
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.utils.sparsefuncs import csc_median_axis_0
    import shap

    X, y = shap.datasets.a1a()  # pylint: disable=unbalanced-tuple-unpacking
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.01,
                                                        random_state=0)
    linear_model = LinearRegression()
    linear_model.fit(x_train, y_train)
    # Calculate median of background data
    median_dense = csc_median_axis_0(x_train.tocsc())
    median = sp.sparse.csr_matrix(median_dense)
    explainer = shap.KernelExplainer(linear_model.predict, median)
    shap_values = explainer.shap_values(x_test)
    # Compare to dense results
    x_train_dense = x_train.toarray()

    def dense_to_sparse_predict(data):
        sparse_data = sp.sparse.csr_matrix(data)
        return linear_model.predict(sparse_data)

    explainer_dense = shap.KernelExplainer(
        linear_model.predict, median_dense.reshape((1, len(median_dense))))
    x_test_dense = x_test.toarray()
    shap_values_dense = explainer_dense.shap_values(x_test_dense)
    # Validate that the sparse and dense results are the same
    # Note: the default tolerance is almost always fine, but roughly one run in
    # twenty fails, so the tolerance is loosened by two orders of magnitude from the default
    assert (np.allclose(shap_values, shap_values_dense, rtol=1e-02,
                        atol=1e-04))
Example #6
def _(x: Union[np.ndarray, spmatrix], **_) -> np.ndarray:
    # unused

    # log 0 -> -inf -> masked out anyway, so we select only genes detected in every cell
    # this is an exact replica of the edgeR implementation
    mask = np.array((x > 0).sum(0)).squeeze() == x.shape[0]
    tmp = x[:, mask]
    if issparse(tmp):
        tmp = tmp.A

    gm = np.array(np.exp(np.mean(np.log(tmp), axis=0))).squeeze()
    gm_mask = (gm > 0) & np.isfinite(gm)

    if not issparse(x):
        return np.median(x[:, mask][:, gm_mask] / gm[gm_mask], axis=1)

    return csc_median_axis_0(x.tocsr()[:, mask][:, gm_mask].multiply(
        1.0 / gm[gm_mask]).tocsr().T)
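
The snippet above computes edgeR-style size factors: restrict to genes detected in every cell, divide each cell's counts by the gene-wise geometric mean, and take the per-cell median of those ratios (via csc_median_axis_0 on the transposed matrix in the sparse case). A dense worked example on a small made-up count matrix, under that reading:

import numpy as np

# Hypothetical 3 cells x 4 genes matrix; every gene is non-zero in every cell,
# so the mask in the snippet above would keep all columns.
x = np.array([[1.0, 2.0, 4.0, 8.0],
              [2.0, 4.0, 8.0, 16.0],
              [4.0, 8.0, 16.0, 32.0]])

# Gene-wise geometric mean across cells: [2, 4, 8, 16].
gm = np.exp(np.mean(np.log(x), axis=0))

# Per-cell factor: median over genes of count / geometric mean.
print(np.median(x / gm, axis=1))  # [0.5 1.  2. ]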
Example #7
    def fit(self, X, y):
        """
        Fit the NearestCentroid model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        """
        if self.metric == 'precomputed':
            raise ValueError("Precomputed is not supported.")
        # If X is sparse and the metric is "manhattan", store it in a CSC
        # format, since that makes it easier to calculate the median.
        if self.metric == 'manhattan':
            X, y = check_X_y(X, y, ['csc'])
        else:
            X, y = check_X_y(X, y, ['csr', 'csc'])
        is_X_sparse = sp.issparse(X)
        if is_X_sparse and self.shrink_threshold:
            raise ValueError("threshold shrinking not supported"
                             " for sparse input")
        check_classification_targets(y)

        n_samples, n_features = X.shape
        le = LabelEncoder()
        y_ind = le.fit_transform(y)
        self.classes_ = classes = le.classes_
        n_classes = classes.size
        if n_classes < 2:
            raise ValueError('The number of classes has to be greater than'
                             ' one; got %d class' % (n_classes))

        # Mask mapping each class to its members.
        self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)
        # Number of clusters in each class.
        nk = np.zeros(n_classes)

        for cur_class in range(n_classes):
            center_mask = y_ind == cur_class
            nk[cur_class] = np.sum(center_mask)
            if is_X_sparse:
                center_mask = np.where(center_mask)[0]

            # XXX: Update other averaging methods according to the metrics.
            if self.metric == "manhattan":
                # NumPy does not calculate median of sparse matrices.
                if not is_X_sparse:
                    self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
                else:
                    self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])
            else:
                if self.metric != 'euclidean':
                    warnings.warn("Averaging for metrics other than "
                                  "euclidean and manhattan not supported. "
                                  "The average is set to be the mean."
                                  )
                self.centroids_[cur_class] = X[center_mask].mean(axis=0)

        if self.shrink_threshold:
            dataset_centroid_ = np.mean(X, axis=0)

            # m parameter for determining deviation
            m = np.sqrt((1. / nk) - (1. / n_samples))
            # Calculate deviation using the standard deviation of centroids.
            variance = (X - self.centroids_[y_ind]) ** 2
            variance = variance.sum(axis=0)
            s = np.sqrt(variance / (n_samples - n_classes))
            s += np.median(s)  # To deter outliers from affecting the results.
            mm = m.reshape(len(m), 1)  # Reshape to allow broadcasting.
            ms = mm * s
            deviation = ((self.centroids_ - dataset_centroid_) / ms)
            # Soft thresholding: if the deviation crosses 0 during shrinking,
            # it becomes zero.
            signs = np.sign(deviation)
            deviation = (np.abs(deviation) - self.shrink_threshold)
            np.clip(deviation, 0, None, out=deviation)
            deviation *= signs
            # Now adjust the centroids using the deviation
            msd = ms * deviation
            self.centroids_ = dataset_centroid_[np.newaxis, :] + msd
        return self
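
A short usage sketch of the fit method above, not part of the original snippet: with metric='manhattan' each class centroid is the feature-wise median, and sparse input is routed through the csc_median_axis_0 branch (shrink_threshold must stay unset for sparse data):

import numpy as np
import scipy.sparse as sp
from sklearn.neighbors import NearestCentroid

X = np.array([[0.0, -2.0],
              [-1.0, -1.0],
              [1.0, 0.0],
              [2.0, 1.0]])
y = np.array([0, 0, 1, 1])

# Dense fit: manhattan centroids are per-class, per-feature medians.
clf = NearestCentroid(metric='manhattan').fit(X, y)
print(clf.centroids_)  # [[-0.5 -1.5], [ 1.5  0.5]]

# Sparse fit takes the csc_median_axis_0 branch shown above.
clf_sparse = NearestCentroid(metric='manhattan').fit(sp.csr_matrix(X), y)
print(clf_sparse.centroids_)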
Example #8
def get_ranked_phrases(nlp,
                       raw_documents,
                       timestamps=None,
                       *,
                       include_verb_phrases=False,
                       minlen=1,
                       maxlen=8,
                       n_jobs=_default_n_jobs,
                       batch_size=_default_batch_size,
                       stop_phrases=[],
                       vectorizer='bngram',
                       aggfunc='sum',
                       **vectorizer_kws):
    """
    Get phrases ranked by either TF-IDF (importance) score or BNgram (novelty) score.

    Parameters
    ----------
    nlp : spacy.language.Language
        Spacy language model

    raw_documents : Iterable[str]
        An iterable which yields str objects.

    timestamps : Iterable[datetime.datetime]
        Timestamps of the documents. An iterable which
        yields datetime objects. Only used when
        `vectorizer='bngram'`.

    include_verb_phrases : bool, default=False
        Indicator to include verb phrases also.

    minlen : int, default=1
        Minimum length of extracted multi-word phrases.
        Used for tokenizing the text.

    maxlen : int, default=8
        Maximum length of extracted multi-word phrases.
        Used for tokenizing the text.

    n_jobs : int, default=-1
        Number of processes to get noun phrases in parallel
        from documents.
            * -1: Use one process per available CPU core
            * >0: Use `n_jobs` processes

    batch_size : int, default=1000
        Batch size for tokenizing, tagging and extracting
        noun phrases. Use smaller batch sizes on large
        number of large texts and vice-versa.

    stop_phrases : List[str], default=[]
        List of phrases to remove.

    vectorizer : str, default='bngram'
        One of ('bngram', 'tfidf').

    aggfunc : Union[str, callable, NoneType], default='sum'
        Function to aggregate over the scores per document
        for a single phrase to rank. One of ('sum', 'mean',
        'max', 'median', 'median_ignore_0', callable that
        accepts sparse matrix, None). If None, this function
        will return the vectorized documents and the vectorizer
        directly.

    vectorizer_kws : dict
        Keyword arguments for TfidfVectorizer

    Returns
    -------
    ranked_phrases : Union[pandas.DataFrame, Tuple[array[N, M], vectorizer]]
        If aggfunc is not None, returns a dataframe with the extracted
        n-grams / phrases sorted in descending order by the aggregated bngram /
        tf-idf scores; else returns the vectorized documents (where
        N=len(raw_documents) and M=len(phrases)) and the vectorizer object.
    """
    assert vectorizer in ('bngram', 'tfidf')
    stop_phrases = set(stop_phrases)

    # get candidate phrases
    nlp.add_pipe(
        NounPhraseMatcher(lowercase=True,
                          lemmatize=True,
                          include_verb_phrases=include_verb_phrases,
                          minlen=minlen,
                          maxlen=maxlen))

    # extract phrases
    def process_chunk(texts):
        return list(nlp.pipe(texts))

    logger.info('Tokenizing, tagging and extracting noun phrases '
                'per documents with spacy')
    n_jobs = psutil.cpu_count(logical=False)\
        if n_jobs == -1 else n_jobs
    raw_documents = list(
        nlp.pipe(raw_documents, batch_size=batch_size, n_process=n_jobs))

    # vectorize the texts
    if 'norm' in vectorizer_kws and aggfunc is not None:
        warnings.warn(
            "'vectorizer_kws' should not contain 'norm'. "
            "'vectorizer_kws['norm']' will be replaced.", UserWarning)
        vectorizer_kws['norm'] = None
    if 'analyzer' in vectorizer_kws:
        warnings.warn(
            "'vectorizer_kws' should not contain 'analyzer'. "
            "'vectorizer_kws['analyzer']' will be replaced.", UserWarning)
    vectorizer_kws['analyzer'] = lambda doc: [
        p for p in doc._.noun_phrases if p not in stop_phrases
    ]
    if vectorizer == 'bngram':
        if timestamps is None:
            raise ValueError(
                'Parameter `timestamps` cannot be None if `vectorizer=bngram`.'
            )
        vectorizer = BngramsVectorizer(**vectorizer_kws)
        logger.info('Vectorizing documents with BNgrams')
        X = vectorizer.fit_transform(raw_documents, timestamps)
    elif vectorizer == 'tfidf':
        vectorizer = TfidfVectorizer(**vectorizer_kws)
        logger.info('Vectorizing documents with TF-IDF')
        X = vectorizer.fit_transform(raw_documents)
    else:
        raise ValueError(f'Unknown vectorizer={vectorizer} given.')

    logger.info('Scoring phrases')
    if aggfunc == 'sum':
        scores = np.array(X.tocsc().sum(0))[0]
    elif aggfunc == 'mean':
        scores = np.array(X.tocsc().mean(0))[0]
    elif aggfunc == 'max':
        scores = min_max_axis(X.tocsc(), axis=0, ignore_nan=True)[1]
    elif aggfunc == 'median':
        scores = csc_median_axis_0(X.tocsc())
    elif aggfunc == 'median_ignore_0':
        scores = _get_median(X.tocsc(), 0)
    elif callable(aggfunc):
        scores = aggfunc(X.tocsc())
    elif aggfunc is None:
        return X, vectorizer
    else:
        raise ValueError(f'Unknown method: {aggfunc}')

    logger.info('Rank phrases based on score')
    ranked_phrases = pd.DataFrame(list(
        zip(vectorizer.get_feature_names(), scores)),
                                  columns=['phrase', 'score'])
    ranked_phrases = ranked_phrases\
        .sort_values('score', ascending=False)\
        .reset_index(drop=True)

    return ranked_phrases
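
The aggfunc branches above reduce the documents x phrases matrix X to a single score per phrase before ranking. A small isolated sketch of the 'median' and 'max' branches on a made-up score matrix (not tied to the vectorizers above):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import csc_median_axis_0, min_max_axis

# Hypothetical 3 documents x 2 phrases score matrix.
X = sp.csr_matrix(np.array([[0.0, 0.3],
                            [0.5, 0.0],
                            [0.2, 0.9]]))

# 'median' branch: one median per phrase (column), zeros included.
print(csc_median_axis_0(X.tocsc()))                          # [0.2 0.3]

# 'max' branch: min_max_axis returns (mins, maxs); keep the maxs.
print(min_max_axis(X.tocsc(), axis=0, ignore_nan=True)[1])   # [0.5 0.9]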