Example #1
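The snippets on this page use min_max_axis, which computes the per-axis minima and maxima of a sparse matrix in a single pass. They are shown without their enclosing modules; a minimal set of imports that should make them runnable, assuming the names resolve as in the scikit-learn test suite:

import numpy as np
import scipy.sparse as sp
import pytest
from numpy.testing import assert_array_equal
from sklearn.utils.sparsefuncs import min_max_axis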
def test_min_max_axis1():
    X = np.array([[0, 3, 0],
                  [2, -1, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float64)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)

    mins_csr, maxs_csr = min_max_axis(X_csr, axis=1)
    assert_array_equal(mins_csr, X.min(axis=1))
    assert_array_equal(maxs_csr, X.max(axis=1))

    mins_csc, maxs_csc = min_max_axis(X_csc, axis=1)
    assert_array_equal(mins_csc, X.min(axis=1))
    assert_array_equal(maxs_csc, X.max(axis=1))

    X = X.astype(np.float32)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)
    mins_csr, maxs_csr = min_max_axis(X_csr, axis=1)
    assert_array_equal(mins_csr, X.min(axis=1))
    assert_array_equal(maxs_csr, X.max(axis=1))
    mins_csc, maxs_csc = min_max_axis(X_csc, axis=1)
    assert_array_equal(mins_csc, X.min(axis=1))
    assert_array_equal(maxs_csc, X.max(axis=1))
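As a quick sanity check, the row-wise extrema for this X can be written out by hand (a sketch; array reprs abbreviated):

mins, maxs = min_max_axis(sp.csr_matrix(X), axis=1)
# mins -> [ 0., -1.,  0.,  7.,  0.]
# maxs -> [ 3.,  2.,  0.,  9.,  5.]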
Example #3
def test_min_max(
    dtype,
    axis,
    sparse_format,
    missing_values,
    min_func,
    max_func,
    ignore_nan,
    large_indices,
):
    X = np.array(
        [
            [0, 3, 0],
            [2, -1, missing_values],
            [0, 0, 0],
            [9, missing_values, 7],
            [4, 0, 5],
        ],
        dtype=dtype,
    )
    X_sparse = sparse_format(X)
    if large_indices:
        X_sparse.indices = X_sparse.indices.astype("int64")
        X_sparse.indptr = X_sparse.indptr.astype("int64")

    mins_sparse, maxs_sparse = min_max_axis(X_sparse,
                                            axis=axis,
                                            ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
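The decorators that drive this parametrized test are not part of the snippet. A plausible reconstruction, inferred from how the test body pairs missing_values with min_func/max_func (the concrete values below are an assumption that mirrors that pairing):

@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("sparse_format", [sp.csr_matrix, sp.csc_matrix])
@pytest.mark.parametrize(
    "missing_values, min_func, max_func, ignore_nan",
    [(0, np.min, np.max, False),
     (np.nan, np.nanmin, np.nanmax, True)],
)
@pytest.mark.parametrize("large_indices", [True, False])
def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
                 max_func, ignore_nan, large_indices):
    ...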
Example #4
def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
                 max_func, ignore_nan):
    X = np.array([[0, 3, 0], [2, -1, missing_values], [0, 0, 0],
                  [9, missing_values, 7], [4, 0, 5]],
                 dtype=dtype)
    X_sparse = sparse_format(X)

    mins_sparse, maxs_sparse = min_max_axis(X_sparse,
                                            axis=axis,
                                            ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
Example #5
def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
                 max_func, ignore_nan):
    X = np.array([[0, 3, 0],
                  [2, -1, missing_values],
                  [0, 0, 0],
                  [9, missing_values, 7],
                  [4, 0, 5]], dtype=dtype)
    X_sparse = sparse_format(X)

    mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
                                            ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
Example #6
def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
                 max_func, ignore_nan, large_indices):
    X = np.array([[0, 3, 0],
                  [2, -1, missing_values],
                  [0, 0, 0],
                  [9, missing_values, 7],
                  [4, 0, 5]], dtype=dtype)
    X_sparse = sparse_format(X)
    if large_indices:
        X_sparse.indices = X_sparse.indices.astype('int64')
        X_sparse.indptr = X_sparse.indptr.astype('int64')

    mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
                                            ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
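The large_indices branch exists because scipy builds small sparse matrices with int32 index arrays by default; casting indices and indptr to int64 by hand presumably forces min_max_axis down its 64-bit index code path, which real data would only hit once a matrix outgrows int32 indexing.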
Example #7
def test_min_max_axis_errors():
    X = np.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]],
                 dtype=np.float64)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)
    with pytest.raises(TypeError):
        min_max_axis(X_csr.tolil(), axis=0)
    with pytest.raises(ValueError):
        min_max_axis(X_csr, axis=2)
    with pytest.raises(ValueError):
        min_max_axis(X_csc, axis=-3)
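min_max_axis rejects sparse formats other than CSR/CSC (the TypeError for the LIL matrix) and axes outside the valid range (the ValueErrors for axis=2 and axis=-3). Other sparse formats need an explicit conversion first, e.g. (X_lil being a hypothetical LIL matrix):

mins, maxs = min_max_axis(X_lil.tocsr(), axis=0)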
Example #8
def _inverse_binarize_multiclass(y, classes):
    """Inverse label binarization transformation for multiclass.

    Multiclass uses the maximal score instead of a threshold.
    """
    classes = np.asarray(classes)

    if sp.issparse(y):
        # Find the argmax for each row in y where y is a CSR matrix

        y = y.tocsr()
        n_samples, n_outputs = y.shape
        outputs = np.arange(n_outputs)
        row_max = min_max_axis(y, 1)[1]
        row_nnz = np.diff(y.indptr)

        y_data_repeated_max = np.repeat(row_max, row_nnz)
        # picks out all indices obtaining the maximum per row
        y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)

        # For corner case where last row has a max of 0
        if row_max[-1] == 0:
            y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])

        # Gets the index of the first argmax in each row from y_i_all_argmax
        index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
        # first argmax of each row
        y_ind_ext = np.append(y.indices, [0])
        y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
        # Handle rows of all 0
        y_i_argmax[np.where(row_nnz == 0)[0]] = 0

        # Handles rows with max of 0 that contain negative numbers
        samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
        for i in samples:
            ind = y.indices[y.indptr[i]:y.indptr[i + 1]]
            y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]

        return classes[y_i_argmax]
    else:
        return classes.take(y.argmax(axis=1), mode="clip")
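_inverse_binarize_multiclass is a private helper; a minimal round-trip sketch of what it undoes, assuming the modern scikit-learn private module path sklearn.preprocessing._label:

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing._label import _inverse_binarize_multiclass

y = np.array([1, 6, 4, 2])
Y = sp.csr_matrix(label_binarize(y, classes=[1, 2, 4, 6]))  # one indicator column per class
recovered = _inverse_binarize_multiclass(Y, classes=[1, 2, 4, 6])
# recovered -> array([1, 6, 4, 2])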
Example #9
    def fit(self, X):
        """
        Fit the Normalizer to the data, computing the norms used for scaling.
        :param X: array-like or sparse matrix, shape [n_samples, n_features]
        :return: None
        """
        if self.norm not in ('l1', 'l2', 'max'):
            raise ValueError("'%s' is not a supported norm" % self.norm)

        if self.axis == 0:
            self.sparse_format = 'csc'
        elif self.axis == 1:
            self.sparse_format = 'csr'
        else:
            raise ValueError("'%d' is not a supported axis" % self.axis)

        X = check_array(X,
                        self.sparse_format,
                        copy=self.copy,
                        estimator='the normalize function',
                        dtype=FLOAT_DTYPES)
        if self.axis == 0:
            X = X.T

        if sparse.issparse(X):
            if self.norm == 'l1':
                inplace_csr_row_normalize_l1(X)
            elif self.norm == 'l2':
                inplace_csr_row_normalize_l2(X)
            elif self.norm == 'max':
                _, self.norms = min_max_axis(X, 1)
        else:
            if self.norm == 'l1':
                self.norms = np.abs(X).sum(axis=1)
            elif self.norm == 'l2':
                self.norms = row_norms(X)
            elif self.norm == 'max':
                self.norms = np.max(X, axis=1)
            self.norms = _handle_zeros_in_scale(self.norms, copy=False)
Example #10
def normalize(X, norm='l2', axis=1, copy=True, return_norm=False, shrink=0):
    """Scale input vectors individually to unit norm (vector length).

    Read more in the :ref:`User Guide <preprocessing_normalization>`.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        unnecessary copy.

    norm : 'l1', 'l2', or 'max', optional ('l2' by default)
        The norm to use to normalize each non zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    return_norm : boolean, default False
        whether to return the computed norms

    shrink : float, optional (0 by default)
        Shrinkage value forwarded to ``inplace_csr_row_normalize_l2`` when
        normalizing sparse input with norm 'l2'; an extension over the stock
        scikit-learn signature (see its use below).

    Returns
    -------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        Normalized input X.

    norms : array, shape [n_samples] if axis=1 else [n_features]
        An array of norms along given axis for X.
        When X is sparse and ``return_norm=True``, a NotImplementedError
        is raised for norm 'l1' or 'l2'.

    See also
    --------
    Normalizer: Performs normalization using the ``Transformer`` API
        (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`).

    Notes
    -----
    For a comparison of the different scalers, transformers, and normalizers,
    see :ref:`examples/preprocessing/plot_all_scaling.py
    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.

    """
    if norm not in ('l1', 'l2', 'max'):
        raise ValueError("'%s' is not a supported norm" % norm)

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % axis)

    X = check_array(X,
                    sparse_format,
                    copy=copy,
                    estimator='the normalize function',
                    dtype=FLOAT_DTYPES)
    if axis == 0:
        X = X.T

    if sparse.issparse(X):
        if return_norm and norm in ('l1', 'l2'):
            raise NotImplementedError("return_norm=True is not implemented "
                                      "for sparse matrices with norm 'l1' "
                                      "or norm 'l2'")
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X, shrink)
        elif norm == 'max':
            _, norms = min_max_axis(X, 1)
            norms_elementwise = norms.repeat(np.diff(X.indptr))
            mask = norms_elementwise != 0
            X.data[mask] /= norms_elementwise[mask]
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
        elif norm == 'l2':
            norms = row_norms(X)
        elif norm == 'max':
            norms = np.max(X, axis=1)
        norms = _handle_zeros_in_scale(norms, copy=False)
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    if return_norm:
        return X, norms
    else:
        return X
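A usage sketch for the sparse 'max' branch, which is the path that relies on min_max_axis (illustrative; it assumes the helpers this normalize imports, such as check_array, are in scope):

import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[4., 1., -2.],
                            [0., 0., 0.],
                            [3., 6., 0.]]))
Xn = normalize(X, norm='max', axis=1)
# Each row with a non-zero maximum is divided by that maximum:
# row 0 -> [1., 0.25, -0.5], row 1 is untouched, row 2 -> [0.5, 1., 0.]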
Example #11
def csr_summ(x):
    mean, var = sparsefuncs.mean_variance_axis(x, 0)
    min_val, max_val = sparsefuncs.min_max_axis(x, 0)
    return np.hstack(
        [mean + 0.005, var + 0.005, min_val + 0.005, max_val + 0.005])
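csr_summ flattens per-column mean, variance, min and max into a single feature vector, each nudged by +0.005 (presumably to keep downstream code away from exact zeros). A quick sketch, assuming sparsefuncs is sklearn.utils.sparsefuncs:

import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

x = sp.csr_matrix(np.array([[1., 0.],
                            [3., 2.]]))
summary = csr_summ(x)  # shape (8,): [means, variances, mins, maxs], each + 0.005
# before the offset: means [2., 1.], variances [1., 1.], mins [1., 0.], maxs [3., 2.]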
Example #12
def CorrelationThreshold(X, threshold, kind):
    """Learn empirical variances from X.
    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training set to compute correlations.
    y : ignored
        Not used, present here for API consistency by convention.
    Returns
    -------
    support_mask : Boolean array for feature selection
    """
    
    if not (0.0 <= threshold <= 1.0):
        raise BFE.from_errors([{'0100': 'Threshold value must be in [0.0, 1.0]'}])

    if kind not in ('pearson', 'spearmanr'):
        raise BFE.from_errors([{'0100': "Kind must be 'pearson' or 'spearmanr'"}])

    if issparse(X) and kind != 'pearson':
        raise BFE.from_errors([{'0100': "Only 'pearson' correlation is supported with sparse matrices"}])

    X = check_array(X, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32])
    
    n_features = X.shape[1]
    if threshold == 1 or (1 in X.shape):
        support_mask = np.ones(n_features, dtype=bool)  # np.bool was removed from NumPy
        return support_mask
    
    # get constant features
    if issparse(X):
        mins, maxes = min_max_axis(X, axis=0)
        peak_to_peaks = maxes - mins
        constant_mask = np.isclose(peak_to_peaks, 0.0)
        
        # sparse correlation
        mu, sparse_var = mean_variance_axis(X, 0)
        X_corr = sparse_correlation(X, mu, ~constant_mask)
    else:
        peak_to_peaks = np.ptp(X, axis=0)
        constant_mask = np.isclose(peak_to_peaks, 0.0)
        
        if kind == 'pearson':
            X_corr = np.corrcoef(X, rowvar=False)
        else: # spearmanr
            X_corr, _ = spearmanr(X)
            # spearmanr returns a scalar when comparing exactly two columns
            if isinstance(X_corr, float):
                X_corr = np.array([[1, X_corr], [X_corr, 1]])
    
    np.fabs(X_corr, out=X_corr)
    
    # Removes constant features from support_mask
    support_mask = np.ones(n_features, dtype=bool)
    upper_idx = np.triu_indices(n_features, 1)
    
    non_constant_features = n_features
    for i in np.flatnonzero(constant_mask):
        feat_remove_mask = np.logical_and(upper_idx[0] != i,
                                          upper_idx[1] != i)
        upper_idx = (upper_idx[0][feat_remove_mask],
                     upper_idx[1][feat_remove_mask])
        support_mask[i] = False
        non_constant_features -= 1
    
    for _ in range(non_constant_features - 1):
        max_idx = np.argmax(X_corr[upper_idx])
        feat1, feat2 = upper_idx[0][max_idx], upper_idx[1][max_idx]
        cur_corr = X_corr[feat1, feat2]
        
        # max correlation is lower than threshold
        if cur_corr < threshold:
            break
        
        # Temporarily remove both features to compute their mean correlation
        # with the remaining features; one of the two will be re-selected.
        support_mask[[feat1, feat2]] = False
        
        # If there are no other features left to compare against, keep the
        # feature with the larger variance of the two.
        if np.all(~support_mask):
            if issparse(X):
                # sparse precalculates variance for all features
                var = sparse_var[[feat1, feat2]]
            else:
                var = np.var(X[:, [feat1, feat2]], axis=0)

            if var[0] < var[1]:
                support_mask[feat2] = True
            else:
                support_mask[feat1] = True
            break
            
        # mean with other features
        feat1_mean = np.mean(X_corr[feat1, support_mask])
        feat2_mean = np.mean(X_corr[feat2, support_mask])
        
        # The feature with the lower mean correlation is kept.
        if feat1_mean < feat2_mean:
            support_mask[feat1] = True
            feat_to_remove = feat2
        else:
            support_mask[feat2] = True
            feat_to_remove = feat1
        
        # remove the removed feature from consideration
        upper_idx_to_keep = np.logical_and(upper_idx[0] != feat_to_remove,
                                           upper_idx[1] != feat_to_remove)
        upper_idx = (upper_idx[0][upper_idx_to_keep],
                     upper_idx[1][upper_idx_to_keep])

    return support_mask
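A dense usage sketch for the 'pearson' path. The function references several helpers (check_array, min_max_axis, mean_variance_axis from scikit-learn, issparse from scipy, spearmanr from scipy.stats, plus the project-specific BFE and sparse_correlation); the dense happy path below only touches the scikit-learn/scipy ones:

import numpy as np

rng = np.random.RandomState(0)
a = rng.rand(100)
X = np.column_stack([a,                             # feature 0
                     2 * a + 0.01 * rng.rand(100),  # feature 1, nearly collinear with 0
                     rng.rand(100)])                # feature 2, independent
mask = CorrelationThreshold(X, threshold=0.9, kind='pearson')
# One of the two collinear features is dropped, e.g. mask -> [ True, False, True]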
Example #13
def get_ranked_phrases(nlp,
                       raw_documents,
                       timestamps=None,
                       *,
                       include_verb_phrases=False,
                       minlen=1,
                       maxlen=8,
                       n_jobs=_default_n_jobs,
                       batch_size=_default_batch_size,
                       stop_phrases=[],
                       vectorizer='bngram',
                       aggfunc='sum',
                       **vectorizer_kws):
    """
    Get phrases ranked by either TF-IDF (importance) score or BNgram (novelty) score.

    Parameters
    ----------
    nlp : spacy.language.Language
        Spacy language model

    raw_documents : Iterable[str]
        An iterable which yields str objects.

    timestamps : Iterable[datetime]
        Timestamps of the documents. An iterable which
        yields datetime objects. Only used when
        `vectorizer='bngram'`.

    include_verb_phrases : bool, default=False
        Indicator to include verb phrases also.

    minlen : int, default=1
        Minimum length of extracted multi-word phrases.
        Used for tokenizing the text.

    maxlen : int, default=8
        Maximum length of extracted multi-word phrases.
        Used for tokenizing the text.

    n_jobs : int, default=-1
        Number of processes to get noun phrases in parallel
        from documents.
            * -1: Use one process per physical CPU core
            * >0: Use `n_jobs` processes

    batch_size : int, default=1000
        Batch size for tokenizing, tagging and extracting
        noun phrases. Use smaller batch sizes on large
        number of large texts and vice-versa.

    stop_phrases : List[str], default=[]
        List of phrases to remove.

    vectorizer : str, default='bngram'
        One of ('bngram', 'tfidf').

    aggfunc : Union[str, callable, NoneType], default='sum'
        Function to aggregate over the scores per document
        for a single phrase to rank. One of ('sum', 'mean',
        'max', 'median', 'median_ignore_0', callable that
        accepts sparse matrix, None). If None, this function
        will return the vectorized documents and the vectorizer
        directly.

    vectorizer_kws : dict
        Keyword arguments for TfidfVectorizer

    Returns
    -------
    ranked_phrases : Union[pandas.DataFrame, Tuple[array[N, M], vectorizer]]
        If aggfunc is not None, returns a dataframe with the extracted
        n-grams / phrases sorted descending by the aggregated bngram /
        tf-idf scores; else returns the vectorized documents (where
        N=len(raw_documents) and M=len(phrases)) and the vectorizer object.
    """
    assert vectorizer in ('bngram', 'tfidf')
    stop_phrases = set(stop_phrases)

    # get candidate phrases
    nlp.add_pipe(
        NounPhraseMatcher(lowercase=True,
                          lemmatize=True,
                          include_verb_phrases=include_verb_phrases,
                          minlen=minlen,
                          maxlen=maxlen))

    # extract phrases
    def process_chunk(texts):
        return list(nlp.pipe(texts))

    logger.info('Tokenizing, tagging and extracting noun phrases '
                'per document with spacy')
    n_jobs = psutil.cpu_count(logical=False)\
        if n_jobs == -1 else n_jobs
    raw_documents = list(
        nlp.pipe(raw_documents, batch_size=batch_size, n_process=n_jobs))

    # vectorize the texts
    if 'norm' in vectorizer_kws and aggfunc is not None:
        warnings.warn(
            "'vectorizer_kws' should not contain 'norm'. "
            "'vectorizer_kws['norm']' will be replaced.", UserWarning)
        vectorizer_kws['norm'] = None
    if 'analyzer' in vectorizer_kws:
        warnings.warn(
            "'vectorizer_kws' should not contain 'analyzer'. "
            "'vectorizer_kws['analyzer']' will be replaced.", UserWarning)
    vectorizer_kws['analyzer'] = lambda doc: [
        p for p in doc._.noun_phrases if p not in stop_phrases
    ]
    if vectorizer == 'bngram':
        if timestamps is None:
            raise ValueError(
                'Parameter `timestamps` cannot be None if `vectorizer=bngram`.'
            )
        vectorizer = BngramsVectorizer(**vectorizer_kws)
        logger.info('Vectorizing documents with BNgrams')
        X = vectorizer.fit_transform(raw_documents, timestamps)
    elif vectorizer == 'tfidf':
        vectorizer = TfidfVectorizer(**vectorizer_kws)
        logger.info('Vectorizing documents with TF-IDF')
        X = vectorizer.fit_transform(raw_documents)
    else:
        raise ValueError(f'Unknown vectorizer={vectorizer} given.')

    logger.info('Scoring phrases')
    if aggfunc == 'sum':
        scores = np.array(X.tocsc().sum(0))[0]
    elif aggfunc == 'mean':
        scores = np.array(X.tocsc().mean(0))[0]
    elif aggfunc == 'max':
        scores = min_max_axis(X.tocsc(), axis=0, ignore_nan=True)[1]
    elif aggfunc == 'median':
        scores = csc_median_axis_0(X.tocsc())
    elif aggfunc == 'median_ignore_0':
        scores = _get_median(X.tocsc(), 0)
    elif callable(aggfunc):
        scores = aggfunc(X.tocsc())
    elif aggfunc is None:
        return X, vectorizer
    else:
        raise ValueError(f'Unknown method: {aggfunc}')

    logger.info('Rank phrases based on score')
    ranked_phrases = pd.DataFrame(list(
        zip(vectorizer.get_feature_names(), scores)),
                                  columns=['phrase', 'score'])
    ranked_phrases = ranked_phrases\
        .sort_values('score', ascending=False)\
        .reset_index(drop=True)

    return ranked_phrases
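Of the aggregators above, 'max' is the one built on min_max_axis: for each phrase column it keeps the highest per-document score, with ignore_nan=True so any NaN entries are skipped. A standalone sketch of just that step:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import min_max_axis

X = sp.csr_matrix(np.array([[0.2, 0.0],
                            [0.9, 0.4]]))  # documents x phrases
max_scores = min_max_axis(X.tocsc(), axis=0, ignore_nan=True)[1]
# max_scores -> [0.9, 0.4]: best single-document score per phrase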