def test_min_max_axis1():
    X = np.array([[0, 3, 0],
                  [2, -1, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float64)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)

    mins_csr, maxs_csr = min_max_axis(X_csr, axis=1)
    assert_array_equal(mins_csr, X.min(axis=1))
    assert_array_equal(maxs_csr, X.max(axis=1))

    mins_csc, maxs_csc = min_max_axis(X_csc, axis=1)
    assert_array_equal(mins_csc, X.min(axis=1))
    assert_array_equal(maxs_csc, X.max(axis=1))

    X = X.astype(np.float32)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)

    mins_csr, maxs_csr = min_max_axis(X_csr, axis=1)
    assert_array_equal(mins_csr, X.min(axis=1))
    assert_array_equal(maxs_csr, X.max(axis=1))

    mins_csc, maxs_csc = min_max_axis(X_csc, axis=1)
    assert_array_equal(mins_csc, X.min(axis=1))
    assert_array_equal(maxs_csc, X.max(axis=1))
def test_min_max(
    dtype,
    axis,
    sparse_format,
    missing_values,
    min_func,
    max_func,
    ignore_nan,
    large_indices,
):
    X = np.array(
        [
            [0, 3, 0],
            [2, -1, missing_values],
            [0, 0, 0],
            [9, missing_values, 7],
            [4, 0, 5],
        ],
        dtype=dtype,
    )
    X_sparse = sparse_format(X)
    if large_indices:
        X_sparse.indices = X_sparse.indices.astype("int64")
        X_sparse.indptr = X_sparse.indptr.astype("int64")
    mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis, ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
def test_min_max(dtype, axis, sparse_format, missing_values, min_func, max_func,
                 ignore_nan):
    X = np.array([[0, 3, 0],
                  [2, -1, missing_values],
                  [0, 0, 0],
                  [9, missing_values, 7],
                  [4, 0, 5]], dtype=dtype)
    X_sparse = sparse_format(X)
    mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
                                            ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
def test_min_max(dtype, axis, sparse_format, missing_values, min_func, max_func,
                 ignore_nan, large_indices):
    X = np.array([[0, 3, 0],
                  [2, -1, missing_values],
                  [0, 0, 0],
                  [9, missing_values, 7],
                  [4, 0, 5]], dtype=dtype)
    X_sparse = sparse_format(X)
    if large_indices:
        X_sparse.indices = X_sparse.indices.astype('int64')
        X_sparse.indptr = X_sparse.indptr.astype('int64')
    mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
                                            ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
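# The test functions above take their fixtures as parameters, which implies a
# pytest parametrization layer that is not shown here. The sketch below is a
# plausible, self-contained way to drive such a test; the exact parameter
# grids and the test name are assumptions, not taken from the original suite.
import numpy as np
import pytest
import scipy.sparse as sp
from numpy.testing import assert_array_equal
from sklearn.utils.sparsefuncs import min_max_axis


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("sparse_format", [sp.csr_matrix, sp.csc_matrix])
@pytest.mark.parametrize(
    "missing_values, min_func, max_func, ignore_nan",
    [(0, np.min, np.max, False), (np.nan, np.nanmin, np.nanmax, True)],
)
def test_min_max_parametrized_sketch(dtype, axis, sparse_format, missing_values,
                                     min_func, max_func, ignore_nan):
    # Same fixture matrix as the tests above; the NaN variants exercise ignore_nan.
    X = np.array([[0, 3, 0],
                  [2, -1, missing_values],
                  [0, 0, 0],
                  [9, missing_values, 7],
                  [4, 0, 5]], dtype=dtype)
    mins, maxs = min_max_axis(sparse_format(X), axis=axis, ignore_nan=ignore_nan)
    assert_array_equal(mins, min_func(X, axis=axis))
    assert_array_equal(maxs, max_func(X, axis=axis))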
def test_min_max_axis_errors():
    X = np.array([[0, 3, 0],
                  [2, -1, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float64)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)
    with pytest.raises(TypeError):
        min_max_axis(X_csr.tolil(), axis=0)
    with pytest.raises(ValueError):
        min_max_axis(X_csr, axis=2)
    with pytest.raises(ValueError):
        min_max_axis(X_csc, axis=-3)
def _inverse_binarize_multiclass(y, classes):
    """Inverse label binarization transformation for multiclass.

    Multiclass uses the maximal score instead of a threshold.
    """
    classes = np.asarray(classes)

    if sp.issparse(y):
        # Find the argmax for each row in y where y is a CSR matrix
        y = y.tocsr()
        n_samples, n_outputs = y.shape
        outputs = np.arange(n_outputs)
        row_max = min_max_axis(y, 1)[1]
        row_nnz = np.diff(y.indptr)

        y_data_repeated_max = np.repeat(row_max, row_nnz)
        # picks out all indices obtaining the maximum per row
        y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)

        # For corner case where last row has a max of 0
        if row_max[-1] == 0:
            y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])

        # Gets the index of the first argmax in each row from y_i_all_argmax
        index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
        # first argmax of each row
        y_ind_ext = np.append(y.indices, [0])
        y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
        # Handle rows of all 0
        y_i_argmax[np.where(row_nnz == 0)[0]] = 0

        # Handles rows with max of 0 that contain negative numbers
        samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
        for i in samples:
            ind = y.indices[y.indptr[i]:y.indptr[i + 1]]
            y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]

        return classes[y_i_argmax]
    else:
        return classes.take(y.argmax(axis=1), mode="clip")
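# A small, self-contained illustration (not from the original source) of the
# row-wise argmax trick used above: min_max_axis supplies each row's maximum,
# and comparing it against the repeated CSR data recovers the argmax column
# without densifying the matrix.
import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import min_max_axis

y = sp.csr_matrix(np.array([[0.1, 0.9, 0.0],
                            [0.0, 0.2, 0.7],
                            [0.5, 0.0, 0.3]]))
row_max = min_max_axis(y, 1)[1]                        # per-row maxima
row_nnz = np.diff(y.indptr)                            # stored entries per row
repeated_max = np.repeat(row_max, row_nnz)             # align maxima with y.data
all_argmax = np.flatnonzero(repeated_max == y.data)    # positions hitting the max
first_argmax = np.searchsorted(all_argmax, y.indptr[:-1])
print(y.indices[all_argmax[first_argmax]])             # [1 2 0], same as dense argmax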
def fit(self, X):
    """
    Fit the Normalizer to the data.

    :param X: array-like or sparse matrix of shape [n_samples, n_features]
    :return: nothing
    """
    if self.norm not in ('l1', 'l2', 'max'):
        raise ValueError("'%s' is not a supported norm" % self.norm)

    if self.axis == 0:
        self.sparse_format = 'csc'
    elif self.axis == 1:
        self.sparse_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % self.axis)

    X = check_array(X, self.sparse_format, copy=self.copy,
                    estimator='the normalize function', dtype=FLOAT_DTYPES)
    if self.axis == 0:
        X = X.T

    if sparse.issparse(X):
        if self.norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif self.norm == 'l2':
            inplace_csr_row_normalize_l2(X)
        elif self.norm == 'max':
            _, self.norms = min_max_axis(X, 1)
    else:
        if self.norm == 'l1':
            self.norms = np.abs(X).sum(axis=1)
        elif self.norm == 'l2':
            self.norms = row_norms(X)
        elif self.norm == 'max':
            self.norms = np.max(X, axis=1)
        self.norms = _handle_zeros_in_scale(self.norms, copy=False)
def normalize(X, norm='l2', axis=1, copy=True, return_norm=False, shrink=0):
    """Scale input vectors individually to unit norm (vector length).

    Read more in the :ref:`User Guide <preprocessing_normalization>`.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        unnecessary copy.

    norm : 'l1', 'l2', or 'max', optional ('l2' by default)
        The norm to use to normalize each non zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    return_norm : boolean, default False
        whether to return the computed norms

    Returns
    -------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        Normalized input X.

    norms : array, shape [n_samples] if axis=1 else [n_features]
        An array of norms along given axis for X.
        When X is sparse, a NotImplementedError will be raised
        for norm 'l1' or 'l2'.

    See also
    --------
    Normalizer: Performs normalization using the ``Transformer`` API
        (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`).

    Notes
    -----
    For a comparison of the different scalers, transformers, and normalizers,
    see :ref:`examples/preprocessing/plot_all_scaling.py
    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
    """
    if norm not in ('l1', 'l2', 'max'):
        raise ValueError("'%s' is not a supported norm" % norm)

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % axis)

    X = check_array(X, sparse_format, copy=copy,
                    estimator='the normalize function', dtype=FLOAT_DTYPES)
    if axis == 0:
        X = X.T

    if sparse.issparse(X):
        if return_norm and norm in ('l1', 'l2'):
            raise NotImplementedError("return_norm=True is not implemented "
                                      "for sparse matrices with norm 'l1' "
                                      "or norm 'l2'")
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X, shrink)
        elif norm == 'max':
            _, norms = min_max_axis(X, 1)
            norms_elementwise = norms.repeat(np.diff(X.indptr))
            mask = norms_elementwise != 0
            X.data[mask] /= norms_elementwise[mask]
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
        elif norm == 'l2':
            norms = row_norms(X)
        elif norm == 'max':
            norms = np.max(X, axis=1)
        norms = _handle_zeros_in_scale(norms, copy=False)
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    if return_norm:
        return X, norms
    else:
        return X
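# A hedged usage sketch of the 'max' norm path above: for a CSR matrix,
# min_max_axis(X, 1) yields the per-row maxima, which are repeated per stored
# entry and divided out. The upstream sklearn.preprocessing.normalize is used
# here as the reference implementation; the modified signature above (with
# `shrink`) is specific to this snippet and not part of the upstream API.
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.utils.sparsefuncs import min_max_axis

X = sp.csr_matrix(np.array([[4.0, 1.0, 0.0],
                            [0.0, 2.0, 8.0]]))
X_maxnorm = sk_normalize(X, norm='max', axis=1, copy=True)
print(X_maxnorm.toarray())          # rows scaled by their maxima: 4.0 and 8.0

# The same scaling done by hand with min_max_axis:
_, row_max = min_max_axis(X, 1)
manual = X.copy()
manual.data /= np.repeat(row_max, np.diff(X.indptr))
print(np.allclose(manual.toarray(), X_maxnorm.toarray()))  # True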
def csr_summ(x):
    mean, var = sparsefuncs.mean_variance_axis(x, 0)
    min_val, max_val = sparsefuncs.min_max_axis(x, 0)
    return np.hstack(
        [mean + 0.005, var + 0.005, min_val + 0.005, max_val + 0.005])
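# A minimal usage sketch for csr_summ, assuming the `sparsefuncs` module it
# references is sklearn.utils.sparsefuncs; the 0.005 offset is kept verbatim
# from the snippet above.
import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

x = sp.csr_matrix(np.array([[1.0, 0.0],
                            [3.0, 4.0]]))
# Column means, variances, minima and maxima (each shifted by 0.005),
# concatenated into one flat vector of length 4 * n_features.
print(csr_summ(x))   # [2.005 2.005 1.005 4.005 1.005 0.005 3.005 4.005]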
def CorrelationThreshold(X, threshold, kind):
    """Compute a feature support mask by removing highly correlated features.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training set to compute correlations.

    threshold : float in [0.0, 1.0]
        Features with a pairwise correlation above this value are candidates
        for removal.

    kind : {'pearson', 'spearmanr'}
        Correlation measure to use. Only 'pearson' is supported for sparse
        matrices.

    Returns
    -------
    support_mask : ndarray of bool of shape (n_features,)
        Boolean array for feature selection.
    """
    if not (0.0 <= threshold <= 1.0):
        raise BFE.from_errors([{'0100': 'Threshold value must be in [0.0, 1.0]'}])

    if kind not in ('pearson', 'spearmanr'):
        raise BFE.from_errors([{'0100': "Kind must be 'pearson' or 'spearmanr'"}])

    if issparse(X) and kind != 'pearson':
        raise BFE.from_errors([{'0100': "Only pearson correlation is supported "
                                        "with sparse matrices"}])

    X = check_array(X, accept_sparse=['csc', 'csr'],
                    dtype=[np.float64, np.float32])
    n_features = X.shape[1]

    if threshold == 1 or (1 in X.shape):
        support_mask = np.ones(n_features, dtype=bool)
        return support_mask

    # get constant features
    if issparse(X):
        mins, maxes = min_max_axis(X, axis=0)
        peak_to_peaks = maxes - mins
        constant_mask = np.isclose(peak_to_peaks, 0.0)

        # sparse correlation
        mu, sparse_var = mean_variance_axis(X, 0)
        X_corr = sparse_correlation(X, mu, ~constant_mask)
    else:
        peak_to_peaks = np.ptp(X, axis=0)
        constant_mask = np.isclose(peak_to_peaks, 0.0)

        if kind == 'pearson':
            X_corr = np.corrcoef(X, rowvar=False)
        else:  # spearmanr
            X_corr, _ = spearmanr(X)
            # spearmanr returns a scalar when comparing two columns
            if isinstance(X_corr, float):
                X_corr = np.array([[1, X_corr], [X_corr, 1]])

    np.fabs(X_corr, out=X_corr)

    # Removes constant features from support_mask
    support_mask = np.ones(n_features, dtype=bool)
    upper_idx = np.triu_indices(n_features, 1)

    non_constant_features = n_features
    for i in np.flatnonzero(constant_mask):
        feat_remove_mask = np.logical_and(upper_idx[0] != i,
                                          upper_idx[1] != i)
        upper_idx = (upper_idx[0][feat_remove_mask],
                     upper_idx[1][feat_remove_mask])
        support_mask[i] = False
        non_constant_features -= 1

    for _ in range(non_constant_features - 1):
        max_idx = np.argmax(X_corr[upper_idx])
        feat1, feat2 = upper_idx[0][max_idx], upper_idx[1][max_idx]
        cur_corr = X_corr[feat1, feat2]

        # max correlation is lower than threshold
        if cur_corr < threshold:
            break

        # Temporarily remove both features to calculate the mean with other
        # features. One of the features will be selected.
        support_mask[[feat1, feat2]] = False

        # if there are no other features to compare, keep the feature with
        # the most variance
        if np.all(~support_mask):
            if issparse(X):
                # sparse precalculates variance for all features
                var = sparse_var[[feat1, feat2]]
            else:
                var = np.var(X[:, [feat1, feat2]], axis=0)
            if var[0] < var[1]:
                support_mask[feat2] = True
            else:
                support_mask[feat1] = True
            break

        # mean with other features
        feat1_mean = np.mean(X_corr[feat1, support_mask])
        feat2_mean = np.mean(X_corr[feat2, support_mask])

        # feature with lower mean is kept
        if feat1_mean < feat2_mean:
            support_mask[feat1] = True
            feat_to_remove = feat2
        else:
            support_mask[feat2] = True
            feat_to_remove = feat1

        # remove the removed feature from consideration
        upper_idx_to_keep = np.logical_and(upper_idx[0] != feat_to_remove,
                                           upper_idx[1] != feat_to_remove)
        upper_idx = (upper_idx[0][upper_idx_to_keep],
                     upper_idx[1][upper_idx_to_keep])

    return support_mask
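# A self-contained sketch (not from the original source) of the constant-feature
# check used above: on a sparse matrix, min_max_axis gives per-column minima and
# maxima without densifying, so a zero peak-to-peak range flags constant columns.
import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import min_max_axis

X = sp.csc_matrix(np.array([[1.0, 5.0, 0.0],
                            [2.0, 5.0, 0.0],
                            [3.0, 5.0, 0.0]]))
mins, maxes = min_max_axis(X, axis=0)
constant_mask = np.isclose(maxes - mins, 0.0)
print(constant_mask)   # [False  True  True]: columns 1 and 2 never vary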
def get_ranked_phrases(nlp,
                       raw_documents,
                       timestamps=None,
                       *,
                       include_verb_phrases=False,
                       minlen=1,
                       maxlen=8,
                       n_jobs=_default_n_jobs,
                       batch_size=_default_batch_size,
                       stop_phrases=[],
                       vectorizer='bngram',
                       aggfunc='sum',
                       **vectorizer_kws):
    """
    Get phrases ranked by either TF-IDF (importance) score or BNgram
    (novelty) score.

    Parameters
    ----------
    nlp : spacy.language.Language
        Spacy language model
    raw_documents : Iterable[str]
        An iterable which yields str objects.
    timestamps : Iterable[datetime.datetime]
        Timestamps of the documents. An iterable which yields datetime
        objects. Only used when `vectorizer='bngram'`.
    include_verb_phrases : bool, default=False
        Indicator to include verb phrases also.
    minlen : int, default=1
        Minimum length of extracted multi-word phrases. Used for tokenizing
        the text.
    maxlen : int, default=8
        Maximum length of extracted multi-word phrases. Used for tokenizing
        the text.
    n_jobs : int, default=-1
        Number of processes to get noun phrases in parallel from documents.

        * -1: Use one process per available CPU core
        * >0: Use `n_jobs` processes
    batch_size : int, default=1000
        Batch size for tokenizing, tagging and extracting noun phrases.
        Use smaller batch sizes on a large number of large texts and
        vice-versa.
    stop_phrases : List[str], default=[]
        List of phrases to remove.
    vectorizer : str, default='bngram'
        One of ('bngram', 'tfidf').
    aggfunc : Union[str, callable, NoneType], default='sum'
        Function to aggregate over the scores per document for a single
        phrase to rank. One of ('sum', 'mean', 'max', 'median',
        'median_ignore_0', callable that accepts a sparse matrix, None).
        If None, this function will return the vectorized documents and
        the vectorizer directly.
    vectorizer_kws : dict
        Keyword arguments for TfidfVectorizer

    Returns
    -------
    ranked_phrases : Union[pandas.DataFrame, Tuple[array[N, M], vectorizer]]
        If aggfunc is not None, returns a dataframe with the extracted
        n-grams / phrases sorted descending by the aggregated BNgram /
        TF-IDF scores, else returns the vectorized documents (where
        N=len(raw_documents) and M=len(phrases)) and the vectorizer object.
    """
    assert vectorizer in ('bngram', 'tfidf')
    stop_phrases = set(stop_phrases)

    # get candidate phrases
    nlp.add_pipe(
        NounPhraseMatcher(lowercase=True,
                          lemmatize=True,
                          include_verb_phrases=include_verb_phrases,
                          minlen=minlen,
                          maxlen=maxlen))

    # extract phrases
    def process_chunk(texts):
        return list(nlp.pipe(texts))

    logger.info('Tokenizing, tagging and extracting noun phrases '
                'per documents with spacy')
    n_jobs = psutil.cpu_count(logical=False) if n_jobs == -1 else n_jobs
    raw_documents = list(
        nlp.pipe(raw_documents, batch_size=batch_size, n_process=n_jobs))

    # vectorize the texts
    if 'norm' in vectorizer_kws and aggfunc is not None:
        warnings.warn(
            "'vectorizer_kws' should not contain 'norm'. "
            "'vectorizer_kws['norm']' will be replaced.", UserWarning)
    vectorizer_kws['norm'] = None

    if 'analyzer' in vectorizer_kws:
        warnings.warn(
            "'vectorizer_kws' should not contain 'analyzer'. "
            "'vectorizer_kws['analyzer']' will be replaced.", UserWarning)
    vectorizer_kws['analyzer'] = lambda doc: [
        p for p in doc._.noun_phrases if p not in stop_phrases
    ]

    if vectorizer == 'bngram':
        if timestamps is None:
            raise ValueError(
                'Parameter `timestamps` cannot be None if `vectorizer=bngram`.'
            )
        vectorizer = BngramsVectorizer(**vectorizer_kws)
        logger.info('Vectorizing documents with BNgrams')
        X = vectorizer.fit_transform(raw_documents, timestamps)
    elif vectorizer == 'tfidf':
        vectorizer = TfidfVectorizer(**vectorizer_kws)
        logger.info('Vectorizing documents with TF-IDF')
        X = vectorizer.fit_transform(raw_documents)
    else:
        raise ValueError(f'Unknown vectorizer={vectorizer} given.')

    logger.info('Scoring phrases')
    if aggfunc == 'sum':
        scores = np.array(X.tocsc().sum(0))[0]
    elif aggfunc == 'mean':
        scores = np.array(X.tocsc().mean(0))[0]
    elif aggfunc == 'max':
        scores = min_max_axis(X.tocsc(), axis=0, ignore_nan=True)[1]
    elif aggfunc == 'median':
        scores = csc_median_axis_0(X.tocsc())
    elif aggfunc == 'median_ignore_0':
        scores = _get_median(X.tocsc(), 0)
    elif callable(aggfunc):
        scores = aggfunc(X.tocsc())
    elif aggfunc is None:
        return X, vectorizer
    else:
        raise ValueError(f'Unknown method: {aggfunc}')

    logger.info('Rank phrases based on score')
    ranked_phrases = pd.DataFrame(list(
        zip(vectorizer.get_feature_names(), scores)),
        columns=['phrase', 'score'])
    ranked_phrases = ranked_phrases\
        .sort_values('score', ascending=False)\
        .reset_index(drop=True)
    return ranked_phrases
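# A standalone sketch (not from the original source) of the aggregation step
# above: given a document-by-phrase score matrix X, each phrase's score is a
# column-wise reduction over the CSC form. min_max_axis(..., ignore_nan=True)
# backs the 'max' branch and csc_median_axis_0 the 'median' branch; both are
# assumed to come from sklearn.utils.sparsefuncs, as the call patterns suggest.
import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import csc_median_axis_0, min_max_axis

X = sp.csc_matrix(np.array([[0.0, 2.0, 1.0],
                            [3.0, 0.0, 1.0],
                            [0.0, 4.0, 1.0]]))
sum_scores = np.array(X.sum(0))[0]                        # [3. 6. 3.]
mean_scores = np.array(X.mean(0))[0]                      # [1. 2. 1.]
max_scores = min_max_axis(X, axis=0, ignore_nan=True)[1]  # [3. 4. 1.]
median_scores = csc_median_axis_0(X)                      # [0. 2. 1.]
print(sum_scores, mean_scores, max_scores, median_scores)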