def get_max_squared_sum(X):
    """Maximum squared sum of X over samples."""
    if sparse.issparse(X):
        X = X.tocsr()
        from sklearn.utils import sparsefuncs_fast
        return sparsefuncs_fast.csr_row_norms(X).max()
    else:
        return np.sum(X ** 2, axis=1).max()
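# --- usage sketch (illustrative, not from the original module) ---
# get_max_squared_sum feeds the SAG/SAGA step-size computation in
# scikit-learn; the dense and CSR code paths must agree. Assumes `np`
# (numpy) and `sparse` (scipy.sparse) are imported at module level,
# as the function body implies.
import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
X_dense = rng.rand(5, 3)

# both code paths compute max_i sum_j X[i, j] ** 2
assert np.isclose(get_max_squared_sum(X_dense),
                  get_max_squared_sum(sparse.csr_matrix(X_dense)))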
def test_smart_tfidf_transformer(scheme):
    # `scheme` is expected to be supplied by pytest parametrization over
    # SMART weighting schemes (e.g. 'nnn', 'nnc', 'ntc', 'lnn', 'ltc', 'ltl')
    tf = CountVectorizer().fit_transform(documents)
    estimator = SmartTfidfTransformer(weighting=scheme)
    X = estimator.fit_transform(tf)
    scheme_t, scheme_d, scheme_n = _validate_smart_notation(scheme)
    if scheme_d not in 'dp':
        # the resulting document-term matrix should be positive
        # (unless we use probabilistic idf weighting)
        assert (X.A >= 0).all()
    # norm cannot be zero
    X_norm = csr_row_norms(X)
    assert (X_norm > 0).all()

    # compare against the equivalent TfidfVectorizer parametrization
    # for schemes where one exists
    X_ref = None
    if scheme == 'nnn':
        X_ref = X
    elif scheme == 'nnc':
        X_ref = TfidfVectorizer(
            use_idf=False, smooth_idf=False).fit_transform(documents)
    elif scheme == 'ntc':
        X_ref = TfidfVectorizer(
            use_idf=True, smooth_idf=False).fit_transform(documents)
    elif scheme == 'lnn':
        X_ref = TfidfVectorizer(
            use_idf=False, sublinear_tf=True, smooth_idf=False,
            norm=None).fit_transform(documents)
    elif scheme == 'ltc':
        X_ref = TfidfVectorizer(
            use_idf=True, sublinear_tf=True,
            smooth_idf=False).fit_transform(documents)
    elif scheme == 'ltl':
        X_ref = TfidfVectorizer(
            use_idf=True, sublinear_tf=True, smooth_idf=False,
            norm='l1').fit_transform(documents)
    if X_ref is not None:
        assert_allclose(X.A, X_ref.A, rtol=1e-7, atol=1e-6)

    assert len(estimator.dl_) == tf.shape[0]
    assert len(estimator.du_) == tf.shape[0]
    if scheme_d in 'tsp':
        assert len(estimator.df_) == tf.shape[1]

    # fit().transform() must match fit_transform()
    X_2 = SmartTfidfTransformer(weighting=scheme).fit(tf).transform(tf)
    assert_allclose(X.A, X_2.A, rtol=1e-6, atol=1e-6)
    if scheme_d in 'stp':
        assert estimator.df_ is not None

    # transforming a subset of documents is consistent with the full transform
    sl = slice(2)
    tf_w_sl = estimator.transform(tf[sl])
    assert_allclose(X[sl].A, tf_w_sl.A)
def test_csr_row_norms(dtype):
    # checks that csr_row_norms returns the same output as
    # scipy.sparse.linalg.norm, and that the dtype is the same as X.dtype
    X = sp.random(100, 10, format='csr', dtype=dtype, random_state=42)

    scipy_norms = sp.linalg.norm(X, axis=1) ** 2
    norms = csr_row_norms(X)

    assert norms.dtype == dtype
    rtol = 1e-6 if dtype == np.float32 else 1e-7
    assert_allclose(norms, scipy_norms, rtol=rtol)
def row_norms(X, squared=False):
    """Row-wise (squared) Euclidean norm of X.

    Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
    matrices and does not create an X.shape-sized temporary.

    Performs no input validation.
    """
    if issparse(X):
        if not isinstance(X, csr_matrix):
            X = csr_matrix(X)
        norms = csr_row_norms(X)
    else:
        norms = np.einsum('ij,ij->i', X, X)

    if not squared:
        np.sqrt(norms, norms)
    return norms
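# --- usage sketch (illustrative, not from the original module) ---
# row_norms with and without `squared=True`; sparse input takes the
# csr_row_norms fast path. Assumes numpy and scipy.sparse.csr_matrix are
# available, matching the imports the function body relies on.
import numpy as np
from scipy.sparse import csr_matrix

X = np.array([[3., 4.], [0., 0.], [1., 2.]])
print(row_norms(X))                # [5., 0., 2.236...] (Euclidean norms)
print(row_norms(X, squared=True))  # [25., 0., 5.] (skips the final sqrt)
print(row_norms(csr_matrix(X)))    # same values via the CSR fast path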
def _smart_tfidf(tf, weighting, df=None, df_n_samples=None,
                 norm_alpha=0.75, norm_pivot=None, return_pivot=False):
    """Apply TF-IDF feature weighting using the SMART notation.

    Parameters
    ----------
    tf : sparse csr array
      the term frequency matrix (n_documents, n_features)
    weighting : str
      the SMART notation for document term weighting and normalization.
      In the form [nlabL][ntspd][nclu][p], see
      https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System
    df : array, shape=[n_features], optional
      precomputed document frequency array (n_features,). If not provided,
      it will be recomputed if necessary. Both df and df_n_samples must be
      provided at the same time.
    df_n_samples : float, default=None
      when using a precomputed document frequency array, the number of
      documents that were used to compute the df. Both df and df_n_samples
      must be provided at the same time.
    norm_alpha : float, default=0.75
      the alpha parameter in the pivoted normalization. Only used when
      weighting='???p'.
    norm_pivot : float, default=None
      the pivot value used for the normalization. If not provided, and
      weighting='???p', it is computed as the mean of the norm(tf*idf).
    return_pivot : bool, default=False
      return the computed norm_pivot

    Returns
    -------
    X : sparse csr array
      the weighted term frequency matrix
    norm_pivot : float
      the norm pivot (only returned when return_pivot=True)

    References
    ----------
    .. [Manning2008] C.D. Manning, P. Raghavan, H. Schütze,
                     `"Document and query weighting schemes"
                     <https://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html>`_ ,
                     2008
    .. [Singhal1996] A. Singhal, C. Buckley, and M. Mitra.
                     `"Pivoted document length normalization."
                     <https://ecommons.cornell.edu/bitstream/handle/1813/7217/95-1560.pdf?sequence=1>`_ ,
                     1996
    """  # noqa
    tf = check_array(tf, ['csr'])

    if (df is None) != (df_n_samples is None):
        raise ValueError(('df={} and df_n_samples={}, while both should be '
                          'either provided or not provided')
                         .format(df is None, df_n_samples is None))
    if df is not None:
        df = check_array(df, ensure_2d=False)
        if df.shape[0] != tf.shape[-1]:
            raise ValueError(('df array provided with n_features={}, '
                              'while in the tf array n_features={}')
                             .format(df.shape[0], tf.shape[1]))
    if not 0 <= norm_alpha <= 1:
        raise ValueError('norm_alpha={} not in [0, 1]'.format(norm_alpha))

    n_samples, n_features = tf.shape
    if df_n_samples is None:
        df_n_samples = n_samples

    scheme_t, scheme_d, scheme_n = _validate_smart_notation(weighting)
    X = tf

    # term weighting
    if scheme_t == 'n':
        pass
    elif scheme_t == 'l':
        X.data = 1 + np.log(tf.data)
    elif scheme_t == 'd':
        X.data = 1 + np.log(1 + np.log(tf.data))
    elif scheme_t == 'a':
        max_tf = np.squeeze(tf.max(axis=1).A)
        # if max_tf is zero, the tf are going to be all zero anyway
        # so we set it to 1 in order to prevent overflows
        max_tf[max_tf == 0] = 1
        _max_tf_diag = sp.spdiags(1. / max_tf, diags=0, m=n_samples,
                                  n=n_samples, format='csr')
        X = 0.5 * _max_tf_diag.dot(tf)
        X.data += 0.5
    elif scheme_t == 'b':
        X.data = tf.data.astype('bool').astype('int')
    elif scheme_t == 'L':
        mean_tf = _mean_csr_nonzero_axis1(tf)
        # if mean_tf is zero, the tf are going to be all zero anyway
        # so we set it to 1 in order to prevent overflows
        mean_tf[mean_tf == 0] = 1.0
        mean_tf = 1 + np.log(mean_tf)
        _mean_tf_diag = sp.spdiags(1. / mean_tf, diags=0, m=n_samples,
                                   n=n_samples, format='csr')
        X.data = 1 + np.log(tf.data)
        X = _mean_tf_diag.dot(X)
    else:
        raise ValueError

    # document weighting
    if scheme_d == 'n':
        pass
    elif scheme_d in 'tpsd':
        if df is None:
            df = _document_frequency(tf)
        if scheme_d == 't':
            idf = np.log(float(df_n_samples) / df) + 1.0
        elif scheme_d == 's':
            idf = np.log(float(df_n_samples + 1) / (df + 1)) + 1.0
        elif scheme_d == 'p':
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="divide by zero encountered in log",
                    category=RuntimeWarning)
                idf = np.log((float(df_n_samples) - df) / df)
        elif scheme_d == 'd':
            idf = np.log((float(df_n_samples) + 1 - df) / (df + 1))
        _idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                               n=n_features, format='csr')
        X = X.dot(_idf_diag)
    else:
        raise ValueError

    # normalization
    if scheme_n == 'n':
        pass
    elif scheme_n == 'c':
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore",
                                    category=DataConversionWarning)
            X = normalize(X, norm="l2", copy=False)
    elif scheme_n == 'l':
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore",
                                    category=DataConversionWarning)
            X = normalize(X, norm="l1", copy=False)
    elif scheme_n == 'u':
        X_norm = np.diff(X.indptr)
        # empty documents (with a zero norm) don't need to be normalized
        X_norm[X_norm == 0] = 1.
        _diag_norm = sp.spdiags(1. / X_norm, diags=0, m=n_samples,
                                n=n_samples, format='csr')
        X = _diag_norm.dot(X)
    elif scheme_n in ['cp', 'lp', 'up']:
        if scheme_n == 'cp':
            X_norm = np.sqrt(csr_row_norms(X))
        elif scheme_n == 'lp':
            X_data = X.data.copy()
            X.data = np.abs(X.data)
            X_norm = np.squeeze(X.sum(axis=1).A)
            X.data = X_data
        elif scheme_n == 'up':
            X_norm = np.diff(X.indptr)

        if norm_pivot is None:
            norm_pivot = X_norm.mean()
        # empty documents (with a zero norm) don't need to be normalized
        X_norm[X_norm == 0] = 1.

        pivoted_norm = (1 - norm_alpha) * norm_pivot + norm_alpha * X_norm
        _diag_pivoted_norm = sp.spdiags(1. / pivoted_norm, diags=0,
                                        m=n_samples, n=n_samples,
                                        format='csr')
        X = _diag_pivoted_norm.dot(X)
    else:
        raise ValueError

    if return_pivot:
        return X, norm_pivot
    else:
        return X
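# --- usage sketch (illustrative, not from the original module) ---
# _smart_tfidf is a private helper, so this mirrors how the tests above call
# it rather than a public API. 'ltc' means logarithmic tf, plain
# (non-smoothed) idf, cosine normalization. Assumes the module-level names
# used by _smart_tfidf (np, csr_row_norms, _validate_smart_notation, ...)
# are in scope.
from sklearn.feature_extraction.text import CountVectorizer

docs = ['the cat sat', 'the cat sat on the mat', 'the dog barked']
tf = CountVectorizer().fit_transform(docs)

X = _smart_tfidf(tf, 'ltc')
# cosine ('c') normalization gives every non-empty row a unit L2 norm
print(np.sqrt(csr_row_norms(X)))  # -> approximately [1., 1., 1.]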