def transform(self, X, copy=True):
    """Transform a count matrix to a tf or tf-idf representation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        A matrix of term/token counts.
    copy : bool, default=True
        Whether to copy X and operate on the copy or perform
        in-place operations.

    Returns
    -------
    vectors : array-like of shape (n_samples, n_features)
    """
    if copy:
        X = X.copy()

    dtype = _get_dtype(X)
    X = self._convert_to_csr(X, dtype)
    if X.dtype != dtype:
        X = X.astype(dtype)

    n_samples, n_features = X.shape

    if self.sublinear_tf:
        # Sublinear tf scaling: replace tf with 1 + log(tf) on the
        # stored (nonzero) entries, in place.
        cp.log(X.data, X.data)
        X.data += 1

    if self.use_idf:
        self._check_is_idf_fitted()

        expected_n_features = self._idf_diag.shape[0]
        if n_features != expected_n_features:
            raise ValueError("Input has n_features=%d while the model"
                             " has been trained with n_features=%d"
                             % (n_features, expected_n_features))

        # Scale each column by its idf weight (X = X * diag(idf)).
        csr_diag_mul(X, self._idf_diag, inplace=True)

    if self.norm:
        if self.norm == 'l1':
            csr_row_normalize_l1(X, inplace=True)
        elif self.norm == 'l2':
            csr_row_normalize_l2(X, inplace=True)

    return X
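# A minimal usage sketch for the method above, assuming it is defined on a
# cuML-style TfidfTransformer that has already been fit on a count matrix;
# the class name, constructor arguments, and the toy counts are illustrative
# assumptions, not taken from this file.
#
#     import cupy as cp
#     import cupyx.scipy.sparse as sparse
#
#     counts = sparse.csr_matrix(
#         cp.asarray([[3, 0, 1],
#                     [2, 0, 0]], dtype=cp.float32))
#     tfidf = TfidfTransformer(norm='l2', use_idf=True).fit(counts)
#     X = tfidf.transform(counts)   # rows are L2-normalized tf-idf vectors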
def transform(self, raw_documents):
    """
    Transform documents to document-term matrix.

    Extract token counts out of raw text documents and hash them
    into a fixed number of output columns (the hashing trick).

    Parameters
    ----------
    raw_documents : cudf.Series
        A Series of string documents.

    Returns
    -------
    X : sparse CuPy CSR matrix of shape (n_samples, n_features)
        Document-term matrix.
    """
    docs = self._preprocess(raw_documents)
    del raw_documents
    n_doc = len(docs)
    tokenized_df = self._create_tokenized_df(docs)
    del docs
    count_df = self._count_hash(tokenized_df)
    del tokenized_df
    # Documents that produced no tokens must still appear as
    # (empty) rows in the output matrix.
    empty_doc_ids = self._compute_empty_doc_ids(count_df, n_doc)
    X = create_csr_matrix_from_count_df(count_df, empty_doc_ids,
                                        n_doc, self.n_features,
                                        dtype=self.dtype)
    if self.binary:
        X.data.fill(1)

    if self.norm:
        if self.norm == "l1":
            csr_row_normalize_l1(X, inplace=True)
        elif self.norm == "l2":
            csr_row_normalize_l2(X, inplace=True)

    return X
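# A minimal usage sketch for the method above, assuming it is defined on a
# cuML-style HashingVectorizer; the class name, constructor arguments, and
# the example documents are illustrative assumptions.
#
#     import cudf
#
#     vec = HashingVectorizer(n_features=2 ** 10, norm='l2')
#     docs = cudf.Series(["gpu text mining", "hash the tokens"])
#     X = vec.transform(docs)    # sparse CuPy CSR of shape (2, 1024)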