def remove_categories(self, categories, ignore_absences=False):
    '''
    Non destructive category removal.

    Parameters
    ----------
    categories : list
        list of categories to remove
    ignore_absences : bool, False by default
        if a category does not appear, don't raise an error, just move on.

    Returns
    -------
    TermDocMatrix, new object with categories removed.  Documents labelled
    with a removed category are dropped, as are terms (and, when the
    metadata index store is non-empty, metadata entries) whose column sums
    to zero over the remaining documents.

    Raises
    ------
    KeyError
        if a category is not present and ignore_absences is False.
    '''
    existing_categories = set(self.get_categories())
    idx_to_delete_list = []
    for category in categories:
        if category not in existing_categories:
            if not ignore_absences:
                # One-element tuple so a tuple-valued category is formatted
                # as a single value instead of being unpacked by `%`.
                raise KeyError('Category %s not found' % (category,))
            continue
        idx_to_delete_list.append(self._category_idx_store.getidx(category))
    new_category_idx_store = self._category_idx_store.batch_delete_idx(
        idx_to_delete_list)

    # Compute the document mask once; it selects the rows to drop, the
    # labels to keep, and is handed to the constructor helper below.
    doc_is_removed = np.isin(self._y, idx_to_delete_list)
    doc_is_kept = ~doc_is_removed

    # delete_columns works on columns, so transpose to drop document rows.
    rows_to_delete = np.nonzero(doc_is_removed)
    new_X = delete_columns(self._X.T, rows_to_delete).T
    new_mX = delete_columns(self._mX.T, rows_to_delete).T

    # Re-number the surviving labels.  Position in the new store's _i2val
    # is the new index; an integer lookup table keeps new_y an integer
    # array (a None-padded list would yield object dtype) and also copes
    # with the case where no documents remain.
    intermediate_y = self._y[doc_is_kept]
    retained_old_idx = np.array(
        [self._category_idx_store.getidx(val)
         for val in new_category_idx_store._i2val],
        dtype=int)
    lookup_size = retained_old_idx.max() + 1 if retained_old_idx.size else 0
    old_y_to_new_y = np.full(lookup_size, -1, dtype=int)
    old_y_to_new_y[retained_old_idx] = np.arange(retained_old_idx.size)
    new_y = old_y_to_new_y[intermediate_y]

    new_metadata_idx_store = self._metadata_idx_store
    if len(self._metadata_idx_store):
        meta_idx_to_delete = np.nonzero(new_mX.sum(axis=0).A1 == 0)[0]
        new_metadata_idx_store = self._metadata_idx_store.batch_delete_idx(
            meta_idx_to_delete)
        # Keep the metadata matrix aligned with its shrunken index store,
        # mirroring what is done for terms below.
        new_mX = delete_columns(new_mX, meta_idx_to_delete)

    term_idx_to_delete = np.nonzero(new_X.sum(axis=0).A1 == 0)[0]
    new_term_idx_store = self._term_idx_store.batch_delete_idx(
        term_idx_to_delete)
    new_X = delete_columns(new_X, term_idx_to_delete)

    return self._make_new_term_doc_matrix(
        new_X,
        new_mX,
        new_y,
        new_term_idx_store,
        new_category_idx_store,
        new_metadata_idx_store,
        doc_is_kept)
def remove_terms_by_indices(self, idx_to_delete_list):
    '''
    Build a new term-document object without the terms at the given indices.

    Parameters
    ----------
    idx_to_delete_list : list
        term indices; passed both to the term index store's
        batch_delete_idx and to delete_columns on the term matrix.

    Returns
    -------
    Whatever _make_new_term_doc_matrix builds from the reduced term matrix
    and term index store together with this object's existing metadata
    matrix, labels, category store and metadata store.
    '''
    pruned_term_store = self._term_idx_store.batch_delete_idx(
        idx_to_delete_list)
    pruned_X = delete_columns(self._X, idx_to_delete_list)
    # Element-wise self-comparison of the labels; presumably an all-True
    # mask meaning every document is retained -- TODO confirm against
    # _make_new_term_doc_matrix.
    all_docs_mask = self._y == self._y
    return self._make_new_term_doc_matrix(
        pruned_X,
        self._mX,
        self._y,
        pruned_term_store,
        self._category_idx_store,
        self._metadata_idx_store,
        all_docs_mask)
def _get_X_after_delete_terms(self, idx_to_delete_list):
    '''
    Compute the term matrix and term index store with some terms removed.

    Parameters
    ----------
    idx_to_delete_list : list
        term indices; passed to the term index store's batch_delete_idx
        and to delete_columns on the term matrix.

    Returns
    -------
    tuple of (result of delete_columns, result of batch_delete_idx)
    '''
    remaining_terms = self._term_idx_store.batch_delete_idx(
        idx_to_delete_list)
    return delete_columns(self._X, idx_to_delete_list), remaining_terms
def _get_X_after_delete_terms(self, idx_to_delete_list, non_text=False):
    '''
    Compute a matrix and its index store with some columns removed.

    Parameters
    ----------
    idx_to_delete_list : list
        indices passed to the index store's batch_delete_idx and to
        delete_columns on the matrix.
    non_text : bool, False by default
        forwarded to _get_relevant_idx_store and _get_relevant_X to choose
        which store/matrix pair is used (presumably metadata rather than
        terms when True -- TODO confirm).

    Returns
    -------
    tuple of (result of delete_columns, result of batch_delete_idx)
    '''
    source_store = self._get_relevant_idx_store(non_text)
    reduced_store = source_store.batch_delete_idx(idx_to_delete_list)
    source_X = self._get_relevant_X(non_text)
    reduced_X = delete_columns(source_X, idx_to_delete_list)
    return reduced_X, reduced_store
def remove_terms_by_indices(self, idx_to_delete_list):
    '''
    Build a new term-document object without the terms at the given indices.

    Parameters
    ----------
    idx_to_delete_list : list
        term indices; passed both to the term index store's
        batch_delete_idx and to delete_columns on the term matrix.

    Returns
    -------
    Whatever _term_doc_matrix_with_new_X builds from the reduced term
    matrix and the reduced term index store.
    '''
    pruned_term_store = self._term_idx_store.batch_delete_idx(
        idx_to_delete_list)
    pruned_X = delete_columns(self._X, idx_to_delete_list)
    return self._term_doc_matrix_with_new_X(pruned_X, pruned_term_store)