def replace_random_tokens(self, n_samples, replacement='', random_state=None,
                          min_replace=1, max_replace=1.0, group_size=1):
    """
    Build ``n_samples`` randomly perturbed variants of the text.

    Each variant is produced by picking a random set of token positions
    and substituting ``replacement`` for them (the default ``''``
    removes the tokens).

    Parameters
    ----------
    n_samples
        Number of variants to generate.
    replacement
        Value handed to ``self.split.masked`` for the selected tokens.
    random_state
        Passed to ``check_random_state``; the resulting generator is
        used for every random draw made here.
    min_replace, max_replace
        Forwarded, together with the token count, to
        ``self._get_min_max``. The pair it returns is used as inclusive
        bounds on how many positions are drawn per sample.
        (presumably floats are fractions of the token count - confirm
        against ``_get_min_max``)
    group_size
        Every drawn position ``i`` also selects positions
        ``i + 1 .. i + group_size - 1``; positions past the last token
        are dropped.

    Returns
    -------
    list of ``(text, replaced_count, mask)`` tuples. ``replaced_count``
    is the number of positions drawn (not the number of masked tokens
    when ``group_size > 1``) and ``mask`` is the mask, truncated to the
    token count, that was passed to ``self.split.masked``. When there
    are no tokens, the result is ``('', 0, <empty int array>)`` repeated
    ``n_samples`` times.
    """
    token_count = len(self.tokens)
    if token_count == 0:
        empty_mask = np.array([], dtype=int)
        return [('', 0, empty_mask)] * n_samples

    lower, upper = self._get_min_max(min_replace, max_replace, token_count)
    rng = check_random_state(random_state)
    # All sample sizes are drawn in one call before any positions are
    # chosen; ``high`` is exclusive, hence the + 1.
    counts = rng.randint(low=lower, high=upper + 1, size=n_samples)

    positions = np.arange(token_count)
    # Groups starting near the end may run past the last token, so the
    # mask is built with extra room and then cut back to token_count.
    padded_len = token_count + group_size - 1

    def perturb(count):
        starts = rng.choice(positions, count, replace=False)
        grouped = np.concatenate(
            [starts] + [starts + offset for offset in range(1, group_size)]
        )
        mask = indices_to_bool_mask(grouped, padded_len)[:token_count]
        perturbed = self.split.masked(mask, replacement)
        return perturbed.text, count, mask

    return [perturb(count) for count in counts]
def replace_random_tokens_bow(self, n_samples, replacement='',
                              random_state=None, min_replace=1,
                              max_replace=1.0):
    """
    Build ``n_samples`` perturbed variants of the text, bag-of-words style.

    For each variant a random subset of ``self.vocab`` is drawn and
    every token in ``self.tokens`` that belongs to that subset is
    substituted with ``replacement`` (the default ``''`` removes it),
    so all occurrences of a chosen word are replaced together.

    Parameters
    ----------
    n_samples
        Number of variants to generate.
    replacement
        Value handed to ``self.split.masked`` for the selected tokens.
    random_state
        Passed to ``check_random_state``; the resulting generator is
        used for every random draw made here.
    min_replace, max_replace
        Forwarded, together with the vocabulary size, to
        ``self._get_min_max``. The pair it returns is used as inclusive
        bounds on how many vocabulary entries are drawn per sample.
        (presumably floats are fractions of the vocabulary size -
        confirm against ``_get_min_max``)

    Returns
    -------
    list of ``(text, replaced_words_count, mask)`` tuples, where
    ``replaced_words_count`` is the number of vocabulary entries drawn
    and ``mask`` is built by ``indices_to_bool_mask`` over the token
    positions that were replaced. When the vocabulary is empty, the
    result is ``('', 0, <empty int array>)`` repeated ``n_samples``
    times.
    """
    vocab = self.vocab
    if not vocab:
        empty_mask = np.array([], dtype=int)
        return [('', 0, empty_mask)] * n_samples

    lower, upper = self._get_min_max(min_replace, max_replace, len(vocab))
    rng = check_random_state(random_state)
    # All sample sizes are drawn in one call before any words are
    # chosen; ``high`` is exclusive, hence the + 1.
    counts = rng.randint(low=lower, high=upper + 1, size=n_samples)
    tokens = self.tokens

    samples = []
    for count in counts:
        chosen_words = set(rng.choice(vocab, count, replace=False))
        hit_positions = [
            pos for pos, token in enumerate(tokens) if token in chosen_words
        ]
        mask = indices_to_bool_mask(hit_positions, len(tokens))
        # masked() is given the list of positions here; the boolean mask
        # is only reported back to the caller.
        perturbed = self.split.masked(hit_positions, replacement)
        samples.append((perturbed.text, count, mask))
    return samples