def fit(self, X, y=None):
    """Learn the set of words to filter out.

    Counts word frequencies over every element of ``X`` and stores, in
    ``self._filter_words``, the union of the ``self.n`` most frequent
    words and all words whose frequency is <= ``self.min_freq``.

    Args:
        X: container traversed by ``apply_map`` (DataFrame/Series/dict/list).
        y: unused; present for scikit-learn style compatibility.

    Returns:
        self, to allow chaining.
    """
    counts = Counter()

    def collect(element):
        # token_to_words splits one element into words; accumulate globally.
        counts.update(self.token_to_words(element))

    apply_map(X, collect)

    frequent = {word for word, _ in counts.most_common(self.n)}
    infrequent = {word for word, freq in counts.most_common()
                  if freq <= self.min_freq}
    self._filter_words = frequent | infrequent
    return self
def _test_apply_map(self, inplace):
    """Exercise apply_map over several container kinds, in both copy and
    in-place modes.

    For scalar-valued containers ("value") the mapped function doubles each
    element; for nested containers ("array") it sums each inner list.
    The flattened result is compared against mapping the same function over
    the flattened original.

    Args:
        inplace: forwarded to apply_map; when True the mutated input itself
            is treated as the result.
    """
    value_dict = {
        "column1": [1, 2, 3, 4, 5],
        "column2": [4, 5, 6, 7, 8],
    }
    array_dict = {
        "column1": [[0, 1], [2, 3, 4], [4, 5]],
        "column2": [[3, 4], [5, 6], [2, 1, 9]],
    }
    df = pd.DataFrame(value_dict)
    array_df = pd.DataFrame(array_dict)
    series = pd.Series(value_dict["column1"])
    array_series = pd.Series(array_dict["column1"])
    value_list = [1, 2, 3, 4, 5]
    array_list = [[0, 1], [2, 3, 2], [4, 5, 1]]
    multi_array_list = [[[3, 1], [2, 3, 1]], [[4, 5], [4, 5]]]
    # Fixtures paired by position with `kinds`; None marks a combination
    # that is not tested (there is no multi-array fixture of scalar values).
    test_dict = {
        "value": [df, series, value_dict, value_list, None],
        "array": [array_df, array_series, array_dict, array_list, multi_array_list]
    }
    kinds = ["DataFrame", "Series", "dict", "list", "multi-array"]
    for k in test_dict:
        if k == "value":
            func = lambda x: x * 2
        else:
            func = lambda x: sum(x)
        for _k, d in zip(kinds, test_dict[k]):
            if d is None:
                continue
            print("{}-{}".format(k, _k))
            # Deep-copy so the original `d` stays pristine for the
            # expected-value computation below, even when inplace=True.
            _d = copy.deepcopy(d)
            result = apply_map(_d, func, inplace)
            if inplace:
                # In-place mode mutates its argument; the return value is
                # ignored and the mutated copy is the result under test.
                result = _d
            if _k == "DataFrame":
                print(result)
                self.assertEqual(tuple(self.flatten(result)),
                                 tuple(map(func, self.flatten(d))))
            elif _k == "Series":
                self.assertEqual(tuple(self.flatten(result)),
                                 tuple(map(func, self.flatten(d))))
            elif _k == "dict":
                # Compare column by column; dict values are independent lists.
                for kx in result:
                    self.assertEqual(tuple(self.flatten(result[kx])),
                                     tuple(map(func, self.flatten(d[kx]))))
            elif _k == "list":
                print("{} => {}".format(d, result))
                self.assertEqual(tuple(self.flatten(result)),
                                 tuple(map(func, self.flatten(d))))
def fit(self, X, y=None):
    """Build the vocabulary from the elements of ``X``.

    Word frequencies are accumulated over every element of ``X`` via
    ``apply_map``. The vocabulary is either the ``self.vocab_size`` most
    frequent terms (when ``vocab_size > 0``) or all terms whose document
    count lies within ``[min_df, max_df]`` (absolute counts when those are
    integers, otherwise fractions of the corpus length). Reserved tokens
    (padding/unknown/begin/end of sequence, when configured) are prepended.

    Args:
        X: container traversed by ``apply_map`` (DataFrame/Series/dict/list).
        y: unused; present for scikit-learn style compatibility.

    Returns:
        self, to allow chaining.
    """
    vocab = Counter()
    length = len(X)
    if isinstance(X, dict):
        # Corpus length for a dict is the length of one column, not the
        # number of keys.
        # BUG FIX: the original read `list(X.values)[0]` without calling
        # .values(), which raises TypeError on a plain dict.
        length = len(list(X.values())[0])

    def update_vocab(element):
        words = self.token_to_words(element)
        if self.ignore_blank:
            # Drop whitespace-only tokens when configured to do so.
            words = [w for w in words if w.strip()]
        vocab.update(words)

    apply_map(X, update_vocab)

    reserved = [self._padding, self._unknown,
                self._begin_of_sequence, self._end_of_sequence]
    reserved = [r for r in reserved if r]  # filter no setting token

    if self.vocab_size > 0:
        # most_common(k) already truncates to the k most frequent terms,
        # replacing the original manual append-until-full loop.
        selected = [term for term, _ in vocab.most_common(self.vocab_size)]
    else:
        min_limit = (self.min_df if isinstance(self.min_df, numbers.Integral)
                     else self.min_df * length)
        max_limit = (self.max_df if isinstance(self.max_df, numbers.Integral)
                     else self.max_df * length)
        selected = [term for term, count in vocab.most_common()
                    if min_limit <= count <= max_limit]

    # Do not duplicate reserved tokens that were also observed in the data.
    reserved = [r for r in reserved if r not in selected]
    self._vocab = reserved + selected
    return self
def transform(self, X):
    """Apply ``self.apply`` to every element of ``X``.

    When ``self.copy`` is True the input is left untouched and a
    transformed copy is returned; otherwise ``X`` is mutated in place.
    """
    mutate = not self.copy
    return apply_map(X, self.apply, inplace=mutate)
def inverse_transform(self, X):
    """Map encoded elements of ``X`` back to tokens via ``self.inverse``.

    Raises:
        Exception: if the vocabulary is empty, i.e. ``fit`` was never run.
    """
    if len(self._vocab) == 0:
        # BUG FIX: corrected typo/grammar in the error message
        # ("Plase" -> "Please", "has not made" -> "has not been made").
        raise Exception("Vocabulary has not been made yet. Please execute fit.")
    # BUG FIX: the original passed inplace=self.copy, which is inverted —
    # copy=True must mean "do not mutate X". transform() in this file
    # already uses inplace=(not self.copy); match that contract here.
    return apply_map(X, self.inverse, inplace=(not self.copy))
def transform(self, X):
    """Tokenize every element of ``X`` with ``self.tokenizer.tokenize``.

    When ``self.copy`` is True the input is left untouched and a tokenized
    copy is returned; otherwise ``X`` is mutated in place.
    """
    # BUG FIX: the original passed self.copy positionally as apply_map's
    # third argument, which is `inplace` (see the apply_map(d, func, inplace)
    # call in the tests) — so copy=True mutated X in place, the opposite of
    # the requested behavior. Pass the negation explicitly by keyword.
    return apply_map(X, self.tokenizer.tokenize, inplace=(not self.copy))