def _prepare(self, data):
    """Fit the parent feature and pick columns against the target.

    The parent class's ``_prepare`` is run on ``data`` first. The target
    is then built from ``self.target.create(self.context)`` and reduced
    with ``get_single_column``. When ``self.train_only`` is truthy, both
    the target and ``data`` are reindexed to ``self.context.train_index``
    before being handed to ``self.select``.

    Returns:
        A ``(cols, dct)`` tuple: the result of ``self.select`` and the
        value returned by the parent ``_prepare``.
    """
    dct = super(SelectNgramCounts, self)._prepare(data)

    if not self.train_only:
        # Use the full target and the data exactly as supplied.
        target = get_single_column(self.target.create(self.context))
        features = data
    else:
        # Restrict both the target and the inputs to the training rows.
        full_target = get_single_column(self.target.create(self.context))
        target = full_target.reindex(self.context.train_index)
        features = data.reindex(self.context.train_index)

    selected = self.select(features, target)
    return selected, dct
def _prepare(self, prep_data):
    """Build the dictionary for the documents in ``prep_data``.

    ``prep_data`` is reduced with ``get_single_column`` and materialised
    as a list, which is passed to ``self.dictionary.get_dict``. The first
    ten documents and the resulting dictionary are logged at debug level.

    Returns:
        Whatever ``self.dictionary.get_dict`` returns for the documents.
    """
    documents = list(get_single_column(prep_data))
    logging.debug(documents[:10])

    vocabulary = self.dictionary.get_dict(documents)
    logging.debug(vocabulary)
    return vocabulary
def _apply(self, data, fitted_feature):
    """Turn each document into a row with one column per dictionary id.

    The dictionary is read from ``fitted_feature.prepped_data``. Every
    document in the single column of ``data`` is passed through its
    ``doc2bow`` and the resulting pairs become one row. Columns are the
    sorted dictionary keys, renamed to ``"<dct[id]>_<column name>"``, and
    the input's index is kept. Ids missing from a document are filled
    with 0. When ``self.bool_`` is truthy the values are collapsed to 0/1.

    Returns:
        A DataFrame indexed like the input column.
    """
    dictionary = fitted_feature.prepped_data
    column = get_single_column(data)

    bags = []
    for document in column:
        bags.append(dictionary.doc2bow(document))

    token_ids = sorted(dictionary.keys())

    rows = []
    for bag in bags:
        rows.append(dict(bag))

    frame = DataFrame(rows, columns=token_ids, index=column.index)
    frame.columns = ['%s_%s' % (dictionary[i], column.name) for i in token_ids]
    frame = frame.fillna(0)

    if not self.bool_:
        return frame
    # Presence/absence only: any non-zero value becomes 1.
    return frame.astype(bool).astype(int)
def _combine_apply(self, datas, fitted_feature):
    """Join the input columns row by row into one string column.

    Each element of ``datas`` is reduced with ``get_single_column``. The
    values are then zipped positionally and joined with ``self.sep``.
    The output column is named by joining the input column names with
    ``"_"``.

    The result keeps the row labels of the first input, so it stays
    aligned with the source data instead of being renumbered from 0.

    Args:
        datas: Iterable of inputs, each reducible to a single column of
            strings by ``get_single_column``.
        fitted_feature: Unused here.

    Returns:
        A one-column DataFrame of joined strings.
    """
    columns = [get_single_column(d) for d in datas]
    joined = [self.sep.join(values) for values in zip(*columns)]
    name = '_'.join([c.name for c in columns])

    # zip() stops at the shortest input, so trim the index to the number
    # of rows actually produced. With no inputs, fall back to the default
    # index, as before.
    index = columns[0].index[:len(joined)] if columns else None
    return DataFrame(joined, columns=[name], index=index)