def __vectorize(self, data):
    """Turn instances into a feature matrix, fitting the vectorizer on first use.

    Accepts either a DataSet or a list of dictionaries (one per instance).

    When ``self.vectorizer`` is None, no vectorization takes place: the
    input is wrapped in a DataSet if necessary, aligned with
    ``self.data_headers`` and the ``data`` member of its bunch
    representation is returned.

    Otherwise the instances are reduced to plain dictionaries without the
    class attribute, optionally pruned by ``self.filter_attr``, and run
    through ``self.vectorizer``. The vectorizer is fitted on the very first
    batch it sees (tracked by ``self.vectorizer_trained``); the transformed
    output is returned after a ``tocsr()`` conversion.
    """
    if self.vectorizer is None:
        # Matrix-only path: everything is delegated to DataSet.
        if isinstance(data, DataSet):
            table = data
        else:
            table = DataSet()
            table.load_from_dict(data)
        table.match_headers(self.data_headers, add_values=True)
        # TODO pre-filtering here?
        bunch = table.as_bunch(target=self.class_attr,
                               select_attrib=self.select_attr)
        return bunch.data

    # Vectorizer path: the vectorizer consumes a list of dictionaries.
    if isinstance(data, DataSet):
        instances = data.as_dict(select_attrib=self.select_attr,
                                 mask_attrib=self.class_attr)
    else:
        # NOTE(review): an empty select_attr removes every attribute here;
        # confirm this matches what DataSet.as_dict does for the same value.
        instances = []
        for inst in data:
            kept = {}
            for key, val in inst.items():
                if key != self.class_attr and key in self.select_attr:
                    kept[key] = val
            instances.append(kept)

    # Optional per-attribute pre-filter, a callable taking (key, value).
    if self.filter_attr:
        filtered = []
        for inst in instances:
            passed = {}
            for key, val in inst.items():
                if self.filter_attr(key, val):
                    passed[key] = val
            filtered.append(passed)
        instances = filtered

    # Fit exactly once; later calls reuse the already fitted vectorizer.
    if not self.vectorizer_trained:
        self.vectorizer.fit(instances)
        self.vectorizer_trained = True

    matrix = self.vectorizer.transform(instances)
    return matrix.tocsr()
def __vectorize(self, data):
    """Vectorize the given instances, training the vectorizer if needed.

    ``data`` is either a DataSet or a list of per-instance dictionaries.

    With a vectorizer configured, the instances become dictionaries that
    exclude the class attribute, are optionally narrowed down by the
    ``self.filter_attr`` predicate, and are handed to ``self.vectorizer``
    (which is fitted first if ``self.vectorizer_trained`` is false). The
    transformed result is returned after calling ``tocsr()`` on it.

    Without a vectorizer, the data are only converted: loaded into a
    DataSet when they are not one already, matched against
    ``self.data_headers`` and returned as the ``data`` member of the bunch.
    """
    if self.vectorizer is not None:
        # Vectorization needed: build the list of dictionaries to feed in.
        if isinstance(data, DataSet):
            rows = data.as_dict(select_attrib=self.select_attr,
                                mask_attrib=self.class_attr)
        else:
            # NOTE(review): with an empty select_attr nothing survives this
            # filter; verify that this is the intended behaviour.
            rows = [
                dict(
                    (name, value)
                    for name, value in row.items()
                    if name != self.class_attr and name in self.select_attr
                )
                for row in data
            ]

        # Pre-filter attributes if filter_attr is set.
        if self.filter_attr:
            rows = [
                dict(item for item in row.items() if self.filter_attr(*item))
                for row in rows
            ]

        # The first batch that arrives also trains the vectorizer.
        if not self.vectorizer_trained:
            self.vectorizer.fit(rows)
            self.vectorizer_trained = True

        return self.vectorizer.transform(rows).tocsr()

    # No vectorization performed, only conversion to a matrix.
    if not isinstance(data, DataSet):
        wrapped = DataSet()
        wrapped.load_from_dict(data)
        data = wrapped
    data.match_headers(self.data_headers, add_values=True)
    # TODO pre-filtering here?
    return data.as_bunch(target=self.class_attr,
                         select_attrib=self.select_attr).data