def __vectorize(self, data):
    """\
    Train vectorization and subsequently vectorize. Accepts a DataSet or a list
    of dictionaries to be vectorized.
    """
    if self.vectorizer is None:
        # no vectorizer configured: only align headers and emit a raw matrix
        if not isinstance(data, DataSet):
            converted = DataSet()
            converted.load_from_dict(data)
            data = converted
        data.match_headers(self.data_headers, add_values=True)
        # TODO pre-filtering here?
        bunch = data.as_bunch(target=self.class_attr,
                              select_attrib=self.select_attr)
        return bunch.data
    # vectorization needed: reduce the input to plain dictionaries with the
    # class attribute masked out and only the selected attributes kept
    if isinstance(data, DataSet):
        dicts = data.as_dict(select_attrib=self.select_attr,
                             mask_attrib=self.class_attr)
    else:
        dicts = []
        for inst in data:
            kept = {}
            for key, val in inst.items():
                if key != self.class_attr and key in self.select_attr:
                    kept[key] = val
            dicts.append(kept)
    # optional per-attribute pre-filtering
    if self.filter_attr:
        dicts = [dict((key, val) for key, val in inst.items()
                      if self.filter_attr(key, val))
                 for inst in dicts]
    # fit the vectorizer on first use only; later calls just transform
    if not self.vectorizer_trained:
        self.vectorizer.fit(dicts)
        self.vectorizer_trained = True
    return self.vectorizer.transform(dicts).tocsr()
def load_training_set(self, filename, encoding='UTF-8'):
    """\
    Load the given training data set into memory and strip it if
    configured to via the train_part parameter.
    """
    log_info('Loading training data set from ' + str(filename) + '...')
    dataset = DataSet()
    dataset.load_from_arff(filename, encoding)
    # train_part < 1 means: keep only the leading fraction of the data
    if self.train_part < 1:
        keep = int(round(self.train_part * len(dataset)))
        dataset = dataset.subset(0, keep, copy=False)
    return dataset
def __vectorize(self, data):
    """\
    Train vectorization and subsequently vectorize. Accepts a DataSet or a list
    of dictionaries to be vectorized.
    """
    # Case 1: no vectorizer — convert straight to a matrix after header match.
    if self.vectorizer is None:
        if not isinstance(data, DataSet):
            wrapped = DataSet()
            wrapped.load_from_dict(data)
            data = wrapped
        data.match_headers(self.data_headers, add_values=True)
        # TODO pre-filtering here?
        return data.as_bunch(target=self.class_attr,
                             select_attrib=self.select_attr).data
    # Case 2: vectorizer present — feed it a list of attribute dictionaries
    # (class attribute removed, only selected attributes retained).
    if isinstance(data, DataSet):
        records = data.as_dict(select_attrib=self.select_attr,
                               mask_attrib=self.class_attr)
    else:
        records = [{key: val
                    for key, val in inst.items()
                    if key != self.class_attr and key in self.select_attr}
                   for inst in data]
    # apply the optional attribute filter, if one is configured
    if self.filter_attr:
        records = [{key: val
                    for key, val in inst.items()
                    if self.filter_attr(key, val)}
                   for inst in records]
    # train the vectorizer lazily, exactly once
    if not self.vectorizer_trained:
        self.vectorizer.fit(records)
        self.vectorizer_trained = True
    return self.vectorizer.transform(records).tocsr()
def evaluate(self, test_file, encoding='UTF-8', classif_file=None):
    """\
    Evaluate on the given test data file. Return accuracy.
    If classif_file is set, save the classification results to this file.
    """
    test = DataSet()
    test.load_from_arff(test_file, encoding)
    predicted = self.classify(test)
    gold = self.get_classes(test, dtype=None)
    # optionally persist the predictions merged into the test data
    if classif_file is not None:
        pred_set = DataSet()
        pred_set.load_from_vect(test.get_attrib(self.class_attr), predicted)
        pred_set.rename_attrib(self.class_attr, self.PREDICTED)
        test.merge(pred_set)
        test.save_to_arff(classif_file, encoding)
    return zero_one_score(gold, predicted)