class classifier():
    """An abstract base class for classifiers with built-in cross-validation
    and grid search over a parameter grid.

    Subclasses must implement ``train`` and ``predict``. Datapoints are
    assumed to be stored column-wise in X (one sample per column).
    """
    __metaclass__ = abc.ABCMeta

    def __init__(self, param_grid=None, n_folds=None, n_class_samples=None,
                 n_test_samples=None, n_tests=1, name="classifier"):
        """
        param_grid:      dict of parameter lists for grid search (or None)
        n_folds:         number of stratified CV folds (or None)
        n_class_samples: number of training samples per class (int or list,
                         or None); used when n_folds is None
        n_test_samples:  number of validation/test samples per class (or None)
        n_tests:         how many random train/test splits to evaluate
        name:            human-readable name used in progress messages
        """
        self.name = name
        self.param_grid = param_grid
        self.best_param_set = None
        self.n_folds = n_folds
        # the number of validation or test samples per class
        self.n_test_samples = n_test_samples
        # the number of training samples per class
        self.n_class_samples = n_class_samples
        self.n_tests = n_tests

    def fit(self, X, y):
        """sklearn-style alias for __call__."""
        self.__call__(X, y)

    def __call__(self, X, y):
        """
        Given a dataset X, y, split it for cross-validation:

        - if n_folds is not None, use stratified folds;
        - else if n_class_samples is not None, build n_tests random splits
          with <n_class_samples> training samples per class (and, when given,
          <n_test_samples> validation samples per class).

        Assumes each datapoint is a column of X.
        """
        n_classes = len(set(y))
        if self.n_folds is not None:
            # generate stratified folds
            # NOTE(review): old sklearn.cross_validation API — verify the
            # installed sklearn version still supports this signature
            self.folds = StratifiedKFold(y, n_folds=self.n_folds,
                                         shuffle=False, random_state=None)
        elif self.n_class_samples is not None:
            self.folds = []
            # Normalize the per-class counts to int arrays once, up front —
            # the conversion is idempotent, so repeating it per split (as the
            # original did) only wasted work.
            if not isinstance(self.n_class_samples, list):
                self.n_class_samples = (
                    np.ones(n_classes) * self.n_class_samples).astype(int)
            if self.n_test_samples is not None:
                self.n_test_samples = (
                    np.ones(n_classes) * self.n_test_samples).astype(int)
            for i in range(self.n_tests):
                data_idx = split_dataset(self.n_class_samples,
                                         self.n_test_samples, y)
                train_idx = data_idx[0]
                test_idx = data_idx[1]
                self.folds.append((train_idx, test_idx))
        self.cross_validate(X, y)

    def cross_validate(self, X, y):
        """Evaluate every parameter set in the grid (or the default
        parameters when no grid was given) and record the best one."""
        print("fitting {} to the training set".format(self.name))
        if self.param_grid is not None:
            param_sets = list(ParameterGrid(self.param_grid))
            param_scores = []
            for j, param_set in enumerate(param_sets):
                # BUGFIX: the original emitted a bare `print` followed by a
                # free-standing string expression, so the separator line was
                # never actually printed.
                print("--------------")
                print("training the classifier...")
                print("parameter set:")
                for k, v in param_set.items():
                    print("{}:{}".format(k, v))
                param_score = self.evaluate(X, y, param_set=param_set)
                param_scores.append(param_score)
            p = np.argmax(np.array(param_scores))
            self.best_param_set = param_sets[p]
            print("best parameter set {}".format(self.best_param_set))
            print("best score: {}".format(param_scores[p]))
        else:
            self.evaluate(X, y)

    def evaluate(self, X, y, param_set=None):
        """Evaluate the classifier trained with the parameters in
        <param_set> over all folds; return the mean class accuracy."""
        cv_scores = []
        for train_index, test_index in self.folds:
            # datapoints are columns of X
            X_train, X_test = X[:, train_index], X[:, test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.train(X_train, y_train, param_set=param_set)
            y_pred = np.array(self.predict(X_test))
            cv_scores.append(class_accuracy(y_pred, y_test))
            print("average class accuracy: {}".format(
                avg_class_accuracy(y_pred, y_test)))
        avg_cv_score = np.mean(cv_scores)
        print("accuracy: {}".format(avg_cv_score))
        return avg_cv_score

    @abc.abstractmethod
    def train(self, X_train, y_train, param_set=None):
        """train the classifier"""
        raise NotImplementedError

    @abc.abstractmethod
    def predict(self, X_test):
        """predict labels in X_test"""
        raise NotImplementedError
class classifier():
    """An abstract base class for classifiers with built-in cross-validation
    and grid search over a parameter grid.

    NOTE(review): this class is a near-verbatim duplicate of the definition
    earlier in the file (this copy additionally computed an unused
    `n_correct`); the duplication should be resolved by deleting one copy.

    Subclasses must implement ``train`` and ``predict``. Datapoints are
    assumed to be stored column-wise in X (one sample per column).
    """
    __metaclass__ = abc.ABCMeta

    def __init__(self, param_grid=None, n_folds=None, n_class_samples=None,
                 n_test_samples=None, n_tests=1, name="classifier"):
        """
        param_grid:      dict of parameter lists for grid search (or None)
        n_folds:         number of stratified CV folds (or None)
        n_class_samples: number of training samples per class (int or list,
                         or None); used when n_folds is None
        n_test_samples:  number of validation/test samples per class (or None)
        n_tests:         how many random train/test splits to evaluate
        name:            human-readable name used in progress messages
        """
        self.name = name
        self.param_grid = param_grid
        self.best_param_set = None
        self.n_folds = n_folds
        # the number of validation or test samples per class
        self.n_test_samples = n_test_samples
        # the number of training samples per class
        self.n_class_samples = n_class_samples
        self.n_tests = n_tests

    def fit(self, X, y):
        """sklearn-style alias for __call__."""
        self.__call__(X, y)

    def __call__(self, X, y):
        """
        Given a dataset X, y, split it for cross-validation:

        - if n_folds is not None, use stratified folds;
        - else if n_class_samples is not None, build n_tests random splits
          with <n_class_samples> training samples per class (and, when given,
          <n_test_samples> validation samples per class).

        Assumes each datapoint is a column of X.
        """
        n_classes = len(set(y))
        if self.n_folds is not None:
            # generate stratified folds
            # NOTE(review): old sklearn.cross_validation API — verify the
            # installed sklearn version still supports this signature
            self.folds = StratifiedKFold(y, n_folds=self.n_folds,
                                         shuffle=False, random_state=None)
        elif self.n_class_samples is not None:
            self.folds = []
            # Normalize the per-class counts to int arrays once, up front —
            # the conversion is idempotent, so repeating it per split (as the
            # original did) only wasted work.
            if not isinstance(self.n_class_samples, list):
                self.n_class_samples = (
                    np.ones(n_classes) * self.n_class_samples).astype(int)
            if self.n_test_samples is not None:
                self.n_test_samples = (
                    np.ones(n_classes) * self.n_test_samples).astype(int)
            for i in range(self.n_tests):
                data_idx = split_dataset(self.n_class_samples,
                                         self.n_test_samples, y)
                train_idx = data_idx[0]
                test_idx = data_idx[1]
                self.folds.append((train_idx, test_idx))
        self.cross_validate(X, y)

    def cross_validate(self, X, y):
        """Evaluate every parameter set in the grid (or the default
        parameters when no grid was given) and record the best one."""
        print("fitting {} to the training set".format(self.name))
        if self.param_grid is not None:
            param_sets = list(ParameterGrid(self.param_grid))
            param_scores = []
            for j, param_set in enumerate(param_sets):
                # BUGFIX: the original emitted a bare `print` followed by a
                # free-standing string expression, so the separator line was
                # never actually printed.
                print("--------------")
                print("training the classifier...")
                print("parameter set:")
                for k, v in param_set.items():
                    print("{}:{}".format(k, v))
                param_score = self.evaluate(X, y, param_set=param_set)
                param_scores.append(param_score)
            p = np.argmax(np.array(param_scores))
            self.best_param_set = param_sets[p]
            print("best parameter set {}".format(self.best_param_set))
            print("best score: {}".format(param_scores[p]))
        else:
            self.evaluate(X, y)

    def evaluate(self, X, y, param_set=None):
        """Evaluate the classifier trained with the parameters in
        <param_set> over all folds; return the mean class accuracy."""
        cv_scores = []
        for train_index, test_index in self.folds:
            # datapoints are columns of X
            X_train, X_test = X[:, train_index], X[:, test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.train(X_train, y_train, param_set=param_set)
            y_pred = np.array(self.predict(X_test))
            # (removed: unused `n_correct = np.sum(y_test == y_pred)`)
            cv_scores.append(class_accuracy(y_pred, y_test))
            print("average class accuracy: {}".format(
                avg_class_accuracy(y_pred, y_test)))
        avg_cv_score = np.mean(cv_scores)
        print("accuracy: {}".format(avg_cv_score))
        return avg_cv_score

    @abc.abstractmethod
    def train(self, X_train, y_train, param_set=None):
        '''train the classifier'''
        raise NotImplementedError

    @abc.abstractmethod
    def predict(self, X_test):
        '''test the classifier'''
        raise NotImplementedError