def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    # obtain the K nearest neighbors of x in the validation set
    [idx] = self.knn.kneighbors(x, n_neighbors=self.K, return_distance=False)
    neighbors_X = self.Xval[idx]  # k neighbors
    neighbors_y = self.yval[idx]  # k neighbors target

    # pool_output (sample, classifier_output)
    pool_output = np.zeros((neighbors_X.shape[0], len(ensemble)))
    for i, clf in enumerate(ensemble.classifiers):
        pool_output[:, i] = clf.predict(neighbors_X)

    x_outputs = [
        ensemble.classifiers[j].predict(x) for j in range(len(ensemble))
    ]
    x_outputs = np.asarray(x_outputs).flatten()

    scores = np.zeros(len(ensemble))
    for j in range(pool_output.shape[1]):
        # get correctly classified samples
        mask_classified_correctly = pool_output[:, j] == neighbors_y
        # get classified samples with the same class as 'x'
        mask_classified_same_class = (pool_output[:, j] == x_outputs[j])
        # get correctly classified samples with the same class as 'x'
        mask = mask_classified_correctly * mask_classified_same_class
        # calculate score
        scores[j] = float(
            sum(mask)) / (sum(mask_classified_same_class) + 10e-24)

    return Ensemble([ensemble.classifiers[np.argmax(scores)]]), None
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    mcb_x = ensemble.output(x, mode='labels')[0, :]

    # initialize variables
    # the indexes of the KNN of x
    [idx] = self.knn.kneighbors(x, return_distance=False)
    X, y = self.Xval[idx], self.yval[idx]

    mcb_v = ensemble.output(X, mode='labels')

    # keep only the neighbors whose classifier-behavior vector is similar
    # enough to the one obtained for x
    idx = []
    for i in range(X.shape[0]):
        sim = np.mean(mcb_x == mcb_v[i, :])
        if sim > self.similarity_threshold:
            idx = idx + [i]

    if len(idx) == 0:
        idx = np.arange(X.shape[0])

    scores = [clf.score(X[idx], y[idx]) for clf in ensemble.classifiers]
    scores = np.array(scores)

    # if the best classifier is significantly better than the runner-up,
    # use only the best classifier
    best_i = np.argmax(scores)
    best_j_score = np.max(scores[np.arange(len(scores)) != best_i])
    if scores[best_i] - best_j_score >= self.significance_threshold:
        best_classifier = ensemble.classifiers[best_i]
        return Ensemble(classifiers=[best_classifier]), None

    return Ensemble(classifiers=ensemble.classifiers), None
class BaggingSK(PoolGenerator):
    """This class should not be used; use brew.generation.bagging.Bagging instead."""

    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        # (note: newer scikit-learn versions rename base_estimator to estimator)
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                            n_estimators=n_classifiers,
                                            max_samples=1.0,
                                            max_features=1.0)

        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.sk_bagging.fit(X, y)
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)
        # self.classes_ = set(y)

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
def fit(self, X, y):
    self.ensemble = Ensemble()

    # this parameter should change between [10, 100] in increments of 10,
    # for every classifier in the ensemble
    b = 10

    for i in range(self.n_classifiers):
        data, target = self.smote_bootstrap_sample(
            X, y, b=float(b), k=self.k)

        classifier = sklearn.base.clone(self.base_classifier)
        classifier.fit(data, target)

        self.ensemble.add(classifier)

        if b >= 100:
            b = 10
        else:
            b += 10

    return
class Bagging(PoolGenerator):

    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def fit(self, X, y):
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            # bootstrap sample (with replacement) of the training set
            idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
            data, target = X[idx, :], y[idx]

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

        return

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
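# Hedged usage sketch for the Bagging pool generator above (illustrative
# only): it assumes the brew-style Ensemble/Combiner imports used elsewhere
# in this module are available, and uses a scikit-learn decision tree and a
# synthetic dataset purely as placeholders.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pool = Bagging(base_classifier=DecisionTreeClassifier(), n_classifiers=10)
pool.fit(X_train, y_train)      # trains 10 trees on bootstrap samples
y_pred = pool.predict(X_test)   # majority vote over the pool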
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    # obtain the K nearest neighbors of x in the validation set
    [idx] = self.knn.kneighbors(x, return_distance=False)
    neighbors_X = self.Xval[idx]  # k neighbors
    neighbors_y = self.yval[idx]  # k neighbors target

    # pool_output (sample, classifier_output)
    pool_output = np.zeros((neighbors_X.shape[0], len(ensemble)))
    for i, clf in enumerate(ensemble.classifiers):
        pool_output[:, i] = clf.predict(neighbors_X)

    x_outputs = [
        ensemble.classifiers[j].predict(x) for j in range(len(ensemble))
    ]
    x_outputs = np.asarray(x_outputs).flatten()

    # d maps a score to the indexes of the classifiers with that score
    d = {}
    scores = np.zeros(len(ensemble))
    for j in range(pool_output.shape[1]):
        # get correctly classified samples
        mask_classified_correctly = pool_output[:, j] == neighbors_y
        # get classified samples with the same class as 'x'
        mask_classified_same_class = (pool_output[:, j] == x_outputs[j])
        # get correctly classified samples with the same class as 'x'
        mask = mask_classified_correctly * mask_classified_same_class
        # calculate score
        scores[j] = float(
            sum(mask)) / (sum(mask_classified_same_class) + 10e-24)
        d[str(scores[j])] = d[str(scores[j])] + [j] if str(
            scores[j]) in d else [j]

    best_scores = sorted([float(k) for k in list(d.keys())], reverse=True)

    # break ties between equally scored classifiers by majority vote on
    # their predictions for 'x'
    options = None
    for j, score in enumerate(best_scores):
        pred = [x_outputs[i] for i in d[str(score)]]
        pred = np.asarray(pred).flatten()
        bincount = np.bincount(pred.astype(int))

        if options is not None:
            for i in range(len(bincount)):
                bincount[i] = bincount[i] if i in options else 0

        imx = np.argmax(bincount)
        votes = np.argwhere(bincount == bincount[imx]).flatten()
        count = len(votes)

        if count == 1:
            # map the position within the tied group back to the original
            # classifier index
            winner = d[str(score)][np.argmax(pred == imx)]
            return Ensemble([ensemble.classifiers[winner]]), None
        elif options is None:
            options = votes

    return Ensemble([ensemble.classifiers[np.argmax(scores)]]), None
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    # initialize variables
    # the indexes of the KNN of x
    classifiers = ensemble.classifiers
    [idx] = self.knn.kneighbors(x, return_distance=False)
    X, y = self.Xval[idx], self.yval[idx]

    # local accuracy of each classifier on the neighborhood of x
    scores = np.asarray([clf.score(X, y) for clf in classifiers])

    return Ensemble([classifiers[np.argmax(scores)]]), None
def select(self, ensemble, x):
    ensemble_mask = None

    neighbors_X, neighbors_y = self.get_neighbors(x)
    pool_output = ensemble.output(neighbors_X, mode='labels')

    # gradually decrease neighborhood size if no
    # classifier predicts ALL the neighbors correctly
    for i in range(self.K, 0, -1):
        pool_mask = _get_pool_mask(pool_output[:i], neighbors_y[:i], np.all)

        # if at least one classifier gets all neighbors right
        if pool_mask is not None:
            ensemble_mask = pool_mask
            break

    # if NO classifier gets the nearest neighbor correctly
    if ensemble_mask is None:

        if self.v2007:
            # increase the neighborhood until one classifier
            # gets at least ONE (i.e. ANY) neighbor correctly.
            # starts with 2 because mask_all with k=1 is
            # the same as mask_any with k=1
            for i in range(2, self.K + 1):
                pool_mask = _get_pool_mask(pool_output[:i],
                                           neighbors_y[:i], np.any)

                if pool_mask is not None:
                    ensemble_mask = pool_mask
                    break

    [selected_idx] = np.where(ensemble_mask)

    if selected_idx.size > 0:
        pool = Ensemble(
            classifiers=[ensemble.classifiers[i] for i in selected_idx])
    else:
        # use all classifiers
        # pool = ensemble
        classifiers = self._get_best_classifiers(ensemble, neighbors_X,
                                                 neighbors_y, x)
        pool = Ensemble(classifiers=classifiers)

    # KNORA-ELIMINATE-W supposedly uses weights, but that does not make
    # sense here, so even if self.weighted is True, always return
    # None for the weights
    return pool, None
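# A minimal sketch (an assumption, not the library's actual helper) of what
# _get_pool_mask is expected to do above: compare the pool's label outputs
# with the true neighbor labels and reduce per classifier with `criterion`
# (np.all for KNORA-ELIMINATE, np.any for the 2007 fallback), returning a
# boolean mask of qualifying classifiers, or None when none qualify.
import numpy as np

def _get_pool_mask(pool_output, neighbors_target, criterion):
    # pool_output: (n_neighbors, n_classifiers) predicted labels
    # neighbors_target: (n_neighbors,) true labels
    pool_mask = criterion(pool_output == neighbors_target[:, np.newaxis],
                          axis=0)
    if np.sum(pool_mask) > 0:
        return pool_mask
    return None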
def fit(self, X, y):
    self.ensemble = Ensemble()

    for _ in range(self.n_classifiers):
        # bootstrap sample (with replacement) of the training set
        idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
        data, target = X[idx, :], y[idx]

        classifier = sklearn.base.clone(self.base_classifier)
        classifier.fit(data, target)

        self.ensemble.add(classifier)

    return
def select(self, ensemble, x):
    neighbors_X, neighbors_y = self.get_neighbors(x)
    pool_output = ensemble.output(neighbors_X, mode='labels')

    output_mask = (pool_output == neighbors_y[:, np.newaxis])

    [selected_idx] = np.where(np.any(output_mask, axis=0))

    if selected_idx.size > 0:
        if self.weighted:
            # each correctly classified neighbor contributes a vote weighted
            # by the inverse of its distance to x
            weights = 1.0 / \
                (np.sqrt(np.sum((x - neighbors_X)**2, axis=1)) + 10e-8)
            weighted_votes = np.dot(weights, output_mask[:, selected_idx])
        else:
            weighted_votes = np.sum(output_mask[:, selected_idx], axis=0)

        pool = Ensemble(
            classifiers=[ensemble.classifiers[i] for i in selected_idx])

    # if no classifiers are selected,
    # use all classifiers with no weights
    else:
        pool = ensemble
        weighted_votes = None

    return pool, weighted_votes
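# Toy illustration (not part of the selector) of the inverse-distance
# weighting used above: closer neighbors yield larger votes for the
# classifiers that predict them correctly.
import numpy as np

x = np.array([[0.0, 0.0]])
neighbors_X = np.array([[0.0, 1.0], [3.0, 4.0]])  # distances 1 and 5
weights = 1.0 / (np.sqrt(np.sum((x - neighbors_X) ** 2, axis=1)) + 10e-8)
print(weights)  # approximately [1.0, 0.2]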
def __init__(self,
             base_classifier=None,
             n_classifiers=100,
             combination_rule='majority_vote'):

    self.base_classifier = base_classifier
    self.n_classifiers = n_classifiers

    # using the sklearn implementation of bagging for now
    self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                        n_estimators=n_classifiers,
                                        max_samples=1.0,
                                        max_features=1.0)

    self.ensemble = Ensemble()
    self.combiner = Combiner(rule=combination_rule)
def fit(self, X, y):
    self.ensemble = Ensemble()

    for i in range(self.n_classifiers):
        # choose a random subset of the features for this classifier
        chosen_features = np.random.choice(
            X.shape[1],
            int(np.ceil(X.shape[1] * self.max_features)),
            replace=False)
        transformer = FeatureSubsamplingTransformer(
            features=chosen_features)

        classifier = BrewClassifier(
            classifier=sklearn.base.clone(self.base_classifier),
            transformer=transformer)
        classifier.fit(X, y)

        self.ensemble.add(classifier)

    return
def fit(self, X, y):
    # if self.validation_X == None and self.validation_y == None:
    self.validation_X = X
    self.validation_y = y

    self.classes_ = set(y)
    self.ensemble = Ensemble()

    clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
    self.ensemble.add(np.random.choice(clfs))

    for i in range(1, self.n_classifiers):
        clfs = self.bootstrap_classifiers(X, y, self.K,
                                          self._calc_pos_prob())
        self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))

    self.validation_X = None
    self.validation_y = None

    return self
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    # initialize variables
    # the indexes of the KNN of x
    classifiers = ensemble.classifiers
    [idx] = self.knn.kneighbors(x, return_distance=False)
    X, y = self.Xval[idx], self.yval[idx]

    # d[score] = indexes of the classifiers with that score
    d = {}
    scores = [clf.score(X, y) for clf in ensemble.classifiers]
    for i, scr in enumerate(scores):
        d[scr] = d[scr] + [i] if scr in d else [i]

    best_scores = sorted([k for k in list(d.keys())], reverse=True)

    # if there was a single best classifier, return it
    if len(d[best_scores[0]]) == 1:
        i = d[best_scores[0]][0]
        return Ensemble([classifiers[i]]), None

    # otherwise, break ties by majority vote among the tied classifiers
    options = None
    for j, score in enumerate(best_scores):
        pred = [classifiers[index].predict(x) for index in d[score]]
        pred = np.asarray(pred).flatten()
        bincount = np.bincount(pred.astype(int))

        if options is not None:
            for i in range(len(bincount)):
                bincount[i] = bincount[i] if i in options else 0

        imx = np.argmax(bincount)
        votes = np.argwhere(bincount == bincount[imx]).flatten()
        count = len(votes)

        if count == 1:
            # map the position within the tied group back to the original
            # classifier index
            winner = d[score][np.argmax(pred == imx)]
            return Ensemble([classifiers[winner]]), None
        elif options is None:
            options = votes

    return Ensemble([classifiers[np.argmax(scores)]]), None
def select(self, ensemble, x):
    if ensemble.in_agreement(x):
        return Ensemble([ensemble.classifiers[0]]), None

    n_sel_1, n_sel_2 = self.n_1, self.n_2
    if isinstance(self.n_1, float):
        n_sel_1 = int(n_sel_1 * len(ensemble))

    if isinstance(self.n_2, float):
        n_sel_2 = int(n_sel_2 * len(ensemble))

    n_sel_1 = max(n_sel_1, 1)
    n_sel_2 = max(n_sel_2, 1)

    # initialize variables
    # the indexes of the KNN of x
    classifiers = ensemble.classifiers
    [idx] = self.knn.kneighbors(x, return_distance=False)
    X, y = self.Xval[idx], self.yval[idx]

    # local accuracy of each classifier on the neighborhood of x
    acc_scores = np.array([clf.score(X, y) for clf in classifiers])

    out = ensemble.output(X, mode='labels')
    oracle = np.equal(out, y[:, np.newaxis])
    div_scores = np.zeros(len(ensemble), dtype=float)

    # mean pairwise double-fault diversity of each classifier
    for i in range(len(ensemble)):
        tmp = []
        for j in range(len(ensemble)):
            if i != j:
                d = kuncheva_double_fault_measure(oracle[:, [i, j]])
                tmp.append(d)
        div_scores[i] = np.mean(tmp)

    # keep the n_sel_1 most accurate classifiers, then the n_sel_2 most
    # diverse (lowest double-fault) among them
    z = zip(np.arange(len(ensemble)), acc_scores, div_scores)
    z = sorted(z, key=lambda e: e[1], reverse=True)[:n_sel_1]
    z = sorted(z, key=lambda e: e[2], reverse=False)[:n_sel_2]
    z = list(zip(*z))[0]

    classifiers = [classifiers[i] for i in z]

    return Ensemble(classifiers=classifiers), None
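# Hedged sketch (a hypothetical reimplementation, not the library's code) of
# the pairwise double-fault diversity used above: the fraction of
# neighborhood samples that BOTH classifiers misclassify, computed from the
# boolean oracle matrix (True = correct prediction). Lower values indicate
# more diverse pairs, which is why the selector sorts ascending on div_scores.
import numpy as np

def kuncheva_double_fault_measure(oracle_pair):
    # oracle_pair: (n_samples, 2) correctness flags for one pair of classifiers
    both_wrong = np.sum(~oracle_pair[:, 0] & ~oracle_pair[:, 1])
    return float(both_wrong) / oracle_pair.shape[0]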
class RandomSubspace(PoolGenerator):

    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote',
                 max_features=0.5):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combiner = Combiner(rule=combination_rule)
        self.classifiers = None
        self.ensemble = None
        self.max_features = max_features

    def fit(self, X, y):
        self.ensemble = Ensemble()

        for i in range(self.n_classifiers):
            # train each classifier on a random subset of the features
            chosen_features = np.random.choice(
                X.shape[1],
                int(np.ceil(X.shape[1] * self.max_features)),
                replace=False)
            transformer = FeatureSubsamplingTransformer(
                features=chosen_features)

            classifier = BrewClassifier(
                classifier=sklearn.base.clone(self.base_classifier),
                transformer=transformer)
            classifier.fit(X, y)

            self.ensemble.add(classifier)

        return

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
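# Hedged usage sketch for RandomSubspace (illustrative only): each base
# classifier is trained on a random half of the features via the
# FeatureSubsamplingTransformer wrapper. The dataset and base classifier
# below are placeholder assumptions.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=20, random_state=0)

pool = RandomSubspace(base_classifier=DecisionTreeClassifier(),
                      n_classifiers=15, max_features=0.5)
pool.fit(X, y)
y_pred = pool.predict(X)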
def select(self, ensemble, x):
    selected_classifier = None

    nn_X, nn_y, dists = self.get_neighbors(x, return_distance=True)

    idx_selected, prob_selected = [], []

    all_probs = np.zeros(len(ensemble))

    for idx, clf in enumerate(ensemble.classifiers):
        prob = self.probabilities(clf, nn_X, nn_y, dists, x)
        if prob > 0.5:
            idx_selected = idx_selected + [idx]
            prob_selected = prob_selected + [prob]
        all_probs[idx] = prob

    if len(prob_selected) == 0:
        prob_selected = [np.max(all_probs)]
        idx_selected = [np.argmax(all_probs)]

    p_correct_m = max(prob_selected)
    m = np.argmax(prob_selected)

    selected = True
    diffs = []
    for j, p_correct_j in enumerate(prob_selected):
        d = p_correct_m - p_correct_j
        diffs.append(d)
        if j != m and d < self.threshold:
            selected = False

    if selected:
        selected_classifier = ensemble.classifiers[idx_selected[m]]
    else:
        # no single classifier is significantly better: pick randomly among
        # those within the threshold of the best one
        idx_selected = np.asarray(idx_selected)
        mask = np.array(np.array(diffs) < self.threshold, dtype=bool)
        i = np.random.choice(idx_selected[mask])
        selected_classifier = ensemble.classifiers[i]

    return Ensemble([selected_classifier]), None
class SmoteBaggingNew(SmoteBagging):

    def fit(self, X, y):
        self.ensemble = Ensemble()

        # this parameter should change between [10, 100] in increments
        # of 10, for every classifier in the ensemble
        b = 10

        for i in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(
                X, y, b=float(b), k=self.k)

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

            if b >= 100:
                b = 10
            else:
                b += 10

        return

    def smote_bootstrap_sample(self, X, y, b, k):
        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()  # majority class
        majority_count = count.max()     # majority class count

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        # regular bootstrap (100% sampling rate) of the majority class
        class_data = X[(y == majority_class), :]
        idx = np.random.choice(majority_count, (majority_count,))
        data = np.concatenate((data, class_data[idx, :]))
        target = np.concatenate(
            (target, majority_class * np.ones((majority_count,))))

        minority_class = count.argmin()
        minority_count = count.min()

        # after the swap below, b% of the majority count is resampled from
        # the minority class and the rest is synthesized with SMOTE
        N_syn = int((majority_count) * (b / 100))
        N_res = majority_count - N_syn
        N_syn, N_res = N_res, N_syn

        # bootstrap N_res examples from the minority class
        class_data = X[(y == minority_class), :]
        idx = np.random.choice(class_data.shape[0], (N_res,))
        sampled_min_data = class_data[idx, :]

        if N_syn > 0:
            # generate N_syn synthetic minority examples with SMOTE
            N_smote = np.ceil(N_syn / minority_count) * 100
            N_smote = 100 if N_smote < 100 else int(N_smote - N_smote % 100)
            synthetic = smote(X[y == minority_class], N=int(N_smote),
                              k=self.k)
            idx = np.random.choice(synthetic.shape[0], (N_syn,))
            new_class_data = np.concatenate(
                (sampled_min_data, synthetic[idx, :]))
            data = np.concatenate((data, new_class_data))
            target = np.concatenate(
                (target,
                 minority_class * np.ones((new_class_data.shape[0],))))
        else:
            data = np.concatenate((data, sampled_min_data))
            target = np.concatenate(
                (target,
                 minority_class * np.ones((sampled_min_data.shape[0],))))

        return data, target
class ICSBagging(PoolGenerator):

    def __init__(self,
                 K=10,
                 alpha=0.75,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote',
                 diversity_metric='e',
                 positive_label=1):

        self.K = K
        self.alpha = alpha
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.combination_rule = combination_rule
        self.positive_label = positive_label

        self.classifiers = None
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

        self.diversity_metric = diversity_metric
        self.diversity = Diversity(metric=diversity_metric)

        self.validation_X = None
        self.validation_y = None

    def set_validation(self, X, y):
        self.validation_X = X
        self.validation_y = y

    def fitness(self, classifier):
        '''
        # TODO normalize diversity metric.
        '''
        # temporarily add the candidate classifier to the ensemble and
        # score the combined AUC and diversity
        self.ensemble.add(classifier)

        out = self.ensemble.output(self.validation_X)
        y_pred = self.combiner.combine(out)
        y_true = self.validation_y

        auc = evaluation.auc_score(y_true, y_pred)
        div = self.diversity.calculate(self.ensemble, self.validation_X,
                                       self.validation_y)
        # diversity = entropy_measure_e(self.ensemble,
        #                               self.validation_X, self.validation_y)

        self.ensemble.classifiers.pop()

        return self.alpha * auc + (1.0 - self.alpha) * div

    def _calc_pos_prob(self):
        y_pred = self.combiner.combine(
            self.ensemble.output(self.validation_X))
        mask = self.positive_label == self.validation_y

        pos_acc = float(sum(y_pred[mask] == self.validation_y[mask])) / len(
            self.validation_y[mask])
        neg_acc = float(sum(y_pred[~mask] == self.validation_y[~mask])) / len(
            self.validation_y[~mask])

        return 1.0 - (pos_acc / (pos_acc + neg_acc))

    def bootstrap_classifiers(self, X, y, K, pos_prob):
        mask = self.positive_label == y
        negative_label = y[~mask][0]

        clfs = []
        sets_cX, sets_cy = [], []
        for i in range(K):
            cX, cy = [], []

            for j in range(X.shape[0]):
                # randint(n) draws from [0, n), replacing the deprecated
                # np.random.random_integers(0, n - 1)
                if np.random.random() < pos_prob:
                    idx = np.random.randint(len(X[mask]))
                    cX = cX + [X[mask][idx]]
                    cy = cy + [self.positive_label]
                else:
                    idx = np.random.randint(len(X[~mask]))
                    cX = cX + [X[~mask][idx]]
                    cy = cy + [negative_label]

            # make sure both classes appear at least once in the sample
            if self.positive_label not in cy:
                idx_1 = np.random.randint(len(cX))
                idx_2 = np.random.randint(len(X[mask]))
                cX[idx_1] = X[mask][idx_2]
                cy[idx_1] = self.positive_label

            elif negative_label not in cy:
                idx_1 = np.random.randint(len(cX))
                idx_2 = np.random.randint(len(X[~mask]))
                cX[idx_1] = X[~mask][idx_2]
                cy[idx_1] = negative_label

            sets_cX, sets_cy = sets_cX + [cX], sets_cy + [cy]
            clf = sklearn.base.clone(self.base_classifier)
            clfs = clfs + [clf.fit(cX, cy)]

        return clfs

    def fit(self, X, y):
        # if self.validation_X == None and self.validation_y == None:
        self.validation_X = X
        self.validation_y = y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for _ in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))

        self.validation_X = None
        self.validation_y = None

        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
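# Hedged usage sketch for ICSBagging on an imbalanced binary problem
# (illustrative only): it assumes 0/1 labels with positive_label=1, and the
# dataset and base classifier below are placeholder assumptions.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=400, weights=[0.9, 0.1], random_state=0)

pool = ICSBagging(K=5, alpha=0.75,
                  base_classifier=DecisionTreeClassifier(),
                  n_classifiers=10, positive_label=1)
pool.fit(X, y)
y_pred = pool.predict(X)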
class ICSBaggingNew(PoolGenerator):

    def __init__(self,
                 K=10,
                 alpha=0.75,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote',
                 diversity_metric='e',
                 positive_label=1):

        self.K = K
        self.alpha = alpha
        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.positive_label = positive_label

        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)
        self.diversity = Diversity(metric=diversity_metric)

        self.validation_X = None
        self.validation_y = None

    def set_validation(self, X, y):
        self.validation_X = X
        self.validation_y = y

    def fitness(self, classifier):
        '''
        # TODO normalize diversity metric.
        '''
        # temporarily add the candidate classifier and score the ensemble
        self.ensemble.add(classifier)

        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y

        auc = evaluation.auc_score(y_true, y_pred)
        div = self.diversity.calculate(self.ensemble, self.validation_X,
                                       y_true)

        self.ensemble.classifiers.pop()  # create interface for this later

        return self.alpha * auc + (1.0 - self.alpha) * div

    def _calc_pos_prob(self):
        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y

        # obtaining recall scores for each label (assuming the labels are
        # binary)
        pos_acc = recall_score(y_true, y_pred, average='binary',
                               pos_label=self.positive_label)
        neg_acc = recall_score(y_true, y_pred, average='binary',
                               pos_label=int(not self.positive_label))

        return neg_acc / (pos_acc + neg_acc)

    def bootstrap_classifiers(self, X, y, K, pos_prob):
        pos_idx = (y == self.positive_label)
        neg_idx = (y == int(not self.positive_label))

        X_pos, _ = X[pos_idx, :], y[pos_idx]  # positive examples
        X_neg, _ = X[neg_idx, :], y[neg_idx]  # negative examples

        classifiers = []

        for i in range(K):
            X_new = np.zeros(X.shape)
            y_new = np.zeros(y.shape)

            for j in range(X.shape[0]):
                if pos_prob > np.random.random():
                    # add a randomly chosen positive example
                    idx = np.random.randint(X_pos.shape[0])
                    X_new[j, :] = X_pos[idx, :]
                    y_new[j] = self.positive_label
                else:
                    # add a randomly chosen negative example
                    idx = np.random.randint(X_neg.shape[0])
                    X_new[j, :] = X_neg[idx, :]
                    y_new[j] = int(not self.positive_label)

            # if no positive example is present, make sure at least one is
            # inserted
            if not np.any(y_new == self.positive_label):
                # chosen spot for replacement on the new array
                idx_new = np.random.randint(X_new.shape[0])
                # chosen positive example index
                idx_pos = np.random.randint(X_pos.shape[0])
                X_new[idx_new, :] = X_pos[idx_pos, :]
                y_new[idx_new] = self.positive_label

            # if no negative example is present, make sure at least one is
            # inserted
            elif not np.any(y_new == int(not self.positive_label)):
                # chosen spot for replacement on the new array
                idx_new = np.random.randint(X_new.shape[0])
                # chosen negative example index
                idx_neg = np.random.randint(X_neg.shape[0])
                X_new[idx_new, :] = X_neg[idx_neg, :]
                y_new[idx_new] = int(not self.positive_label)

            # train classifier with the bootstrapped data
            clf = sklearn.base.clone(self.base_classifier)
            clf.fit(X_new, y_new)
            classifiers.append(clf)

        return classifiers

    def fit(self, X, y):
        # if self.validation_X == None and self.validation_y == None:
        self.validation_X = X
        self.validation_y = y

        self.classes_ = set(y)
        self.ensemble = Ensemble()

        clfs = self.bootstrap_classifiers(X, y, self.K, 0.5)
        self.ensemble.add(np.random.choice(clfs))

        for i in range(1, self.n_classifiers):
            clfs = self.bootstrap_classifiers(X, y, self.K,
                                              self._calc_pos_prob())
            self.ensemble.add(max(clfs, key=lambda clf: self.fitness(clf)))

        self.validation_X = None
        self.validation_y = None

        return self

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
class SmoteBagging(PoolGenerator):

    def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote',
                 k=5):

        # self.b = b
        self.k = k
        self.n_classifiers = n_classifiers
        self.base_classifier = base_classifier
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule)

    def smote_bootstrap_sample(self, X, y, b, k):
        classes = np.unique(y)
        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()  # majority class
        majority_count = count.max()     # majority class count

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        for i in classes:
            class_data = X[(y == i), :]

            if i == majority_class:
                # majority class: regular bootstrap (i.e. 100% sampling rate)
                idx = np.random.choice(majority_count, (majority_count,))
                data = np.concatenate((data, class_data[idx, :]))
                target = np.concatenate(
                    (target, i * np.ones((majority_count,))))

            else:
                # minority classes: bootstrap the class data with the
                # defined sampling rate
                sample_rate = (majority_count / class_data.shape[0]) * (b / 100)
                idx = np.random.choice(
                    class_data.shape[0],
                    (int(sample_rate * class_data.shape[0]),))
                sampled_class_data = class_data[idx, :]

                # run SMOTE on the bootstrapped data to obtain synthetic
                # samples; ceil to make sure N_smote is a multiple of 100,
                # and add a small value to avoid a zero
                N_smote = int(np.ceil(
                    (majority_count / sampled_class_data.shape[0]) *
                    (1 - b / 100 + 10e-8)) * 100)

                synthetic = smote(sampled_class_data, N=N_smote, k=self.k)

                # add synthetic samples to the sampled class data
                n_missing = majority_count - sampled_class_data.shape[0]
                idx = np.random.choice(synthetic.shape[0], (n_missing,))
                new_class_data = np.concatenate(
                    (sampled_class_data, synthetic[idx, :]))

                data = np.concatenate((data, new_class_data))
                target = np.concatenate(
                    (target, i * np.ones((new_class_data.shape[0],))))

        return data, target

    def fit(self, X, y):
        self.ensemble = Ensemble()

        # this parameter should change between [10, 100] in increments of
        # 10, for every classifier in the ensemble
        b = 10

        for i in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(
                X, y, b=float(b), k=self.k)

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

            if b >= 100:
                b = 10
            else:
                b += 10

        return

    def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
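# Hedged usage sketch for SmoteBagging (illustrative only): the SMOTE
# resampling rate b cycles over 10, 20, ..., 100 internally, one value per
# classifier. The dataset and base classifier below are placeholder
# assumptions.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, weights=[0.8, 0.2], random_state=0)

pool = SmoteBagging(base_classifier=DecisionTreeClassifier(),
                    n_classifiers=10, k=5)
pool.fit(X, y)
y_pred = pool.predict(X)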