from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder


def make_voter(estimators, y, voting='hard'):
    """Build a VotingClassifier from already-fitted estimators without refitting."""
    estimators = list(estimators.items())
    clf = VotingClassifier(estimators, voting=voting)
    # Inject the pre-fitted estimators and the label encoding so that
    # predict() works without ever calling clf.fit().
    clf.estimators_ = [estim for name, estim in estimators]
    clf.le_ = LabelEncoder()
    clf.le_.fit(y)
    clf.classes_ = clf.le_.classes_
    return clf
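
# --- Usage sketch (illustrative; the data and estimators below are hypothetical,
# not from the original project). Shows make_voter combining two already-fitted
# models into a majority voter without refitting. Integer class labels keep hard
# voting's bincount happy, since the injected estimators predict raw labels.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, random_state=0)
fitted = {
    'lr': LogisticRegression(max_iter=1000).fit(X, y),
    'dt': DecisionTreeClassifier(random_state=0).fit(X, y),
}
voter = make_voter(fitted, y, voting='hard')
print(voter.predict(X[:5]))  # majority vote; voter.fit() is never called
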
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder


def _oos_eval(self, clfs, func, *args, meta=False, **kwargs):
    # In the meta case, repeat the regular evaluation several times.
    # (meta is keyword-only so it cannot collide with *args.)
    if meta:
        oos = []
        # Jackknife over proportionally fewer cases in the meta evaluation
        for _ in range(int(np.ceil(self.n_jack * self.n_oos))):
            clf, tmpoos = self._oos_eval(clfs, func, *args, meta=False, **kwargs)
            oos.append(tmpoos)
        return clf, oos

    # Generate test / out-of-sample data
    oos = {}
    Xo, yo, grpo = self._prep_data(self.dat_t, self.tar_t, self.sam_t,
                                   func, *args, **kwargs)

    # Aggregate the per-fold classifiers into a soft voter; assigning
    # estimators_, le_, and classes_ directly skips VotingClassifier.fit().
    clf = VotingClassifier(voting='soft',
                           estimators=[(str(i), c) for i, c in enumerate(clfs)])
    clf.estimators_ = clfs
    clf.le_ = LabelEncoder().fit(yo)
    clf.classes_ = clf.le_.classes_

    # Evaluate the voting classifier on the test data
    pred = clf.predict(Xo)
    oos['true'] = yo
    oos['pred'] = pred
    oos['acc'] = accuracy_score(yo, pred)
    oos['f1'] = f1_score(yo, pred)

    # Compare to the mean out-of-sample performance of the component classifiers
    comp_preds = [c.predict(Xo) for c in clfs]
    oos['comp_acc'] = np.mean([accuracy_score(yo, cp) for cp in comp_preds])
    oos['comp_f1'] = np.mean([f1_score(yo, cp) for cp in comp_preds])

    f1p, accp = self.performanceP(yo, oos['f1'], oos['acc'])
    oos['p_f1'] = f1p
    oos['p_acc'] = accp

    # Report performance
    if self.verbose:
        print("Y: ", pred, "->", yo)
        print("G: ", grpo)
        print("Test Accuracy: {0} (p <= {1})".format(oos['acc'], accp))
        print("Test F1: {0} (p <= {1})".format(oos['f1'], f1p))

    return clf, oos
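
# --- Standalone sketch of the aggregation trick used in _oos_eval ---
# Illustrative only: the data, fold split, and base model are hypothetical
# stand-ins for the surrounding pipeline's _prep_data and per-fold classifiers.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

X, y = make_classification(n_samples=200, random_state=0)
X_tr, y_tr, X_te, y_te = X[:150], y[:150], X[150:], y[150:]

# One fitted classifier per fold, as the pipeline would hand to _oos_eval.
fold_clfs = [LogisticRegression(max_iter=1000).fit(X_tr[idx], y_tr[idx])
             for idx, _ in KFold(n_splits=5).split(X_tr)]

# Same pattern as above: inject fitted estimators, never call fit().
clf = VotingClassifier(voting='soft',
                       estimators=[(str(i), c) for i, c in enumerate(fold_clfs)])
clf.estimators_ = fold_clfs
clf.le_ = LabelEncoder().fit(y_te)
clf.classes_ = clf.le_.classes_

pred = clf.predict(X_te)
print('acc:', accuracy_score(y_te, pred), 'f1:', f1_score(y_te, pred))
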
import logging
import os

import numpy as np
from scipy.optimize import basinhopping
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import Binarizer, LabelEncoder


def fit_voting(self):
    voting = 'soft'
    names = [
        # 'svm(word_n_grams,char_n_grams,all_caps,hashtags,punctuations,punctuation_last,emoticons,emoticon_last,'
        # 'elongated,negation_count)',
        # 'logreg(w2v_doc)',
        # 'logreg(w2v_word_avg_google)',
        'word2vec_bayes',
        'cnn_word(embedding=google)',
        'rnn_word(embedding=google)',
    ]
    # ExternalModel (project-specific) replays predictions dumped to JSON.
    classifiers = [ExternalModel({
        self.val_docs: os.path.join(self.data_dir, 'results/val/{}.json'.format(name)),
        self.test_docs: os.path.join(self.data_dir, 'results/test/{}.json'.format(name)),
    }) for name in names]
    all_scores = []
    for classifier in classifiers:
        scores = classifier.predict_proba(self.val_docs)
        if voting == 'hard':
            scores = Binarizer(threshold=1 / 3).transform(scores)
        all_scores.append(scores)
    all_scores = np.array(all_scores)
    all_scores_first, all_scores_rest = all_scores[0], all_scores[1:]
    le = LabelEncoder().fit(self.classes_)
    val_label_indexes = le.transform(self.val_labels())
    # Fix w_0 = 1: the argmax is invariant to scaling, so only the remaining
    # weights are searched. The objective is (negated) validation accuracy of
    # the weighted score sum; the sum over models runs over the weighted rest
    # only, and the first model's scores are added once.
    w = basinhopping(
        lambda w_: -(val_label_indexes == np.argmax(
            all_scores_first +
            (all_scores_rest * w_.reshape((len(w_), 1, 1))).sum(axis=0),
            axis=1
        )).sum(),
        np.ones(len(classifiers) - 1), niter=1000,
        minimizer_kwargs=dict(method='L-BFGS-B',
                              bounds=[(0, None)] * (len(classifiers) - 1))
    ).x
    w = np.hstack([[1], w])
    w /= w.sum()
    logging.info('w: {}'.format(w))
    estimator = VotingClassifier(list(zip(names, classifiers)),
                                 voting=voting, weights=w)
    # Inject the pre-fitted components so the voter is usable without fit().
    estimator.le_ = le
    estimator.classes_ = le.classes_
    estimator.estimators_ = classifiers
    return 'vote({})'.format(','.join(names)), estimator
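
# --- Sketch of the basinhopping weight search in isolation ---
# Illustrative, with synthetic score arrays standing in for the per-model
# validation scores above: finds nonnegative weights (w_0 pinned to 1, since
# argmax is scale-invariant) that maximize validation accuracy of the weighted
# score sum. The accuracy objective is piecewise constant, so L-BFGS-B alone
# would stall; basinhopping's random jumps do the actual exploring.
import numpy as np
from scipy.optimize import basinhopping

rng = np.random.default_rng(0)
n_models, n_docs, n_classes = 3, 50, 2
all_scores = rng.random((n_models, n_docs, n_classes))  # per-model class scores
val_label_indexes = rng.integers(0, n_classes, n_docs)  # encoded true labels

first, rest = all_scores[0], all_scores[1:]

def neg_accuracy(w_):
    combined = first + (rest * w_.reshape((len(w_), 1, 1))).sum(axis=0)
    return -float((val_label_indexes == np.argmax(combined, axis=1)).sum())

w = basinhopping(
    neg_accuracy, np.ones(n_models - 1), niter=100,
    minimizer_kwargs=dict(method='L-BFGS-B', bounds=[(0, None)] * (n_models - 1)),
).x
w = np.hstack([[1], w])
w /= w.sum()
print('weights:', w)
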