def vc():
    # gbc_test(load_new=True)
    # gbc_test(datatable=second_layer(datatable=first_layer()))
    from sklearn.ensemble import VotingClassifier

    print('starting')
    main = pd.read_csv('data/submission/test-worked.csv', header=0, low_memory=False)
    main.fillna(0, inplace=True)
    # Load a pre-trained gradient boosting classifier from disk.
    with open('gbc_training_i36.pkl', 'rb') as f:
        gbc_36 = pickle.load(f)
    # VotingClassifier expects a list of (name, estimator) tuples.
    estimators = [('gbc_36', gbc_36)]
    vc = VotingClassifier(estimators, voting='soft', n_jobs=-1)
    # Inject the already-fitted estimator so no call to fit() is needed.
    vc.estimators_ = [gbc_36]
    X, y = main.drop(['click_id', 'click_time'], axis=1), main['click_id']
    print('split')
    # vc.fit(X_train, y_train)
    y = pd.DataFrame(y)
    print('predicting')
    y['is_attributed'] = gbc_36.predict(X)
    y.to_csv('data/submission/submission-test.csv')
    return y
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle


def createVotingClassifier(n_trees, X, y, depth, min_samples=2, max_feat=0.2, overhead=2.0, voting_='soft'):
    # Each forest trains on a chunk of the data; overhead > 1 allows the
    # chunks to cover the dataset more than once.
    N_data = int(overhead * len(X) / n_trees)
    print(str(N_data) + ' samples will be used by each classifier')
    estimators_ = []
    estimators = []
    for i in range(n_trees):
        clf = RandomForestClassifier(max_depth=depth, min_samples_leaf=min_samples, max_features=max_feat)
        if (i + 1) * N_data < len(X):
            # Fit on the i-th contiguous chunk.
            clf.fit(X[i * N_data:(i + 1) * N_data], y[i * N_data:(i + 1) * N_data])
        else:
            # Not enough data left for a full chunk: reshuffle and reuse.
            X, y = shuffle(X, y)
            clf.fit(X[:N_data], y[:N_data])
        estimators_.append((str(i), clf))
        estimators.append(clf)
    tmp = VotingClassifier(estimators=estimators_, voting=voting_)
    # Inject the fitted estimators and the label bookkeeping so that tmp
    # can predict without ever calling tmp.fit().
    tmp.estimators_ = estimators
    tmp.le_ = LabelEncoder().fit(y)
    tmp.classes_ = tmp.le_.classes_
    return tmp
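# A hedged usage sketch for createVotingClassifier above. The dataset and
# hyperparameters are illustrative assumptions, not values from the
# original snippet.
from sklearn.datasets import make_classification

def _demo_create_voting_classifier():
    X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
    # 5 forests, each fitted on int(2.0 * 1000 / 5) = 400 samples.
    ensemble = createVotingClassifier(n_trees=5, X=X, y=y, depth=8)
    # No ensemble.fit() call: the fitted state was injected above.
    print(ensemble.predict(X[:10]))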
def voting_classifier(X, y):
    classifiers = [
        MyDummyClassifier(config=1, random_state=0).fit(X, y)
        for _ in range(5)
    ]
    # Build an unconfigured voter and wire in the pre-fitted estimators.
    vc = VotingClassifier(estimators=None, voting='soft')
    vc.estimators_ = classifiers
    return vc
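# MyDummyClassifier above is project-specific and not defined in this
# collection. A hypothetical minimal stand-in (an assumption, only so the
# snippet can be exercised) could wrap sklearn's DummyClassifier:
from sklearn.dummy import DummyClassifier

class MyDummyClassifier(DummyClassifier):  # hypothetical stand-in
    def __init__(self, config=None, random_state=None):
        self.config = config  # kept only to mirror the call signature
        super().__init__(strategy='prior', random_state=random_state)

# Note: the voter returned by voting_classifier supports predict_proba;
# predict would additionally need le_ / classes_ (see make_voter below).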
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder


def make_voter(estimators, y, voting='hard'):
    estimators = list(estimators.items())
    clf = VotingClassifier(estimators, voting=voting)
    # Mark the voter as fitted: inject the pre-fitted estimators and the
    # label encoder that predict() relies on.
    clf.estimators_ = [estim for name, estim in estimators]
    clf.le_ = LabelEncoder()
    clf.le_.fit(y)
    clf.classes_ = clf.le_.classes_
    return clf
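# A minimal sketch of calling make_voter with two pre-fitted models; the
# dataset and estimator choices are assumptions for illustration.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

def _demo_make_voter():
    X, y = load_iris(return_X_y=True)
    fitted = {
        'logreg': LogisticRegression(max_iter=1000).fit(X, y),
        'tree': DecisionTreeClassifier(random_state=0).fit(X, y),
    }
    # Hard majority vote over the pre-fitted models; no voter.fit() needed.
    voter = make_voter(fitted, y, voting='hard')
    print(voter.predict(X[:5]))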
def _oos_eval(self, clfs, func, meta=False, *args, **kwargs):
    # If we're in the meta case, just call this several times regularly
    if meta:
        oos = []
        # Jackknife for proportionally fewer cases in meta eval
        for _ in range(int(np.ceil(self.n_jack * self.n_oos))):
            tmpclf, tmpoos = self._oos_eval(clfs, func, meta=False, *args, **kwargs)
            clf = tmpclf
            oos += [tmpoos]
            del tmpoos
        return clf, oos

    # Generate test / oos data
    oos = {}
    Xo, yo, grpo = self._prep_data(self.dat_t, self.tar_t, self.sam_t, func, *args, **kwargs)

    # Aggregate classifiers across folds and pre-load training.
    # Estimator names must be strings for sklearn's bookkeeping.
    clf = VotingClassifier(voting='soft', estimators=[(str(i), c) for i, c in enumerate(clfs)])
    clf.estimators_ = clfs
    clf.le_ = LabelEncoder().fit(yo)
    clf.classes_ = clf.le_.classes_

    # Evaluate voting classifier on test data
    pred = clf.predict(Xo)
    oos['true'] = yo
    oos['pred'] = pred
    oos['acc'] = accuracy_score(yo, pred)
    oos['f1'] = f1_score(yo, pred)

    # Compare to mean oos-performance of component classifiers
    comp_preds = [c.predict(Xo) for c in clfs]
    oos['comp_acc'] = np.mean([accuracy_score(yo, cp) for cp in comp_preds])
    oos['comp_f1'] = np.mean([f1_score(yo, cp) for cp in comp_preds])

    f1p, accp = self.performanceP(yo, oos['f1'], oos['acc'])
    oos['p_f1'] = f1p
    oos['p_acc'] = accp

    # Print performance
    if self.verbose:
        print("Y: ", pred, "->", yo)
        print("G: ", grpo)
        print("Test Accuracy: {0} (p <= {1})".format(oos['acc'], accp))
        print("Test F1: {0} (p <= {1})".format(oos['f1'], f1p))

    return clf, oos
def fit_voting(self):
    voting = 'soft'
    names = [
        # 'svm(word_n_grams,char_n_grams,all_caps,hashtags,punctuations,punctuation_last,emoticons,emoticon_last,'
        # 'elongated,negation_count)',
        # 'logreg(w2v_doc)',
        # 'logreg(w2v_word_avg_google)',
        'word2vec_bayes',
        'cnn_word(embedding=google)',
        'rnn_word(embedding=google)',
    ]
    classifiers = [ExternalModel({
        self.val_docs: os.path.join(self.data_dir, 'results/val/{}.json'.format(name)),
        self.test_docs: os.path.join(self.data_dir, 'results/test/{}.json'.format(name)),
    }) for name in names]
    all_scores = []
    for classifier in classifiers:
        scores = classifier.predict_proba(self.val_docs)
        if voting == 'hard':
            scores = Binarizer(threshold=1 / 3).transform(scores)
        all_scores.append(scores)
    all_scores = np.array(all_scores)
    all_scores_first, all_scores_rest = all_scores[0], all_scores[1:]
    le = LabelEncoder().fit(self.classes_)
    val_label_indexes = le.transform(self.val_labels())
    # assume w_0 = 1 as w is invariant to scaling
    w = basinhopping(
        lambda w_: -(val_label_indexes == np.argmax((
            all_scores_first + all_scores_rest * w_.reshape((len(w_), 1, 1))
        ).sum(axis=0), axis=1)).sum(),
        np.ones(len(classifiers) - 1), niter=1000,
        minimizer_kwargs=dict(method='L-BFGS-B', bounds=[(0, None)] * (len(classifiers) - 1))
    ).x
    w = np.hstack([[1], w])
    w /= w.sum()
    logging.info('w: {}'.format(w))
    estimator = VotingClassifier(list(zip(names, classifiers)), voting=voting, weights=w)
    estimator.le_ = le
    estimator.estimators_ = classifiers
    return 'vote({})'.format(','.join(names)), estimator
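# fit_voting above searches the soft-voting weights with scipy's
# basinhopping, fixing w_0 = 1 because the argmax is invariant to scaling.
# A compact sketch of that objective on synthetic probability arrays (all
# shapes and data here are illustrative assumptions):
import numpy as np
from scipy.optimize import basinhopping

def _demo_weight_search():
    rng = np.random.default_rng(0)
    n_models, n_docs, n_classes = 3, 50, 2
    probas = rng.dirichlet(np.ones(n_classes), size=(n_models, n_docs))
    labels = rng.integers(0, n_classes, size=n_docs)

    def neg_accuracy(w_):
        w = np.hstack([[1.0], w_])  # first weight fixed at 1
        combined = (probas * w.reshape(-1, 1, 1)).sum(axis=0)
        return -float((labels == np.argmax(combined, axis=1)).sum())

    res = basinhopping(neg_accuracy, np.ones(n_models - 1), niter=50,
                       minimizer_kwargs=dict(method='L-BFGS-B',
                                             bounds=[(0, None)] * (n_models - 1)))
    w = np.hstack([[1.0], res.x])
    print('normalized weights:', w / w.sum())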
def file_output(self, Y_optimization_pred, Y_valid_pred, Y_test_pred):
    # Abort if self.Y_optimization is None
    # self.Y_optimization can be None if we use partial-cv;
    # then, obviously, no output should be saved.
    if self.Y_optimization is None:
        return None, {}

    # Abort in case of shape misalignment
    if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
        return (
            1.0,
            {
                'error':
                    "Targets %s and prediction %s don't have "
                    "the same length. Probably training didn't "
                    "finish" % (self.Y_optimization.shape, Y_optimization_pred.shape)
            },
        )

    # Abort if predictions contain NaNs
    for y, s in [
        # Y_train_pred deleted here. Fix unittest accordingly.
        [Y_optimization_pred, 'optimization'],
        [Y_valid_pred, 'validation'],
        [Y_test_pred, 'test']
    ]:
        if y is not None and not np.all(np.isfinite(y)):
            return (
                1.0,
                {'error': 'Model predictions for %s set contain NaNs.' % s},
            )

    # Abort if we don't want to output anything.
    # Since disable_file_output can also be a list, we have to explicitly
    # compare it with True.
    if self.disable_file_output is True:
        return None, {}

    # Notice that disable_file_output==False and disable_file_output==[]
    # mean the same thing here.
    if self.disable_file_output is False:
        self.disable_file_output = []

    # This file can be written independently of the others down below
    if 'y_optimization' not in self.disable_file_output:
        if self.output_y_hat_optimization:
            self.backend.save_targets_ensemble(self.Y_optimization)

    if hasattr(self, 'models') and len(self.models) > 0 and self.models[0] is not None:
        if 'models' not in self.disable_file_output:
            # Wrap the cross-validation fold models in an unfitted voting
            # ensemble whose estimators_ are assigned directly.
            if self.task_type in CLASSIFICATION_TASKS:
                models = VotingClassifier(estimators=None, voting='soft')
            else:
                models = VotingRegressor(estimators=None)
            models.estimators_ = self.models
        else:
            models = None
    else:
        models = None

    self.backend.save_numrun_to_dir(
        seed=self.seed,
        idx=self.num_run,
        budget=self.budget,
        model=self.model if 'model' not in self.disable_file_output else None,
        cv_model=models if 'cv_model' not in self.disable_file_output else None,
        ensemble_predictions=(Y_optimization_pred if 'y_optimization'
                              not in self.disable_file_output else None),
        valid_predictions=(Y_valid_pred if 'y_valid'
                           not in self.disable_file_output else None),
        test_predictions=(Y_test_pred if 'y_test'
                          not in self.disable_file_output else None),
    )

    return None, {}
def file_output(self, Y_optimization_pred, Y_valid_pred, Y_test_pred):
    # Abort if self.Y_optimization is None
    # self.Y_optimization can be None if we use partial-cv;
    # then, obviously, no output should be saved.
    if self.Y_optimization is None:
        return None, {}

    # Abort in case of shape misalignment
    if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
        return (
            1.0,
            {
                'error':
                    "Targets %s and prediction %s don't have "
                    "the same length. Probably training didn't "
                    "finish" % (self.Y_optimization.shape, Y_optimization_pred.shape)
            },
        )

    # Abort if predictions contain NaNs
    for y, s in [
        # Y_train_pred deleted here. Fix unittest accordingly.
        [Y_optimization_pred, 'optimization'],
        [Y_valid_pred, 'validation'],
        [Y_test_pred, 'test']
    ]:
        if y is not None and not np.all(np.isfinite(y)):
            return (
                1.0,
                {'error': 'Model predictions for %s set contain NaNs.' % s},
            )

    # Abort if we don't want to output anything.
    # Since disable_file_output can also be a list, we have to explicitly
    # compare it with True.
    if self.disable_file_output is True:
        return None, {}

    # Notice that disable_file_output==False and disable_file_output==[]
    # mean the same thing here.
    if self.disable_file_output is False:
        self.disable_file_output = []

    # This file can be written independently of the others down below
    if 'y_optimization' not in self.disable_file_output:
        if self.output_y_hat_optimization:
            try:
                os.makedirs(self.backend.output_directory)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

    # The other four files have to be written together, meaning we start
    # writing them just after acquiring the locks for all of them.
    # But first we have to check which files have to be written.
    write_tasks = []

    # File 1 of 5: model
    if 'model' not in self.disable_file_output:
        if os.path.exists(self.backend.get_model_dir()):
            file_path = self.backend.get_model_path(
                self.seed, self.num_run, self.budget)
            write_tasks.append(
                WriteTask(lock=lockfile.LockFile(file_path),
                          writer=self.backend.save_model,
                          args=(self.model, file_path)))

    # File 2 of 5: predictions
    if 'y_optimization' not in self.disable_file_output:
        file_path = self.backend.get_prediction_output_path(
            'ensemble', self.seed, self.num_run, self.budget)
        write_tasks.append(
            WriteTask(lock=lockfile.LockFile(file_path),
                      writer=self.backend.save_predictions_as_npy,
                      args=(Y_optimization_pred, file_path)))

    # File 3 of 5: validation predictions
    if Y_valid_pred is not None:
        file_path = self.backend.get_prediction_output_path(
            'valid', self.seed, self.num_run, self.budget)
        write_tasks.append(
            WriteTask(lock=lockfile.LockFile(file_path),
                      writer=self.backend.save_predictions_as_npy,
                      args=(Y_valid_pred, file_path)))

    # File 4 of 5: test predictions
    if Y_test_pred is not None:
        file_path = self.backend.get_prediction_output_path(
            'test', self.seed, self.num_run, self.budget)
        write_tasks.append(
            WriteTask(lock=lockfile.LockFile(file_path),
                      writer=self.backend.save_predictions_as_npy,
                      args=(Y_test_pred, file_path)))

    # File 5 of 5: ensemble of models in case of cross-validation
    if hasattr(self, 'models') and len(self.models) > 0 and self.models[0] is not None:
        if 'models' not in self.disable_file_output:
            # Wrap the fold models in an unfitted voting ensemble whose
            # estimators_ are assigned directly.
            if self.task_type in CLASSIFICATION_TASKS:
                models = VotingClassifier(estimators=None, voting='soft')
            else:
                models = VotingRegressor(estimators=None)
            models.estimators_ = self.models
            if os.path.exists(self.backend.get_cv_model_dir()):
                file_path = self.backend.get_cv_model_path(
                    self.seed, self.num_run, self.budget)
                write_tasks.append(
                    WriteTask(lock=lockfile.LockFile(file_path),
                              writer=self.backend.save_model,
                              args=(models, file_path)))

    # We then acquire the locks one by one in a stubborn fashion, i.e. if a
    # file is already locked, we keep probing it until it is unlocked. This
    # will NOT create a race condition with _delete_non_candidate_models(),
    # since that function doesn't acquire the locks in this stubborn way: it
    # releases all its locks and aborts the acquisition process as soon as
    # it finds a locked file.
    for wt in write_tasks:
        while True:
            try:
                wt.lock.acquire()
                break
            except lockfile.AlreadyLocked:
                time.sleep(.1)
                continue
            except Exception as e:
                raise RuntimeError('Failed to lock %s due to %s' % (wt.lock, e))

    # At this point we are good to write the files
    for wt in write_tasks:
        wt.writer(*wt.args)

    # And finally release the locks
    for wt in write_tasks:
        wt.lock.release()

    return None, {}
def file_output(self, Y_optimization_pred: np.ndarray, Y_valid_pred: np.ndarray,
                Y_test_pred: np.ndarray) -> Tuple[Optional[float], Dict]:
    """
    This method decides what file outputs are written to disk.

    It is also the interface to the backend's save_numrun_to_dir, which
    stores all the pipeline-related information in a single directory for
    easy identification of the current run.

    Args:
        Y_optimization_pred (np.ndarray):
            The pipeline predictions on the validation set internally
            created from self.y_train
        Y_valid_pred (np.ndarray):
            The pipeline predictions on the user-provided validation set,
            which should match self.y_valid
        Y_test_pred (np.ndarray):
            The pipeline predictions on the user-provided test set,
            which should match self.y_test

    Returns:
        loss (Optional[float]):
            A loss in case the run failed to store files to disk
        error_dict (Dict):
            A dictionary with an error that explains why a run was not
            successfully stored to disk.
    """
    # Abort if self.Y_optimization is None
    # self.Y_optimization can be None if we use partial-cv;
    # then, obviously, no output should be saved.
    if self.Y_optimization is None:
        return None, {}

    # Abort in case of shape misalignment
    if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
        return (
            1.0,
            {
                'error':
                    "Targets %s and prediction %s don't have "
                    "the same length. Probably training didn't "
                    "finish" % (self.Y_optimization.shape, Y_optimization_pred.shape)
            },
        )

    # Abort if predictions contain NaNs
    for y, s in [
        # Y_train_pred deleted here. Fix unittest accordingly.
        [Y_optimization_pred, 'optimization'],
        [Y_valid_pred, 'validation'],
        [Y_test_pred, 'test']
    ]:
        if y is not None and not np.all(np.isfinite(y)):
            return (
                1.0,
                {'error': 'Model predictions for %s set contain NaNs.' % s},
            )

    # Abort if we don't want to output anything.
    if hasattr(self, 'disable_file_output'):
        if self.disable_file_output:
            return None, {}
        else:
            self.disabled_file_outputs = []

    # This file can be written independently of the others down below
    if 'y_optimization' not in self.disabled_file_outputs:
        if self.output_y_hat_optimization:
            self.backend.save_targets_ensemble(self.Y_optimization)

    if hasattr(self, 'pipelines') and self.pipelines is not None:
        # Check the length before indexing into the list.
        if len(self.pipelines) > 0 and self.pipelines[0] is not None:
            if 'pipelines' not in self.disabled_file_outputs:
                # Wrap the cross-validation fold pipelines in an unfitted
                # voting ensemble whose estimators_ are assigned directly.
                if self.task_type in CLASSIFICATION_TASKS:
                    pipelines = VotingClassifier(estimators=None, voting='soft')
                else:
                    pipelines = VotingRegressorWrapper(estimators=None)
                pipelines.estimators_ = self.pipelines
            else:
                pipelines = None
        else:
            pipelines = None
    else:
        pipelines = None

    if hasattr(self, 'pipeline') and self.pipeline is not None:
        if 'pipeline' not in self.disabled_file_outputs:
            pipeline = self.pipeline
        else:
            pipeline = None
    else:
        pipeline = None

    self.logger.debug("Saving directory {}, {}, {}".format(self.seed, self.num_run, self.budget))
    self.backend.save_numrun_to_dir(
        seed=int(self.seed),
        idx=int(self.num_run),
        budget=float(self.budget),
        model=pipeline,
        cv_model=pipelines,
        ensemble_predictions=(Y_optimization_pred if 'y_optimization'
                              not in self.disabled_file_outputs else None),
        valid_predictions=(Y_valid_pred if 'y_valid'
                           not in self.disabled_file_outputs else None),
        test_predictions=(Y_test_pred if 'y_test'
                          not in self.disabled_file_outputs else None),
    )

    return None, {}
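# All three file_output variants above persist the cross-validation fold
# models by building a voting ensemble with estimators=None and assigning
# estimators_ by hand (construction does no validation; that only happens
# in fit). A self-contained sketch of the trick, with assumed data and
# models rather than the framework internals:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

def _demo_cv_ensemble():
    X, y = make_classification(n_samples=300, random_state=0)
    fold_models = [
        RandomForestClassifier(n_estimators=10, random_state=0).fit(X[train], y[train])
        for train, _ in KFold(n_splits=3).split(X)
    ]
    ensemble = VotingClassifier(estimators=None, voting='soft')
    ensemble.estimators_ = fold_models
    # predict() additionally needs the label bookkeeping.
    ensemble.le_ = LabelEncoder().fit(y)
    ensemble.classes_ = ensemble.le_.classes_
    print(ensemble.predict(X[:5]))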
def fit(self):
    clf_list = []

    # # KNN
    # print("KNN")
    # knn = KNeighborsClassifier(n_neighbors=35, weights='distance', leaf_size=2)
    # print("Fitting KNN")
    # knn.fit(self.X_train, self.y_train)
    # print('KNN {score}'.format(score=log_loss(self.y_test, knn.predict_proba(self.X_test))))
    # self.clfs['knn'] = knn
    # clf_list.append(knn)

    # Random forests
    print("Random forest on gini")
    rfc = RandomForestClassifier(n_estimators=43, criterion='gini', random_state=4141,
                                 n_jobs=-1, max_depth=21, max_features=0.12)
    print("Fitting random forest with gini")
    rfc.fit(self.X_train, self.y_train)
    print('RFC LogLoss {score}'.format(score=log_loss(self.y_test, rfc.predict_proba(self.X_test))))
    self.clfs['rfc'] = rfc
    clf_list.append(rfc)

    print("Random forest with entropy")
    rfc2 = RandomForestClassifier(n_estimators=80, criterion='entropy', random_state=1337,
                                  n_jobs=-1, max_depth=36, max_features=0.06)
    print("Fitting random forest with entropy")
    rfc2.fit(self.X_train, self.y_train)
    print('RFC2 LogLoss {score}'.format(score=log_loss(self.y_test, rfc2.predict_proba(self.X_test))))
    self.clfs['rfc2'] = rfc2
    clf_list.append(rfc2)

    # Logistic regression
    print("Logistic regression on logloss")
    logreg = LogisticRegression(C=1.05, penalty='l2')
    print("Fitting logistic regression")
    logreg.fit(self.X_train, self.y_train)
    print('LR LogLoss {score}'.format(score=log_loss(self.y_test, logreg.predict_proba(self.X_test))))
    self.clfs['lr'] = logreg
    clf_list.append(logreg)

    # # Gradient boosting
    # gbt1 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    # print("Fitting gradient boosting tree")
    # gbt1.fit(self.X_train, self.y_train)
    # print('Gbt1 LogLoss {score}'.format(score=log_loss(self.y_test, gbt1.predict_proba(self.X_test))))
    # self.clfs['gbt1'] = gbt1
    # clf_list.append(gbt1)

    # # Bad performance
    # # Multinomial Naive Bayes
    # print("Multinomial naive bayes")
    # mnb = MultinomialNB(fit_prior=False, alpha=0.25)
    # print("Fitting multinomial naive bayes")
    # mnb.fit(self.X_train, self.y_train)
    # print('MNB {score}'.format(score=log_loss(self.y_test, mnb.predict_proba(self.X_test))))
    # self.clfs['mnb'] = mnb
    # clf_list.append(mnb)

    # Adaboost
    print("Adaboost trees")
    abc = AdaBoostClassifier(n_estimators=100, learning_rate=0.5)
    print("Fitting Adaboost trees")
    abc.fit(self.X_train, self.y_train)
    print('ABC {score}'.format(score=log_loss(self.y_test, abc.predict_proba(self.X_test))))
    self.clfs['abc'] = abc
    clf_list.append(abc)

    # Ensemble the models
    eclf3 = VotingClassifier(estimators=[('lr', logreg), ('rf', rfc), ('rf2', rfc2), ('abc', abc)],
                             voting='soft', weights=[2, 2, 2, 1])
    # Inject the already-fitted models; the weights apply in this order.
    eclf3.estimators_ = clf_list
    print("Dig into the voting classifier")
    innerClfs = eclf3.estimators_
    print("Check estimators")
    print(innerClfs)
    print('Ensemble LogLoss {score}'.format(score=log_loss(self.y_test, eclf3.predict_proba(self.X_test))))
    self.ensembleClf = eclf3
    print("Ensemble fitting finished")
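# In the soft vote built by fit() above, predict_proba is just a weighted
# average of the component probabilities, aligned with the order of the
# injected estimators_. A small equivalence check (models and data are
# assumptions for illustration):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

def _demo_weighted_soft_vote():
    X, y = make_classification(random_state=0)
    clfs = [LogisticRegression(max_iter=1000).fit(X, y),
            DecisionTreeClassifier(random_state=0).fit(X, y)]
    vc = VotingClassifier(estimators=[('lr', clfs[0]), ('dt', clfs[1])],
                          voting='soft', weights=[2, 1])
    vc.estimators_ = clfs  # weights apply in this order
    manual = np.average(np.stack([c.predict_proba(X) for c in clfs]),
                        axis=0, weights=[2, 1])
    assert np.allclose(vc.predict_proba(X), manual)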