Example #1
import pickle

import pandas as pd


def vc():
    # gbc_test(load_new=True)
    #gbc_test(datatable=second_layer(datatable=first_layer()))
    from sklearn.ensemble import VotingClassifier
    from sklearn.preprocessing import LabelEncoder
    print('starting')
    main = pd.read_csv('data/submission/test-worked.csv',
                       header=0,
                       low_memory=False)
    # Fill missing values with 0 (scalar fill, so no axis argument is needed)
    main.fillna(0, inplace=True)

    with open('gbc_training_i36.pkl', 'rb') as f:
        gbc_36 = pickle.load(f)
    # VotingClassifier expects a list of (name, estimator) tuples
    estimators = [('gbc_36', gbc_36)]

    vc = VotingClassifier(estimators, voting='soft', n_jobs=-1)
    # Assign the already-fitted model directly so fit() never has to be called
    vc.estimators_ = [gbc_36]

    X, y = main.drop(['click_id', 'click_time'], axis=1), main['click_id']

    print('split')
    #vc.fit(X_train, y_train)

    y = pd.DataFrame(y)
    print('predicting')
    y['is_attributed'] = gbc_36.predict(X)

    y.to_csv('data/submission/submission-test.csv')
    return y
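Example #1 builds vc but then predicts with gbc_36 directly. As a minimal follow-up sketch (an assumption, not part of the original function: it presumes a binary is_attributed target), the prediction could also be routed through vc itself by setting the two extra attributes that the later examples set by hand, e.g. just before the return:

    # hypothetical addition inside vc(), not in the original source
    vc.le_ = LabelEncoder().fit([0, 1])   # assumed binary target
    vc.classes_ = vc.le_.classes_
    y['is_attributed'] = vc.predict(X)    # dispatches to the pre-fitted gbc_36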
Example #2
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.utils import shuffle


def createVotingClassifier(n_trees,
                           X,
                           y,
                           depth,
                           min_samples=2,
                           max_feat=0.2,
                           overhead=2.0,
                           voting_='soft'):
    # Each classifier is trained on N_data samples; overhead scales how much data each one sees
    N_data = int(overhead * len(X) / n_trees)
    print(str(N_data) + ' samples will be used by each classifier')
    estimators_ = []
    estimators = []
    for i in range(n_trees):
        clf = RandomForestClassifier(max_depth=depth,
                                     min_samples_leaf=min_samples,
                                     max_features=max_feat)
        if (i + 1) * N_data < len(X):
            clf.fit(X[i * N_data:(i + 1) * N_data],
                    y[i * N_data:(i + 1) * N_data])
        else:
            X, y = shuffle(X, y)
            clf.fit(X[:N_data], y[:N_data])
        estimators_.append((str(i), clf))
        estimators.append(clf)
    tmp = VotingClassifier(estimators=estimators_, voting=voting_)
    # The forests are already fitted, so assign them directly instead of calling fit()
    tmp.estimators_ = estimators
    return tmp
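A hedged usage sketch of the function above on synthetic data (the dataset and all names below are hypothetical, not from the original repository). Note that predict_proba only needs estimators_, while predict additionally needs le_ and classes_:

from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder

X_demo, y_demo = make_classification(n_samples=1000, n_features=10, random_state=0)
ensemble = createVotingClassifier(n_trees=5, X=X_demo, y=y_demo, depth=4)
# predict() maps encoded votes back to labels, so set le_/classes_ by hand
ensemble.le_ = LabelEncoder().fit(y_demo)
ensemble.classes_ = ensemble.le_.classes_
print(ensemble.predict(X_demo[:3]))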
Example #3
def voting_classifier(X, y):
    # MyDummyClassifier is defined elsewhere in the originating project
    classifiers = [
        MyDummyClassifier(config=1, random_state=0).fit(X, y)
        for _ in range(5)
    ]
    vc = VotingClassifier(estimators=None, voting='soft')
    vc.estimators_ = classifiers
    return vc
Example #4
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder


def make_voter(estimators, y, voting='hard'):
    estimators = list(estimators.items())
    clf = VotingClassifier(estimators, voting=voting)
    # Reuse the already-fitted estimators and set the label encoder manually,
    # so the ensemble can predict without ever calling fit()
    clf.estimators_ = [estim for name, estim in estimators]
    clf.le_ = LabelEncoder()
    clf.le_.fit(y)
    clf.classes_ = clf.le_.classes_
    return clf
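A possible usage sketch (the two base estimators and the toy data are assumptions for illustration): make_voter takes a dict of already-fitted estimators, so no further training happens inside it:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
fitted = {
    'logreg': LogisticRegression(max_iter=1000).fit(X_demo, y_demo),
    'tree': DecisionTreeClassifier(max_depth=3).fit(X_demo, y_demo),
}
voter = make_voter(fitted, y_demo, voting='hard')
print(voter.predict(X_demo[:5]))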
Example #5
    def _oos_eval(self, clfs, func, meta=False, *args, **kwargs):
        # If we're in the meta case, just call this several times regularly
        if meta:
            oos = []
            # Jackknife for proportionally fewer cases in meta eval
            for _ in range(int(np.ceil(self.n_jack*self.n_oos))):
                tmpclf, tmpoos = self._oos_eval(clfs, func, meta=False,
                                                *args, **kwargs)
                clf = tmpclf
                oos += [tmpoos]
                del tmpoos
            return clf, oos

        # Generate test / oos data
        oos = {}
        Xo, yo, grpo = self._prep_data(self.dat_t, self.tar_t, self.sam_t,
                                       func, *args, **kwargs)

        # Aggregate classifiers across folds and pre-load training
        clf = VotingClassifier(voting='soft',
                               estimators=[(str(i), c) for i, c in enumerate(clfs)])
        clf.estimators_ = clfs
        clf.le_ = LabelEncoder().fit(yo)
        clf.classes_ = clf.le_.classes_

        # Evaluate voting classifier on test data
        pred = clf.predict(Xo)
        oos['true'] = yo
        oos['pred'] = pred
        oos['acc'] = accuracy_score(yo, pred)
        oos['f1'] = f1_score(yo, pred)
        # Compare to mean oos-performance of component classifiers
        comp_preds = [c.predict(Xo) for c in clfs]
        oos['comp_acc'] = np.mean([accuracy_score(yo, cp) for cp in comp_preds])
        oos['comp_f1'] = np.mean([f1_score(yo, cp) for cp in comp_preds])

        f1p, accp = self.performanceP(yo, oos['f1'], oos['acc'])
        oos['p_f1'] = f1p
        oos['p_acc'] = accp
        # Print performance
        if self.verbose:
            print("Y: ", pred, "->", yo)
            print("G: ", grpo)
            print("Test Accuracy: {0} (p <= {1})".format(oos['acc'], accp))
            print("Test F1: {0} (p<= {1})".format(oos['f1'], f1p))

        return clf, oos
Example #7
 def fit_voting(self):
     voting = 'soft'
     names = [
         # 'svm(word_n_grams,char_n_grams,all_caps,hashtags,punctuations,punctuation_last,emoticons,emoticon_last,'
         # 'elongated,negation_count)',
         # 'logreg(w2v_doc)',
         # 'logreg(w2v_word_avg_google)',
         'word2vec_bayes',
         'cnn_word(embedding=google)',
         'rnn_word(embedding=google)',
     ]
     classifiers = [ExternalModel({
         self.val_docs: os.path.join(self.data_dir, 'results/val/{}.json'.format(name)),
         self.test_docs: os.path.join(self.data_dir, 'results/test/{}.json'.format(name)),
     }) for name in names]
     all_scores = []
     for classifier in classifiers:
         scores = classifier.predict_proba(self.val_docs)
         if voting == 'hard':
             scores = Binarizer(threshold=1 / 3).transform(scores)
         all_scores.append(scores)
     all_scores = np.array(all_scores)
     all_scores_first, all_scores_rest = all_scores[0], all_scores[1:]
     le = LabelEncoder().fit(self.classes_)
     val_label_indexes = le.transform(self.val_labels())
     # assume w_0=1 as w is invariant to scaling
     w = basinhopping(
         lambda w_: -(val_label_indexes == np.argmax((
             all_scores_first + all_scores_rest * w_.reshape((len(w_), 1, 1))
         ).sum(axis=0), axis=1)).sum(), np.ones(len(classifiers) - 1), niter=1000,
         minimizer_kwargs=dict(method='L-BFGS-B', bounds=[(0, None)] * (len(classifiers) - 1))
     ).x
     w = np.hstack([[1], w])
     w /= w.sum()
     logging.info('w: {}'.format(w))
     estimator = VotingClassifier(list(zip(names, classifiers)), voting=voting, weights=w)
     estimator.le_ = le
     estimator.estimators_ = classifiers
     return 'vote({})'.format(','.join(names)), estimator
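A reduced sketch of the weight search used in fit_voting above, on synthetic score arrays (the shapes and data are assumptions; the objective mirrors the original: maximize the number of correct validation predictions of the weighted vote, with w_0 fixed at 1 since the argmax is invariant to scaling):

import numpy as np
from scipy.optimize import basinhopping

rng = np.random.RandomState(0)
all_scores = rng.rand(3, 50, 4)        # (n_classifiers, n_samples, n_classes)
labels = rng.randint(0, 4, size=50)    # encoded true labels
first, rest = all_scores[0], all_scores[1:]

def neg_accuracy(w_):
    combined = (first + rest * w_.reshape((len(w_), 1, 1))).sum(axis=0)
    return -(labels == np.argmax(combined, axis=1)).sum()

w = basinhopping(neg_accuracy, np.ones(len(all_scores) - 1), niter=100,
                 minimizer_kwargs=dict(method='L-BFGS-B',
                                       bounds=[(0, None)] * (len(all_scores) - 1))).x
w = np.hstack([[1], w])
w /= w.sum()
print(w)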
Example #8
    def file_output(self, Y_optimization_pred, Y_valid_pred, Y_test_pred):
        # Abort if self.Y_optimization is None
        # self.Y_optimization can be None if we use partial-cv, then,
        # obviously no output should be saved.
        if self.Y_optimization is None:
            return None, {}

        # Abort in case of shape misalignment
        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return (
                1.0,
                {
                    'error':
                    "Targets %s and prediction %s don't have "
                    "the same length. Probably training didn't "
                    "finish" %
                    (self.Y_optimization.shape, Y_optimization_pred.shape)
                },
            )

        # Abort if predictions contain NaNs
        for y, s in [
                # Y_train_pred deleted here. Fix unittest accordingly.
            [Y_optimization_pred, 'optimization'],
            [Y_valid_pred, 'validation'],
            [Y_test_pred, 'test']
        ]:
            if y is not None and not np.all(np.isfinite(y)):
                return (
                    1.0,
                    {
                        'error':
                        'Model predictions for %s set contains NaNs.' % s
                    },
                )

        # Abort if we don't want to output anything.
        # Since disable_file_output can also be a list, we have to explicitly
        # compare it with True.
        if self.disable_file_output is True:
            return None, {}

        # Notice that disable_file_output==False and disable_file_output==[]
        # means the same thing here.
        if self.disable_file_output is False:
            self.disable_file_output = []

        # This file can be written independently of the others down below
        if ('y_optimization' not in self.disable_file_output):
            if self.output_y_hat_optimization:
                self.backend.save_targets_ensemble(self.Y_optimization)

        if hasattr(self, 'models') and len(
                self.models) > 0 and self.models[0] is not None:
            if ('models' not in self.disable_file_output):

                if self.task_type in CLASSIFICATION_TASKS:
                    models = VotingClassifier(
                        estimators=None,
                        voting='soft',
                    )
                else:
                    models = VotingRegressor(estimators=None)
                models.estimators_ = self.models
            else:
                models = None
        else:
            models = None

        self.backend.save_numrun_to_dir(
            seed=self.seed,
            idx=self.num_run,
            budget=self.budget,
            model=self.model
            if 'model' not in self.disable_file_output else None,
            cv_model=models
            if 'cv_model' not in self.disable_file_output else None,
            ensemble_predictions=(Y_optimization_pred if 'y_optimization'
                                  not in self.disable_file_output else None),
            valid_predictions=(Y_valid_pred if 'y_valid'
                               not in self.disable_file_output else None),
            test_predictions=(Y_test_pred if 'y_test'
                              not in self.disable_file_output else None),
        )

        return None, {}
Example #9
    def file_output(self, Y_optimization_pred, Y_valid_pred, Y_test_pred):
        # Abort if self.Y_optimization is None
        # self.Y_optimization can be None if we use partial-cv, then,
        # obviously no output should be saved.
        if self.Y_optimization is None:
            return None, {}

        # Abort in case of shape misalignment
        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return (
                1.0,
                {
                    'error':
                    "Targets %s and prediction %s don't have "
                    "the same length. Probably training didn't "
                    "finish" %
                    (self.Y_optimization.shape, Y_optimization_pred.shape)
                },
            )

        # Abort if predictions contain NaNs
        for y, s in [
                # Y_train_pred deleted here. Fix unittest accordingly.
            [Y_optimization_pred, 'optimization'],
            [Y_valid_pred, 'validation'],
            [Y_test_pred, 'test']
        ]:
            if y is not None and not np.all(np.isfinite(y)):
                return (
                    1.0,
                    {
                        'error':
                        'Model predictions for %s set contains NaNs.' % s
                    },
                )

        # Abort if we don't want to output anything.
        # Since disable_file_output can also be a list, we have to explicitly
        # compare it with True.
        if self.disable_file_output is True:
            return None, {}

        # Notice that disable_file_output==False and disable_file_output==[]
        # means the same thing here.
        if self.disable_file_output is False:
            self.disable_file_output = []

        # This file can be written independently of the others down below
        if ('y_optimization' not in self.disable_file_output):
            if self.output_y_hat_optimization:
                try:
                    os.makedirs(self.backend.output_directory)
                except OSError:
                    pass
                self.backend.save_targets_ensemble(self.Y_optimization)

        # The other four files have to be written together, meaning we start
        # writing them just after acquiring the locks for all of them.
        # But first we have to check which files have to be written.
        write_tasks = []

        # File 1 of 5: model
        if ('model' not in self.disable_file_output):
            if os.path.exists(self.backend.get_model_dir()):
                file_path = self.backend.get_model_path(
                    self.seed, self.num_run, self.budget)
                write_tasks.append(
                    WriteTask(lock=lockfile.LockFile(file_path),
                              writer=self.backend.save_model,
                              args=(self.model, file_path)))

        # File 2 of 5: predictions
        if ('y_optimization' not in self.disable_file_output):
            file_path = self.backend.get_prediction_output_path(
                'ensemble', self.seed, self.num_run, self.budget)
            write_tasks.append(
                WriteTask(lock=lockfile.LockFile(file_path),
                          writer=self.backend.save_predictions_as_npy,
                          args=(Y_optimization_pred, file_path)))

        # File 3 of 5: validation predictions
        if Y_valid_pred is not None:
            file_path = self.backend.get_prediction_output_path(
                'valid', self.seed, self.num_run, self.budget)
            write_tasks.append(
                WriteTask(lock=lockfile.LockFile(file_path),
                          writer=self.backend.save_predictions_as_npy,
                          args=(Y_valid_pred, file_path)))

        # File 4 of 5: test predictions
        if Y_test_pred is not None:
            file_path = self.backend.get_prediction_output_path(
                'test', self.seed, self.num_run, self.budget)
            write_tasks.append(
                WriteTask(lock=lockfile.LockFile(file_path),
                          writer=self.backend.save_predictions_as_npy,
                          args=(Y_test_pred, file_path)))

        # File 5 of 5: ensemble of models in case of cross-validation
        if hasattr(self, 'models') and len(
                self.models) > 0 and self.models[0] is not None:
            if ('models' not in self.disable_file_output):

                if self.task_type in CLASSIFICATION_TASKS:
                    models = VotingClassifier(
                        estimators=None,
                        voting='soft',
                    )
                else:
                    models = VotingRegressor(estimators=None)
                models.estimators_ = self.models

                if os.path.exists(self.backend.get_cv_model_dir()):
                    file_path = self.backend.get_cv_model_path(
                        self.seed, self.num_run, self.budget)
                    write_tasks.append(
                        WriteTask(lock=lockfile.LockFile(file_path),
                                  writer=self.backend.save_model,
                                  args=(models, file_path)))

        # We then acquire the locks one by one in a stubborn fashion, i.e. if a file is
        # already locked, we keep probing it until it is unlocked. This will NOT create a
        # race condition with _delete_non_candidate_models() since this function doesn't
        # acquire the locks in this stubborn way. The delete function releases all the
        # locks and aborts the acquisition process as soon as it finds a locked file.
        for wt in write_tasks:
            while True:
                try:
                    wt.lock.acquire()
                    break
                except lockfile.AlreadyLocked:
                    time.sleep(.1)
                    continue
                except Exception as e:
                    raise RuntimeError('Failed to lock %s due to %s' %
                                       (wt.lock, e))

        # At this point we are good to write the files
        for wt in write_tasks:
            wt.writer(*wt.args)

        # And finally release the locks
        for wt in write_tasks:
            wt.lock.release()

        return None, {}
Example #10
    def file_output(self, Y_optimization_pred: np.ndarray,
                    Y_valid_pred: np.ndarray,
                    Y_test_pred: np.ndarray) -> Tuple[Optional[float], Dict]:
        """
        This method decides what file outputs are written to disk.

        It is also the interface to the backend's save_numrun_to_dir
        which stores all the pipeline related information to a single
        directory for easy identification of the current run.

        Args:
            Y_optimization_pred (np.ndarray):
                The pipeline predictions on the validation set internally created
                from self.y_train
            Y_valid_pred (np.ndarray):
                The pipeline predictions on the user provided validation set,
                which should match self.y_valid
            Y_test_pred (np.ndarray):
                The pipeline predictions on the user provided test set,
                which should match self.y_test
        Returns:
            loss (Optional[float]):
                A loss in case the run failed to store files to
                disk
            error_dict (Dict):
                A dictionary with an error that explains why a run
                was not successfully stored to disk.
        """
        # Abort if self.Y_optimization is None
        # self.Y_optimization can be None if we use partial-cv, then,
        # obviously no output should be saved.
        if self.Y_optimization is None:
            return None, {}

        # Abort in case of shape misalignment
        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return (
                1.0,
                {
                    'error':
                    "Targets %s and prediction %s don't have "
                    "the same length. Probably training didn't "
                    "finish" %
                    (self.Y_optimization.shape, Y_optimization_pred.shape)
                },
            )

        # Abort if predictions contain NaNs
        for y, s in [
                # Y_train_pred deleted here. Fix unittest accordingly.
            [Y_optimization_pred, 'optimization'],
            [Y_valid_pred, 'validation'],
            [Y_test_pred, 'test']
        ]:
            if y is not None and not np.all(np.isfinite(y)):
                return (
                    1.0,
                    {
                        'error':
                        'Model predictions for %s set contains NaNs.' % s
                    },
                )

        # Abort if we don't want to output anything.
        if hasattr(self, 'disable_file_output'):
            if self.disable_file_output:
                return None, {}
            else:
                self.disabled_file_outputs = []

        # This file can be written independently of the others down below
        if 'y_optimization' not in self.disabled_file_outputs:
            if self.output_y_hat_optimization:
                self.backend.save_targets_ensemble(self.Y_optimization)

        if hasattr(self, 'pipelines') and self.pipelines is not None:
            if len(self.pipelines) > 0 and self.pipelines[0] is not None:
                if 'pipelines' not in self.disabled_file_outputs:
                    if self.task_type in CLASSIFICATION_TASKS:
                        pipelines = VotingClassifier(
                            estimators=None,
                            voting='soft',
                        )
                    else:
                        pipelines = VotingRegressorWrapper(estimators=None)
                    pipelines.estimators_ = self.pipelines
                else:
                    pipelines = None
            else:
                pipelines = None
        else:
            pipelines = None

        if hasattr(self, 'pipeline') and self.pipeline is not None:
            if 'pipeline' not in self.disabled_file_outputs:
                pipeline = self.pipeline
            else:
                pipeline = None
        else:
            pipeline = None

        self.logger.debug("Saving directory {}, {}, {}".format(
            self.seed, self.num_run, self.budget))
        self.backend.save_numrun_to_dir(
            seed=int(self.seed),
            idx=int(self.num_run),
            budget=float(self.budget),
            model=pipeline,
            cv_model=pipelines,
            ensemble_predictions=(Y_optimization_pred if 'y_optimization'
                                  not in self.disabled_file_outputs else None),
            valid_predictions=(Y_valid_pred if 'y_valid'
                               not in self.disabled_file_outputs else None),
            test_predictions=(Y_test_pred if 'y_test'
                              not in self.disabled_file_outputs else None),
        )

        return None, {}
Example #11
    def fit(self):
        clf_list = []
        # # KNN
        # print "KNN"
        # knn = KNeighborsClassifier(n_neighbors=35, weights='distance', leaf_size=2)
        # print "Fitting KNN"
        # knn.fit(self.X_train, self.y_train)
        # print('KNN {score}'.format(score=log_loss(self.y_test, knn.predict_proba(self.X_test))))
        # self.clfs['knn'] = knn
        # clf_list.append(knn)
        # Random forests
        print "Random forest on gini"
        rfc = RandomForestClassifier(n_estimators=43,
                                     criterion='gini',
                                     random_state=4141,
                                     n_jobs=-1,
                                     max_depth=21,
                                     max_features=0.12)
        print "Fitting random forest with gini"
        rfc.fit(self.X_train, self.y_train)
        print('RFC LogLoss {score}'.format(score=log_loss(self.y_test, rfc.predict_proba(self.X_test))))
        self.clfs['rfc'] = rfc
        clf_list.append(rfc)
        print("Random forest with entropy")
        rfc2 = RandomForestClassifier(n_estimators=80,
                                      criterion='entropy',
                                      random_state=1337,
                                      n_jobs=-1,
                                      max_depth=36,
                                      max_features=0.06)
        print "Fitting random forest with entropy"
        rfc2.fit(self.X_train, self.y_train)
        print('RFC2 LogLoss {score}'.format(score=log_loss(self.y_test, rfc2.predict_proba(self.X_test))))
        self.clfs['rfc2']=rfc2
        clf_list.append(rfc2)
        # Logistic regression
        print "Logistic regression on logloss"
        logreg = LogisticRegression(C=1.05, penalty='l2')
        print "Fitting logistic regression"
        logreg.fit(self.X_train, self.y_train)
        print('LR LogLoss {score}'.format(score=log_loss(self.y_test, logreg.predict_proba(self.X_test))))
        self.clfs['lr']=logreg
        clf_list.append(logreg)

        # # gradient boosting
        # gbt1=GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth = 1, random_state = 0)
        # print "Fitting gradient boosting tree"
        # gbt1.fit(self.X_train, self.y_train)
        # print('Gbt1 LogLoss {score}'.format(score=log_loss(self.y_test, gbt1.predict_proba(self.X_test))))
        # self.clfs['gbt1']=gbt1
        # clf_list.append(gbt1)

        # # Bad performance
        # # Multinomial Naive Bayes
        # print "Multinomial naive bayes"
        # mnb = MultinomialNB(fit_prior=False,alpha=0.25)
        # print "Fitting multinomial naive bayes"
        # mnb.fit(self.X_train, self.y_train)
        # print('MNB {score}'.format(score=log_loss(self.y_test, mnb.predict_proba(self.X_test))))
        # self.clfs['mnb'] = mnb
        # clf_list.append(mnb)

        # Adaboost
        print "Adaboost trees"
        abc = AdaBoostClassifier(n_estimators=100,learning_rate=0.5)
        print "Fitting Adaboost trees"
        abc.fit(self.X_train, self.y_train)
        print('ABC {score}'.format(score=log_loss(self.y_test, abc.predict_proba(self.X_test))))
        self.clfs['abc'] = abc
        clf_list.append(abc)


        # Ensemble the models
        eclf3 = VotingClassifier(estimators=[('lr', logreg), ('rf', rfc), ('rf2', rfc2), ('abc', abc)], voting='soft',
                                 weights=[2, 2, 2, 1])
        # The component models are already fitted above, so assign them directly
        eclf3.estimators_ = clf_list
        print("Dig into the voting classifier")
        innerClfs = eclf3.estimators_
        print("Check estimators")
        print(innerClfs)
        print('Ensemble LogLoss {score}'.format(score=log_loss(self.y_test, eclf3.predict_proba(self.X_test))))
        self.ensembleClf = eclf3
        print("Ensemble fitting finished")