Example No. 1
def objective(args):
    max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args
    clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight,
                  row_subsample=row_subsample, min_loss_reduction=min_loss_reduction,
                  column_subsample=column_subsample, verbose=False)
    score = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    print('max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss')
    print(args, score)
    return score
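Objective functions like this one are meant to be driven by a hyperparameter optimizer; Example No. 5 below confirms the project uses hyperopt (note the `hp.quniform('max_depth', 2, 14, 1)` line there). A minimal sketch of the driving loop, assuming hyperopt; every bound except max_depth is an illustrative assumption, not the project's tuned range:

from hyperopt import fmin, tpe, hp

# Illustrative search space: max_depth mirrors Example No. 5; the other
# bounds are assumptions made up for this sketch
space = (
    hp.quniform('max_depth', 2, 14, 1),
    hp.uniform('min_child_weight', 1., 10.),
    hp.uniform('row_subsample', .3, 1.),
    hp.uniform('min_loss_reduction', .1, 3.),
    hp.uniform('column_subsample', .1, 1.),
)

# fmin minimizes the log loss returned by objective() using TPE;
# sampled values arrive in objective() as the args tuple
best = fmin(objective, space, algo=tpe.suggest, max_evals=100)
print(best)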
Example No. 2
def objective(args):
    max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args
    clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight,
                  row_subsample=row_subsample, min_loss_reduction=min_loss_reduction,
                  column_subsample=column_subsample, verbose=False)
    score = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    print('max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss')
    print(args, score)
    return score
Example No. 3
def objective(args):
    c, gamma = args
    clf = OneVsRestClassifier(svm.SVC(C=c, kernel='rbf', tol=.001, gamma=gamma,
                                      probability=True, random_state=23))
    score1 = 0
    # hold_out_evaluation is assumed to fit clf in place, so that the
    # validation-set log loss below can be computed afterwards
    score2 = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    score = log_loss(valid_labels, clf.predict_proba(valid))
    print('C=%f, gamma=%f, score1=%f, score2=%f, score=%f' % (c, gamma, score1, score2, score))
    return score
Example No. 4
def objective(args):
    c, gamma = args
    clf = OneVsRestClassifier(
        svm.SVC(C=c,
                kernel='rbf',
                tol=.001,
                gamma=gamma,
                probability=True,
                random_state=23))
    score1 = 0
    score2 = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    score = log_loss(valid_labels, clf.predict_proba(valid))
    print('C=%f, gamma=%f, score1=%f, score2=%f, score=%f' % (
        c, gamma, score1, score2, score))
    return score
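For this two-parameter SVM objective the search space is just (C, gamma). A minimal sketch of the corresponding hyperopt call, assuming log-uniform bounds chosen purely for illustration:

import numpy as np
from hyperopt import fmin, tpe, hp

# Log-uniform ranges are assumptions, not the project's tuned bounds
space = (
    hp.loguniform('C', np.log(1e-2), np.log(1e2)),
    hp.loguniform('gamma', np.log(1e-4), np.log(1e1)),
)

best = fmin(objective, space, algo=tpe.suggest, max_evals=50)
print(best)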
Example No. 5
                  row_subsample=.9134478530382129, min_loss_reduction=.5132278416508804,
                  column_subsample=.730128689911957, step_size=.009)

    if MODE == 'cv':
        scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False)
        print('CV:', scores, 'Mean log loss:', np.mean(scores))
        utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions)
    elif MODE == 'submission':
        clf.fit(train, labels)
        predictions = clf.predict_proba(test)
        utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH,
                              os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'),
                              predictions)
    elif MODE == 'holdout':
        train, labels, _, _ = utils.stratified_split(train, labels, test_size=.7)
        score = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
        print('Log loss:', score)
    elif MODE == 'tune':
        # Objective function
        def objective(args):
            max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample = args
            clf = XGBoost(max_depth=max_depth, min_child_weight=min_child_weight,
                          row_subsample=row_subsample, min_loss_reduction=min_loss_reduction,
                          column_subsample=column_subsample, verbose=False)
            score = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
            print('max_depth, min_child_weight, row_subsample, min_loss_reduction, column_subsample, logloss')
            print(args, score)
            return score
        # Searching space
        space = (
            hp.quniform('max_depth', 2, 14, 1),
Example No. 6
train, labels, test, _, _ = utils.load_data()

# transform counts to TFIDF features
tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()

# encode labels
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# train classifier
clf = ensemble.ExtraTreesClassifier(n_jobs=4, n_estimators=2000, max_features=20, min_samples_split=3,
                                    bootstrap=False, verbose=3, random_state=23)

if MODE == 'cv':
    scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False)
    print('CV:', scores, 'Mean log loss:', np.mean(scores))
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions)
elif MODE == 'submission':
    clf.fit(train, labels)
    predictions = clf.predict_proba(test)
    utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH,
                          os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'),
                          predictions)
elif MODE == 'holdout':
    score = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
    print('Log loss:', score)
else:
    print('Unknown mode')
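Example No. 6 depends on the project's `utils` and `consts` modules. The same TF-IDF + ExtraTrees hold-out evaluation can be sketched with plain scikit-learn; synthetic count data stands in for `utils.load_data()`, and the shapes, class count, and the reduced n_estimators are assumptions made for a quick run:

import numpy as np
from sklearn import ensemble, feature_extraction
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

# Fake non-negative count features and labels (shapes are assumptions)
rng = np.random.RandomState(23)
X = rng.poisson(1., size=(1000, 93)).astype(float)
y = rng.randint(0, 9, size=1000)

# Transform counts to TF-IDF features, as in the example above
tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
X = tfidf.fit_transform(X).toarray()

X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=.3,
                                          stratify=y, random_state=23)

# n_estimators reduced from 2000 to 200 to keep the sketch fast
clf = ensemble.ExtraTreesClassifier(n_estimators=200, max_features=20,
                                    min_samples_split=3, bootstrap=False,
                                    n_jobs=4, random_state=23)
clf.fit(X_tr, y_tr)
print('Log loss:', log_loss(y_va, clf.predict_proba(X_va)))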
Example No. 7
print(train.shape)

# encode labels
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# train classifier
linear_clf = linear_model.LogisticRegression(C=1, penalty='l1',
                                             fit_intercept=True, random_state=23)

clf = ensemble.BaggingClassifier(base_estimator=linear_clf, n_estimators=40,
                                 max_samples=1., max_features=1., bootstrap=True,
                                 n_jobs=5, verbose=True, random_state=23)

if MODE == 'cv':
    scores, predictions = utils.make_blender_cv(clf, train, labels)
    print('CV:', scores, 'Mean log loss:', np.mean(scores))
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions)
elif MODE == 'submission':
    clf.fit(train, labels)
    predictions = clf.predict_proba(test)
    utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH,
                          os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'),
                          predictions)
elif MODE == 'holdout':
    score = utils.hold_out_evaluation(clf, train, labels)
    print('Log loss:', score)
else:
    print('Unknown mode')
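Example No. 7's bagging-over-L1-logistic-regression pattern, sketched standalone on synthetic data. Note that recent scikit-learn releases require solver='liblinear' for penalty='l1'; that argument is an addition here, not part of the original:

import numpy as np
from sklearn import ensemble, linear_model
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

# Synthetic features and labels; shapes and class count are assumptions
rng = np.random.RandomState(23)
X = rng.randn(500, 20)
y = rng.randint(0, 3, size=500)

# liblinear is needed for the L1 penalty in current scikit-learn
linear_clf = linear_model.LogisticRegression(C=1, penalty='l1', solver='liblinear',
                                             fit_intercept=True, random_state=23)

# 40 bootstrap replicates of the linear model, averaged at predict time
clf = ensemble.BaggingClassifier(linear_clf, n_estimators=40,
                                 max_samples=1., max_features=1., bootstrap=True,
                                 n_jobs=2, random_state=23)

X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=.3,
                                          stratify=y, random_state=23)
clf.fit(X_tr, y_tr)
print('Log loss:', log_loss(y_va, clf.predict_proba(X_va)))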
Example No. 8
    print('CV:', scores, 'Mean log loss:', np.mean(scores))
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv',
                             predictions)
elif MODE == 'submission':
    calibrated_classifier = CalibratedClassifierCV(clf,
                                                   method='isotonic',
                                                   cv=utils.get_cv(labels))
    fitted_classifier = calibrated_classifier.fit(train, labels)
    predictions = fitted_classifier.predict_proba(test)
    utils.save_submission(
        consts.DATA_SAMPLE_SUBMISSION_PATH,
        os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), predictions)
elif MODE == 'holdout':
    score = utils.hold_out_evaluation(clf,
                                      train,
                                      labels,
                                      calibrate=False,
                                      test_size=0.9)
    print('Log loss:', score)
elif MODE == 'tune':
    train, labels, valid, valid_labels = utils.stratified_split(train,
                                                                labels,
                                                                test_size=.8)
    from sklearn.metrics import log_loss

    # Objective function
    def objective(args):
        c, gamma = args
        clf = OneVsRestClassifier(
            svm.SVC(C=c,
                    kernel='rbf',
Example No. 9
clf = OneVsRestClassifier(svm.SVC(C=4.919646+2., kernel='rbf', tol=.001,
                                  verbose=True, probability=True, gamma=0.646508+.3, random_state=23))

if MODE == 'cv':
    scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=True)
    print('CV:', scores, 'Mean log loss:', np.mean(scores))
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions)
elif MODE == 'submission':
    calibrated_classifier = CalibratedClassifierCV(clf, method='isotonic', cv=utils.get_cv(labels))
    fitted_classifier = calibrated_classifier.fit(train, labels)
    predictions = fitted_classifier.predict_proba(test)
    utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH,
                          os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'),
                          predictions)
elif MODE == 'holdout':
    score = utils.hold_out_evaluation(clf, train, labels, calibrate=False, test_size=0.9)
    print('Log loss:', score)
elif MODE == 'tune':
    train, labels, valid, valid_labels = utils.stratified_split(train, labels, test_size=.8)
    from sklearn.metrics import log_loss
    # Objective function
    def objective(args):
        c, gamma = args
        clf = OneVsRestClassifier(svm.SVC(C=c, kernel='rbf', tol=.001, gamma=gamma,
                                          probability=True, random_state=23))
        score1 = 0
        score2 = utils.hold_out_evaluation(clf, train, labels, calibrate=False)
        score = log_loss(valid_labels, clf.predict_proba(valid))
        print('C=%f, gamma=%f, score1=%f, score2=%f, score=%f' % (c, gamma, score1, score2, score))
        return score
    # Searching space