示例#1
0
def train(train_set, comment_type, vocab, **skip_clf):
    """Train the base classifiers for one comment type and ensemble them.

    The module-level estimators ``clf_rf``, ``clf_gb``, ``clf_xgb`` and
    ``clf_ext`` are passed to their respective training helpers and rebound
    to whatever those helpers return, then handed to ``run_ensemble``.

    Args:
        train_set: Training data. After ``misc.feature_engineering`` it must
            contain the columns ``score``, ``id`` and ``comment_text``;
            ``score`` is used as the target, the other two are dropped.
        comment_type: Label of the comment type being trained. It is
            forwarded to feature engineering and interpolated into the name
            passed to each training helper.
        vocab: Vocabulary object forwarded to ``misc.feature_engineering``.
        **skip_clf: Optional switches ``skip_RF``, ``skip_GBDT``,
            ``skip_XGB``, ``skip_ExtTree`` and ``skip_CV``. A switch is
            active only when it is passed as the literal ``True``.

    Returns:
        The value returned by ``run_ensemble``.
    """
    global clf_rf, clf_gb, clf_xgb, clf_ext

    def _is_enabled(option):
        # Strict identity check: a missing key or any value other than the
        # ``True`` singleton (e.g. 1 or "yes") leaves the switch off.
        return skip_clf.get(option) is True

    print("Start to train comment type: %s" % comment_type)

    # Feature engineering
    engineered = misc.feature_engineering(train_set, vocab, comment_type)

    # Modeling: every column except the target and the raw identifiers is a
    # feature. (A held-out validation split and an SVM model are currently
    # disabled.)
    non_feature_columns = ['score', 'id', 'comment_text']
    trainX = engineered.drop(non_feature_columns, axis=1).loc[:]
    trainY = np.ravel(engineered.loc[:, ['score']])

    skip_RF = _is_enabled('skip_RF')
    skip_GBDT = _is_enabled('skip_GBDT')
    skip_XGB = _is_enabled('skip_XGB')
    skip_ExtTree = _is_enabled('skip_ExtTree')
    skip_CV = _is_enabled('skip_CV')

    clf_rf = train_RF(
        clf_rf, trainX, trainY, 'RandomForest_%s' % comment_type,
        skip=skip_RF)
    clf_gb = train_GBDT(
        clf_gb, trainX, trainY, 'GBDT_%s' % comment_type,
        skip=skip_GBDT)
    clf_xgb = train_XGB(
        clf_xgb, trainX, trainY, 'XGBoost_%s' % comment_type,
        skip=skip_XGB)
    clf_ext = train_EXT(
        clf_ext, trainX, trainY, 'ExtraTree_%s' % comment_type,
        skip=skip_ExtTree)

    ensemble_name = 'Ensemble_%s' % comment_type
    return run_ensemble(
        clf_rf, clf_gb, clf_xgb, clf_ext,
        trainX, trainY, ensemble_name,
        skip_cv=skip_CV)
示例#2
0
def predict(test_set, comment_type, vocab, estimator, use_proba=False):
    """Run a fitted estimator on a test set for one comment type.

    Args:
        test_set: Test data. After ``misc.feature_engineering`` (called with
            ``is_test=True``) it must contain the columns ``id`` and
            ``comment_text``, which are dropped before prediction.
        comment_type: Label of the comment type, forwarded to feature
            engineering.
        vocab: Vocabulary object forwarded to ``misc.feature_engineering``.
        estimator: Fitted model exposing ``predict`` and, when ``use_proba``
            is true, ``predict_proba``.
        use_proba: If true, return column 1 of ``predict_proba``; otherwise
            return the output of ``predict``.

    Returns:
        With ``use_proba=True``: column 1 of the probability matrix
        (presumably the positive-class probability -- confirm against
        ``estimator.classes_``). Otherwise the ``predict`` output: returned
        unchanged when it is one-dimensional, or column 1 when it is
        two-dimensional.
    """
    # Feature engineering
    features = misc.feature_engineering(test_set,
                                        vocab,
                                        comment_type,
                                        is_test=True)
    df_test = features.drop(['id', 'comment_text'], axis=1)

    if use_proba:
        return estimator.predict_proba(df_test)[:, 1]

    result = estimator.predict(df_test)
    # ``predict`` normally yields a 1-D label array, for which ``[:, 1]``
    # raises IndexError. Only slice when there really is a second axis, so
    # any 2-D output still behaves as before.
    if getattr(result, 'ndim', 1) > 1:
        return result[:, 1]
    return result