def train(train_set, comment_type, vocab, **skip_clf):
    """Train the classifier ensemble for one comment type.

    Runs feature engineering on *train_set*, then (re)trains the module-level
    RandomForest / GBDT / XGBoost / ExtraTree classifiers unless individually
    skipped, and finally builds a soft-voting ensemble over them.

    Parameters
    ----------
    train_set : DataFrame with at least 'score', 'id', 'comment_text' columns.
    comment_type : str, used to tag model names (e.g. 'RandomForest_<type>').
    vocab : vocabulary passed through to misc.feature_engineering.
    **skip_clf : optional boolean flags — skip_RF, skip_GBDT, skip_XGB,
        skip_ExtTree, skip_CV. Only an explicit True skips the step.

    Returns
    -------
    The fitted soft-voting ensemble classifier.

    Side effects: rebinds the module-level clf_rf / clf_gb / clf_xgb / clf_ext.
    """
    global clf_rf, clf_gb, clf_xgb, clf_ext
    print("Start to train comment type: %s" % comment_type)

    # Feature engineering
    train_set = misc.feature_engineering(train_set, vocab, comment_type)

    # Modeling: features are everything except the label and identifier columns.
    trainX = train_set.drop(['score', 'id', 'comment_text'], axis=1).loc[:]
    trainY = np.ravel(train_set.loc[:, ['score']])

    # `.get(...) is True` preserves the original strict check (only an explicit
    # boolean True skips) without the verbose `True if ... else False` ternary.
    skip_RF = skip_clf.get('skip_RF') is True
    skip_GBDT = skip_clf.get('skip_GBDT') is True
    skip_XGB = skip_clf.get('skip_XGB') is True
    skip_ExtTree = skip_clf.get('skip_ExtTree') is True
    skip_CV = skip_clf.get('skip_CV') is True

    clf_rf = train_RF(clf_rf, trainX, trainY, 'RandomForest_%s' % comment_type, skip=skip_RF)
    clf_gb = train_GBDT(clf_gb, trainX, trainY, 'GBDT_%s' % comment_type, skip=skip_GBDT)
    clf_xgb = train_XGB(clf_xgb, trainX, trainY, 'XGBoost_%s' % comment_type, skip=skip_XGB)
    clf_ext = train_EXT(clf_ext, trainX, trainY, 'ExtraTree_%s' % comment_type, skip=skip_ExtTree)

    clf_vote_soft = run_ensemble(clf_rf, clf_gb, clf_xgb, clf_ext, trainX, trainY,
                                 'Ensemble_%s' % comment_type, skip_cv=skip_CV)
    return clf_vote_soft
def predict(test_set, comment_type, vocab, estimator, use_proba=False):
    """Score *test_set* for one comment type with a fitted estimator.

    Parameters
    ----------
    test_set : DataFrame with at least 'id' and 'comment_text' columns.
    comment_type : str, passed through to misc.feature_engineering.
    vocab : vocabulary passed through to misc.feature_engineering.
    estimator : fitted sklearn-style classifier.
    use_proba : if True, return the positive-class probability
        (column 1 of predict_proba); otherwise return predicted labels.

    Returns
    -------
    1-D array of scores — probabilities when use_proba, labels otherwise.
    """
    # Feature engineering (is_test=True: no label column expected/produced)
    test_set = misc.feature_engineering(test_set, vocab, comment_type, is_test=True)
    df_test = test_set.drop(['id', 'comment_text'], axis=1).loc[:]

    if use_proba:
        # predict_proba returns shape (n_samples, n_classes); column 1 is
        # the positive class in a binary problem.
        return estimator.predict_proba(df_test)[:, 1]

    # Bug fix: sklearn-style predict() returns a 1-D label array, so the
    # original unconditional `result[:, 1]` raised IndexError on this path.
    return estimator.predict(df_test)