from sklearn.naive_bayes import BernoulliNB

from run_binary_classifier import run

# Search grid handed to `run`. Every entry lists a single candidate value.
# NOTE(review): the `<step>__<param>` keys presumably address the steps of a
# pipeline assembled inside `run` -- confirm against run_binary_classifier.
param_grid = {
    # Text vectorisation settings.
    'bag_of_words__stop_words': ['english'],
    'bag_of_words__ngram_range': [(1, 2)],
    'bag_of_words__max_features': [500],
    # Dimensionality reduction and normalisation settings.
    'dim_reduct__n_components': [300],
    'normalizer__norm': ['l2'],
    # Bernoulli naive Bayes settings.
    'classifier__alpha': [1.0],
    'classifier__binarize': [0.0],
}

# Unfitted Bernoulli naive Bayes estimator passed to `run` with the grid.
clf = BernoulliNB()

run(param_grid, clf)
def keras_logreg_model():
    """Build and compile a one-unit sigmoid Keras model (logistic regression).

    The network consists of a single ``Dense`` layer with one output unit, a
    sigmoid activation, an L2 kernel regulariser (factor 1.0) and a declared
    input shape of ``(2,)``. It is compiled with the binary cross-entropy
    loss, the Adam optimizer and accuracy as the reported metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    # NOTE(review): input_shape is fixed at (2,) while the grid below sets
    # bag_of_words__max_features to 500 -- confirm the feature width that
    # actually reaches the classifier.
    output_layer = Dense(
        units=1,
        input_shape=(2, ),
        kernel_initializer='normal',
        kernel_regularizer=regularizers.l2(1.),
        activation='sigmoid',
    )

    model = Sequential()
    model.add(output_layer)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model


# Search grid handed to `run`; every entry lists a single candidate value.
# Two options are currently disabled and kept here for reference:
#   'dim_reduct__n_components': [300]
#   'classifier__C': [5., 10.]
param_grid = {
    'bag_of_words__stop_words': ['english'],
    'bag_of_words__ngram_range': [(1, 2)],
    'bag_of_words__max_features': [500],
    'normalizer__norm': ['l2'],
}

# Wrap the Keras model builder so it can be passed to `run` as an estimator;
# training uses one epoch with batches of five samples and verbose output.
estimator = KerasClassifier(
    build_fn=keras_logreg_model,
    epochs=1,
    batch_size=5,
    verbose=1,
)

run(param_grid, estimator)
# Search grid for the multilabel classifier. Every key carries an
# `estimator__` prefix in front of the `<step>__<param>` name.
# NOTE(review): the prefix presumably targets a wrapped per-label estimator
# inside run_multilabel_classifier -- confirm against that module.
multilabel_param_grid = [{
    'estimator__bag_of_words__stop_words': ['english'],
    'estimator__bag_of_words__ngram_range': [(1, 2)],
    'estimator__bag_of_words__max_features': [500],
    'estimator__dim_reduct__n_components': [300],
    'estimator__normalizer__norm': ['l2'],
    'estimator__classifier__C': [5., 10.]
}]

# =========================== #
# TRAIN
# BINARY CLASSIFIER
# =========================== #
binary_clf = run_binary_classifier.run(binary_param_grid,
                                       LogisticRegression(),
                                       comments_file=train_binary)

# Persist the trained binary model.
# NOTE(review): './saved_models/' must already exist or open() will raise.
with open('./saved_models/log_reg_joint_binary.pkl', 'wb') as saved_model:
    pickle.dump(binary_clf, file=saved_model)

# =========================== #
# TRAIN
# MULTILABEL CLASSIFIER
# =========================== #
multilabel_clf = run_multilabel_classifier.run(multilabel_param_grid,
                                               LogisticRegression(),
                                               comments_file=train_multilabel)

# Persist the trained multilabel model. The previous code dumped `binary_clf`
# here, so the multilabel file silently contained a copy of the binary model.
with open('./saved_models/log_reg_joint_multilabel.pkl', 'wb') as saved_model:
    pickle.dump(multilabel_clf, file=saved_model)

# =========================== #
# PREDICT
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from run_binary_classifier import _load_comments, run

# Locations of the training and test comment files, relative to the
# working directory.
train_comments_path = os.path.join('../../../', 'data/train_binary.csv')
test_comments_path = os.path.join('../../../', 'data/test_clean_binary.csv')

# File the trained classifier is pickled to and then read back from.
MODEL_PATH = './saved_models/log_reg_trained_binary.pkl'

# Search grid handed to `run`; only the classifier's C has two candidates.
param_grid = {
    'bag_of_words__stop_words': ['english'],
    'bag_of_words__ngram_range': [(1, 2)],
    'bag_of_words__max_features': [500],
    'dim_reduct__n_components': [300],
    'normalizer__norm': ['l2'],
    'classifier__C': [5., 10.],
}

# Train on the training comments.
clf = LogisticRegression()
trained_clf = run(param_grid, clf, comments_file=train_comments_path)

# Save the trained classifier to disk ...
with open(MODEL_PATH, 'wb') as saved_model:
    pickle.dump(trained_clf, file=saved_model)

# ... and load it back, so the evaluation below uses the persisted copy.
with open(MODEL_PATH, 'rb') as saved_model:
    loaded_clf = pickle.load(saved_model)

# Evaluate the reloaded classifier on the test comments.
X_test, y_test = _load_comments(test_comments_path)
y_test_predict = loaded_clf.predict(X_test)
print(classification_report(y_test, y_test_predict))