def train_data():
    """Yield (name, pipeline) pairs of binary bag-of-words models.

    Each pipeline's vectorizer is configured to use word 1- to 4-grams
    before being yielded.
    """
    dummy = pipelines.bag_of_words(
        classifier=DummyClassifier(random_state=0, strategy="stratified"))
    logreg = pipelines.bag_of_words(
        classifier=LogisticRegression(random_state=0, n_jobs=-1,
                                      max_iter=2000))
    logreg_tfidf = pipelines.bag_of_words(
        classifier=LogisticRegression(random_state=0, n_jobs=-1,
                                      max_iter=2000),
        tf_idf=True)
    svc = pipelines.bag_of_words(
        classifier=LinearSVC(max_iter=2000, random_state=0))
    svc_tfidf = pipelines.bag_of_words(
        classifier=LinearSVC(max_iter=2000, random_state=0),
        tf_idf=True)

    named_models = (
        ("Binary: BoW + Dummy", dummy),
        ("Binary: BoW + LR", logreg),
        ("Binary: BoW + LR + TFIDF", logreg_tfidf),
        ("Binary: BoW + LinearSVC", svc),
        ("Binary: BoW + LinearSVC + TFIDF", svc_tfidf),
    )
    for name, model in named_models:
        model.set_params(vect__ngram_range=(1, 4))
        yield (name, model)
def bag_of_words_pipeline():
    """Yield named BoW logistic-regression pipelines, plain and TF-IDF.

    Both pipelines cap the vectorizer vocabulary at 50,000 features.
    """
    plain = pipelines.bag_of_words(classifier=LogisticRegression(C=10.0))
    weighted = pipelines.bag_of_words(classifier=LogisticRegression(C=10.0),
                                      tf_idf=True)
    for name, pipe in (("BoW + LR", plain), ("BoW + LR + TFIDF", weighted)):
        pipe.set_params(vect__max_features=50000)
        yield (name, pipe)
def bag_of_words_pipeline():
    """Build a single BoW + logistic-regression pipeline using 1-4 n-grams."""
    classifier = LogisticRegression(
        random_state=0,
        solver='saga',
        multi_class='ovr',
        n_jobs=-1,
        max_iter=2000,
    )
    pipeline = pipelines.bag_of_words(classifier=classifier)
    # Word unigrams through 4-grams for the vectorizer step.
    pipeline.set_params(vect__ngram_range=(1, 4))
    return pipeline
def gradient_boosting_pipeline():
    """Yield named bag-of-ngrams gradient-boosting pipelines (plain, TF-IDF)."""
    boosting = pipelines.bag_of_words(
        classifier=GradientBoostingClassifier(
            n_estimators=5000, learning_rate=0.2, random_state=10))
    boosting_tfidf = pipelines.bag_of_words(
        classifier=GradientBoostingClassifier(
            n_estimators=5000, learning_rate=0.2, random_state=10),
        tf_idf=True)

    named = (("BoN + GB", boosting), ("BoN + GB + TFIDF", boosting_tfidf))
    for name, pipe in named:
        # Wide feature space: 1-5 grams, capped at 500k features.
        pipe.set_params(vect__ngram_range=(1, 5),
                        vect__max_features=500000)
        yield (name, pipe)
def bag_of_ngrams_pipelines():
    """Yield named bag-of-ngrams pipelines over 1-5 grams (500k features max).

    Covers logistic regression, LinearSVC and multinomial naive Bayes,
    each with and without TF-IDF weighting.
    """
    def build(clf, use_tfidf=False):
        # Construct one bag-of-words pipeline around `clf`.
        if use_tfidf:
            return pipelines.bag_of_words(classifier=clf, tf_idf=True)
        return pipelines.bag_of_words(classifier=clf)

    named = (
        ("BoN + LR", build(LogisticRegression(C=10.0))),
        ("BoN + LR + TFIDF", build(LogisticRegression(C=10.0),
                                   use_tfidf=True)),
        ("BoN + SVC", build(LinearSVC())),
        ("BoN + SVC + TFIDF", build(LinearSVC(), use_tfidf=True)),
        ("BoN + MNB", build(MultinomialNB())),
        ("BoN + MNB + TFIDF", build(MultinomialNB(), use_tfidf=True)),
    )
    for name, pipe in named:
        pipe.set_params(vect__ngram_range=(1, 5), vect__max_features=500000)
        yield (name, pipe)
def bag_of_words_pipeline():
    """Yield named multi-class bag-of-words pipelines using 1-4 n-grams.

    Covers logistic regression, multinomial naive Bayes, LinearSVC,
    XGBoost and SGD classifiers, each with and without TF-IDF weighting.

    Yields:
        (name, pipeline) tuples; each pipeline's vectorizer is set to
        word 1- to 4-grams before being yielded.
    """
    log_regression = pipelines.bag_of_words(
        classifier=LogisticRegression(random_state=0, solver='saga',
                                      multi_class='ovr', n_jobs=-1,
                                      max_iter=2000))
    log_regression_tfidf = pipelines.bag_of_words(
        classifier=LogisticRegression(random_state=0, solver='saga',
                                      multi_class='ovr', n_jobs=-1,
                                      max_iter=2000),
        tf_idf=True)
    multinomial_nb = pipelines.bag_of_words(classifier=MultinomialNB())
    multinomial_nb_tfidf = pipelines.bag_of_words(classifier=MultinomialNB(),
                                                  tf_idf=True)
    linear_svc = pipelines.bag_of_words(
        classifier=LinearSVC(multi_class='ovr', max_iter=2000,
                             random_state=0))
    linear_svc_tfidf = pipelines.bag_of_words(
        classifier=LinearSVC(multi_class='ovr', max_iter=2000,
                             random_state=0),
        tf_idf=True)
    # TODO: tune max_depth, min_child_weight, n_estimators.
    # FIX: the original passed `n_class=3`, which is not an XGBClassifier
    # parameter — the sklearn wrapper infers the class count from the
    # labels at fit() time — so the bogus kwarg is dropped here.
    xgb = pipelines.bag_of_words(
        classifier=XGBClassifier(learning_rate=0.2, n_estimators=1000,
                                 max_depth=7, objective='multi:softprob',
                                 n_jobs=-1, random_state=0,
                                 min_child_weight=3))
    xgb_tfidf = pipelines.bag_of_words(
        classifier=XGBClassifier(learning_rate=0.2, n_estimators=1000,
                                 max_depth=7, objective='multi:softprob',
                                 n_jobs=-1, random_state=0,
                                 min_child_weight=3),
        tf_idf=True)
    sgd_classifier = pipelines.bag_of_words(
        classifier=SGDClassifier(max_iter=2000, n_jobs=-1, random_state=0))
    sgd_classifier_tfidf = pipelines.bag_of_words(
        classifier=SGDClassifier(max_iter=2000, n_jobs=-1, random_state=0),
        tf_idf=True)

    bow_pipelines = [
        ("BoW + LR", log_regression),
        ("BoW + LR + TFIDF", log_regression_tfidf),
        ("BoW + MNB", multinomial_nb),
        ("BoW + MNB + TFIDF", multinomial_nb_tfidf),
        ("BoW + LinearSVC", linear_svc),
        ("BoW + LinearSVC + TFIDF", linear_svc_tfidf),
        ("BoW + XGBoost", xgb),
        ("BoW + XGBoost + TFIDF", xgb_tfidf),
        ("BoW + SGDClassifier", sgd_classifier),
        ("BoW + SGDClassifier + TFIDF", sgd_classifier_tfidf),
    ]
    for name, model in bow_pipelines:
        # Word unigrams through 4-grams for every vectorizer.
        model.set_params(vect__ngram_range=(1, 4))
        yield (name, model)