def train_classifier(self):
    """
    Fit the booking classifier and persist it together with its features.

    A TF-IDF feature extractor and a linear-kernel SVC are built, the SVC is
    fitted on the extractor's CSV features, and both objects are dumped with
    joblib to ``self.resource_path`` ('booking_classifier.pkl' and
    'booking_features.pkl'). Afterwards ``self.load_model()`` is called.

    :return: None
    """
    extractor = FeatureExtractor.tfidf(
        ngram_range=(1, 2),
        max_df=0.5,
        use_idf=False,
        sublinear_tf=True,
    )
    model = SVC(
        kernel='linear',
        C=100,
        gamma=0.01,
        decision_function_shape='ovo',
        probability=True,
    )
    # NOTE(review): accessed without call parentheses, so this is presumably
    # a property yielding a (features, labels) pair - confirm.
    features, labels = extractor.extract_features_from_csv

    print('start training...')
    model.fit(features, labels)
    print('training finished. start dumping model...')

    # Persist the classifier first, then the feature extractor. The file name
    # is appended directly, so resource_path must carry its own separator.
    artifacts = (
        (model, 'booking_classifier.pkl'),
        (extractor, 'booking_features.pkl'),
    )
    for artifact, file_name in artifacts:
        joblib.dump(artifact, self.resource_path + file_name)

    self.load_model()
def classify(bow=False, plot=False, multinomial_nb=False, bernoulli_nb=False,
             knn=False, support_vm=False, svm_sgd=False, decision_tree=False,
             random_forest=False, persist=False, logistic_regression=False):
    """
    Validate a classifier against unseen resources using k-fold cross
    validation.

    The first truthy algorithm flag wins, checked in this order:
    ``multinomial_nb``, ``bernoulli_nb``, ``knn``, ``support_vm``,
    ``svm_sgd``. ``decision_tree``, ``random_forest`` and
    ``logistic_regression`` are accepted for interface compatibility but are
    not handled here; if no handled flag is set, a hint is printed and the
    function returns without doing anything.

    :param bow: use Bag-of-Words features instead of TF-IDF features
    :param plot: plot (and save) the normalized confusion matrix
    :param persist: fit the classifier on all samples and dump it to
        '<classifier title>.pkl' (relative path) before cross validation
    :return: None; accuracy, F1, precision and recall averaged over the
        15 folds are printed
    """
    if multinomial_nb:
        clf_title = 'Multinomial NB'
        if bow:
            vectorizer_title = 'Bag-of-Words'
            counts, targets = FeatureExtractor.bow(
                max_df=0.25, ngram_range=(1, 3)).extract_features_from_csv
            clf = MultinomialNB(alpha=1e-05)
        else:
            vectorizer_title = 'TF-IDF'
            counts, targets = FeatureExtractor.tfidf(
                analyzer='char', max_df=0.5, ngram_range=(1, 4), norm='l1',
                sublinear_tf=False, use_idf=True).extract_features_from_csv
            clf = MultinomialNB(alpha=1e-07)
    elif bernoulli_nb:
        clf_title = 'Bernoulli NB'
        if bow:
            vectorizer_title = 'Bag-of-Words'
            counts, targets = FeatureExtractor.bow(
                max_df=0.25, ngram_range=(1, 3)).extract_features_from_csv
        else:
            vectorizer_title = 'TF-IDF'
            counts, targets = FeatureExtractor.tfidf(
                analyzer='word', max_df=0.25, ngram_range=(1, 3), norm='l1',
                sublinear_tf=True, use_idf=True).extract_features_from_csv
        clf = BernoulliNB(alpha=1e-05)
    elif knn:
        clf_title = 'K-Nearest-Neighbour'
        if bow:
            vectorizer_title = 'Bag-of-Words'
            counts, targets = FeatureExtractor.bow(
                max_df=0.5, ngram_range=(1, 1)).extract_features_from_csv
        else:
            vectorizer_title = 'TF-IDF'
            counts, targets = FeatureExtractor.tfidf(
                analyzer='word', max_df=0.5, ngram_range=(1, 1), norm='l1',
                sublinear_tf=True, use_idf=True).extract_features_from_csv
        clf = KNeighborsClassifier(weights='distance', n_neighbors=2,
                                   leaf_size=20, algorithm='auto')
    elif support_vm:
        clf_title = 'Support Vector Machine'
        if bow:
            vectorizer_title = 'Bag-of-Words'
            # Fixed: this branch used FeatureExtractor.tfidf although it is
            # reported as Bag-of-Words; every other bow branch uses .bow().
            counts, targets = FeatureExtractor.bow(
                max_df=0.5, ngram_range=(1, 2)).extract_features_from_csv
            clf = SVC(kernel='sigmoid', C=100, gamma=0.01,
                      decision_function_shape='ovo', probability=True)
        else:
            vectorizer_title = 'TF-IDF'
            counts, targets = FeatureExtractor.tfidf(
                analyzer='char', max_df=1.0, ngram_range=(1, 4), norm='l2',
                use_idf=False, sublinear_tf=True).extract_features_from_csv
            clf = SVC(kernel='sigmoid', C=10, gamma=1.4,
                      decision_function_shape='ovo', probability=True)
    elif svm_sgd:
        clf_title = 'SVM (SGD)'
        if bow:
            vectorizer_title = 'Bag-of-Words'
            counts, targets = FeatureExtractor.bow(
                max_df=0.25, ngram_range=(1, 4)).extract_features_from_csv
            loss = 'squared_hinge'
        else:
            vectorizer_title = 'TF-IDF'
            counts, targets = FeatureExtractor.tfidf(
                ngram_range=(1, 4), max_df=0.25, use_idf=False,
                sublinear_tf=True).extract_features_from_csv
            loss = 'hinge'
        # The class weights are derived from the targets by the file's
        # get_classweight helper.
        clf = SGDClassifier(loss=loss, penalty='l1', alpha=1e-05,
                            max_iter=100, tol=0.2,
                            class_weight=get_classweight(targets))
    else:
        print('Please provide a classifer algorithm')
        return

    if persist:
        # Only the persisted model needs a fit on all samples; every fold
        # below refits the estimator on its own training split.
        clf.fit(counts, targets)
        joblib.dump(clf, clf_title + '.pkl')

    # Per-fold scores.
    ac_scores = []
    f1_scores = []
    prec_scores = []
    rec_scores = []

    # Size the accumulated confusion matrix from the classes actually present
    # and pass them explicitly, so a fold that lacks a class still yields a
    # matrix of the same shape.
    class_labels = np.unique(targets)
    confusion = np.zeros((len(class_labels), len(class_labels)), dtype=int)

    # random_state is only meaningful together with shuffle=True (newer
    # scikit-learn versions raise on the combination), so it is omitted; the
    # unshuffled folds are deterministic either way.
    folds = StratifiedKFold(n_splits=15).split(counts, targets)
    for train_indices, test_indices in folds:
        train_text = counts[train_indices]
        train_y = targets[train_indices]
        test_text = counts[test_indices]
        test_y = targets[test_indices]

        clf.fit(train_text, train_y)
        predictions = clf.predict(test_text)

        confusion += confusion_matrix(test_y, predictions,
                                      labels=class_labels)
        ac_scores.append(accuracy_score(test_y, predictions))
        f1_scores.append(f1_score(test_y, predictions, average="macro"))
        prec_scores.append(
            precision_score(test_y, predictions, average="macro"))
        rec_scores.append(recall_score(test_y, predictions, average="macro"))

    print("---------------------- \nResults for ", clf_title, " with ",
          vectorizer_title, ":")
    print("K-Folds Accuracy-score: ", sum(ac_scores) / len(ac_scores))
    print("K-Folds F1-score: ", sum(f1_scores) / len(f1_scores))
    print("K-Folds Precision-score: ", sum(prec_scores) / len(prec_scores))
    print("K-Folds Recall-score: ", sum(rec_scores) / len(rec_scores))
    print("CV accuracy : %.3f +/- %.3f" % (np.mean(ac_scores),
                                           np.std(ac_scores)))

    if plot:
        # Abbreviated display names for the confusion-matrix axes.
        labels = [
            'Barent.', 'Finanzen', 'Freiz.&Lifes.', 'Lebensh.', 'Mob.&Verk.',
            'Versich.', 'Wohn.&Haus.'
        ]
        Plotter.plot_and_show_confusion_matrix(confusion, labels,
                                               normalize=True,
                                               title=clf_title, save=True)
def plot_validation_curve():
    """
    Plot the validation curve for a given range of parameters (param_range).

    This is intended for limiting the values used in grid search
    (method estimate_parameters). As an example, the 'alpha' parameter of an
    SGDClassifier is validated with 10-fold cross validation on an 80%
    training split of the TF-IDF features; mean accuracy and a band of one
    standard deviation are drawn for the training and the validation scores.
    """
    extractor = FeatureExtractor.tfidf(
        ngram_range=(1, 4), max_df=0.5, use_idf=False, sublinear_tf=True)
    counts, targets = extractor.extract_features_from_csv

    # Hold 20% out for testing; only the training part is used below.
    x_train, _x_test, y_train, _y_test = train_test_split(
        counts, targets, test_size=0.2, random_state=1)

    # Example usage: validate a parameter range for 'alpha' of SGDClassifier.
    param_range = [10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1]
    train_scores, test_scores = validation_curve(
        estimator=Pipeline([('clf', SGDClassifier())]),
        X=x_train,
        y=y_train,
        param_name='clf__alpha',
        param_range=param_range,
        cv=10,
    )
    print(train_scores)
    print(test_scores)

    # One line plus a +/- one-standard-deviation band per score series.
    series = (
        (train_scores, dict(color='blue', marker='o', markersize=5,
                            label='training accuracy')),
        (test_scores, dict(color='green', linestyle='--', marker='s',
                           markersize=5, label='validation accuracy')),
    )
    for scores, line_style in series:
        mean = np.mean(scores, axis=1)
        std = np.std(scores, axis=1)
        plt.plot(param_range, mean, **line_style)
        plt.fill_between(param_range, mean + std, mean - std,
                         alpha=0.15, color=line_style['color'])

    plt.grid()
    plt.xscale('log')
    plt.legend(loc='lower right')
    plt.xlabel('Parameter alpha')
    plt.ylabel('Accuracy')
    plt.ylim([0.2, 1.0])
    plt.show()