예제 #1
0
    def train_classifier(self):
        """
        Fit the booking classifier and persist it together with its features.

        A TF-IDF feature extractor and an SVC with a linear kernel are
        created, the SVC is fitted on the features read from the extractor,
        and both objects are dumped below ``self.resource_path``. Afterwards
        ``self.load_model()`` is called.
        :return: None
        """
        extractor = FeatureExtractor.tfidf(
            ngram_range=(1, 2),
            max_df=0.5,
            use_idf=False,
            sublinear_tf=True,
        )
        svc_settings = {
            'kernel': 'linear',
            'C': 100,
            'gamma': 0.01,
            'decision_function_shape': 'ovo',
            'probability': True,
        }
        model = SVC(**svc_settings)

        # extract_features_from_csv is read as an attribute, not called
        features, labels = extractor.extract_features_from_csv

        print('start training...')
        model.fit(features, labels)
        print('training finished. start dumping model...')

        # persist the classifier first, then the feature extractor
        artifacts = (
            (model, 'booking_classifier.pkl'),
            (extractor, 'booking_features.pkl'),
        )
        for artifact, file_name in artifacts:
            joblib.dump(artifact, self.resource_path + file_name)

        self.load_model()
def classify(bow=False,
             plot=False,
             multinomial_nb=False,
             bernoulli_nb=False,
             knn=False,
             support_vm=False,
             svm_sgd=False,
             decision_tree=False,
             random_forest=False,
             persist=False,
             logistic_regression=False):
    """
    Validate a classifier using stratified 15-fold cross validation.

    The first truthy flag among ``multinomial_nb``, ``bernoulli_nb``,
    ``knn``, ``support_vm`` and ``svm_sgd`` (checked in that order) selects
    the classifier and its feature settings. Accuracy, macro F1, macro
    precision and macro recall are averaged over the folds and printed.

    :param bow: use Bag-of-Words features when truthy, TF-IDF otherwise
    :param plot: hand the accumulated confusion matrix to
        ``Plotter.plot_and_show_confusion_matrix`` (with ``normalize=True``
        and ``save=True``)
    :param multinomial_nb: select ``MultinomialNB``
    :param bernoulli_nb: select ``BernoulliNB``
    :param knn: select ``KNeighborsClassifier``
    :param support_vm: select ``SVC`` with a sigmoid kernel
    :param svm_sgd: select ``SGDClassifier`` with class weights obtained
        from ``get_classweight``
    :param decision_tree: accepted for compatibility; not handled here
    :param random_forest: accepted for compatibility; not handled here
    :param persist: fit the classifier on all samples and dump it to
        ``'<classifier title>.pkl'`` before cross validation
    :param logistic_regression: accepted for compatibility; not handled here
    :return: None. If no handled classifier flag is set, a hint is printed
        and the function returns without doing any work.
    """
    vectorizer_title = 'Bag-of-Words' if bow else 'TF-IDF'

    if multinomial_nb:
        clf_title = 'Multinomial NB'
        if bow:
            counts, targets = FeatureExtractor.bow(
                max_df=0.25, ngram_range=(1, 3)).extract_features_from_csv
            clf = MultinomialNB(alpha=1e-05)
        else:
            counts, targets = FeatureExtractor.tfidf(
                analyzer='char',
                max_df=0.5,
                ngram_range=(1, 4),
                norm='l1',
                sublinear_tf=False,
                use_idf=True).extract_features_from_csv
            clf = MultinomialNB(alpha=1e-07)
    elif bernoulli_nb:
        clf_title = 'Bernoulli NB'
        if bow:
            counts, targets = FeatureExtractor.bow(
                max_df=0.25, ngram_range=(1, 3)).extract_features_from_csv
        else:
            counts, targets = FeatureExtractor.tfidf(
                analyzer='word',
                max_df=0.25,
                ngram_range=(1, 3),
                norm='l1',
                sublinear_tf=True,
                use_idf=True).extract_features_from_csv
        clf = BernoulliNB(alpha=1e-05)
    elif knn:
        clf_title = 'K-Nearest-Neighbour'
        if bow:
            counts, targets = FeatureExtractor.bow(
                max_df=0.5, ngram_range=(1, 1)).extract_features_from_csv
        else:
            counts, targets = FeatureExtractor.tfidf(
                analyzer='word',
                max_df=0.5,
                ngram_range=(1, 1),
                norm='l1',
                sublinear_tf=True,
                use_idf=True).extract_features_from_csv
        clf = KNeighborsClassifier(weights='distance',
                                   n_neighbors=2,
                                   leaf_size=20,
                                   algorithm='auto')
    elif support_vm:
        clf_title = 'Support Vector Machine'
        if bow:
            # Use the Bag-of-Words extractor like every other `bow` branch;
            # previously TF-IDF features were reported as 'Bag-of-Words'.
            counts, targets = FeatureExtractor.bow(
                max_df=0.5, ngram_range=(1, 2)).extract_features_from_csv
            clf = SVC(kernel='sigmoid',
                      C=100,
                      gamma=0.01,
                      decision_function_shape='ovo',
                      probability=True)
        else:
            counts, targets = FeatureExtractor.tfidf(
                analyzer='char',
                max_df=1.0,
                ngram_range=(1, 4),
                norm='l2',
                use_idf=False,
                sublinear_tf=True).extract_features_from_csv
            clf = SVC(kernel='sigmoid',
                      C=10,
                      gamma=1.4,
                      decision_function_shape='ovo',
                      probability=True)
    elif svm_sgd:
        clf_title = 'SVM (SGD)'
        if bow:
            counts, targets = FeatureExtractor.bow(
                max_df=0.25, ngram_range=(1, 4)).extract_features_from_csv
            loss = 'squared_hinge'
        else:
            counts, targets = FeatureExtractor.tfidf(
                ngram_range=(1, 4),
                max_df=0.25,
                use_idf=False,
                sublinear_tf=True).extract_features_from_csv
            loss = 'hinge'
        clf = SGDClassifier(loss=loss,
                            penalty='l1',
                            alpha=1e-05,
                            max_iter=100,
                            tol=0.2,
                            class_weight=get_classweight(targets))
    else:
        print('Please provide a classifer algorithm')
        return

    if persist:
        # Only the persisted model needs a fit on the complete data set;
        # the cross validation below fits the estimator again on every fold.
        clf.fit(counts, targets)
        joblib.dump(clf, clf_title + '.pkl')

    # scores
    ac_scores = []
    f1_scores = []
    prec_scores = []
    rec_scores = []

    # Fix the label set up front so every fold yields a confusion matrix of
    # the same shape, even if a class is missing from a fold's test split
    # or predictions.
    class_labels = np.unique(targets)
    n_classes = len(class_labels)
    confusion = np.zeros((n_classes, n_classes), dtype=np.int64)

    # random_state is omitted on purpose: without shuffle=True it does not
    # influence the splits, and recent scikit-learn versions reject it.
    folds = StratifiedKFold(n_splits=15).split(counts, targets)

    for train_indices, test_indices in folds:
        train_text = counts[train_indices]
        train_y = targets[train_indices]

        test_text = counts[test_indices]
        test_y = targets[test_indices]

        clf.fit(train_text, train_y)
        predictions = clf.predict(test_text)

        confusion += confusion_matrix(test_y, predictions,
                                      labels=class_labels)

        ac_scores.append(accuracy_score(test_y, predictions))
        f1_scores.append(f1_score(test_y, predictions, average="macro"))
        prec_scores.append(
            precision_score(test_y, predictions, average="macro"))
        rec_scores.append(recall_score(test_y, predictions, average="macro"))

    print("---------------------- \nResults for ", clf_title, " with ",
          vectorizer_title, ":")
    print("K-Folds Accuracy-score: ", sum(ac_scores) / len(ac_scores))
    print("K-Folds F1-score: ", sum(f1_scores) / len(f1_scores))
    print("K-Folds Precision-score: ", sum(prec_scores) / len(prec_scores))
    print("K-Folds Recall-score: ", sum(rec_scores) / len(rec_scores))

    print("CV accuracy : %.3f +/- %.3f" %
          (np.mean(ac_scores), np.std(ac_scores)))

    if plot:
        # NOTE(review): the display labels assume the seven categories in
        # sorted order - confirm against the data set.
        labels = [
            'Barent.', 'Finanzen', 'Freiz.&Lifes.', 'Lebensh.', 'Mob.&Verk.',
            'Versich.', 'Wohn.&Haus.'
        ]
        Plotter.plot_and_show_confusion_matrix(confusion,
                                               labels,
                                               normalize=True,
                                               title=clf_title,
                                               save=True)
def plot_validation_curve():
    """
    Plot the validation curve for a given range of parameter values.

    This is intended for limiting the values used in grid search
    (method estimate_parameters). Mean training and validation scores are
    drawn with a band of one standard deviation around each curve.
    """
    extractor = FeatureExtractor.tfidf(ngram_range=(1, 4),
                                       max_df=0.5,
                                       use_idf=False,
                                       sublinear_tf=True)
    counts, targets = extractor.extract_features_from_csv

    # hold 20% out for testing; only the training part is used below
    X_train, _X_test, y_train, _y_test = train_test_split(counts,
                                                          targets,
                                                          test_size=0.2,
                                                          random_state=1)

    # example usage validating a value range for 'alpha' of SGDClassifier
    alpha_values = [10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1]
    train_scores, test_scores = validation_curve(
        estimator=Pipeline([('clf', SGDClassifier())]),
        X=X_train,
        y=y_train,
        param_name='clf__alpha',
        param_range=alpha_values,
        cv=10)
    print(train_scores)
    print(test_scores)

    # one entry per curve: the scores and the line properties for plt.plot
    curves = (
        (train_scores, dict(color='blue',
                            marker='o',
                            markersize=5,
                            label='training accuracy')),
        (test_scores, dict(color='green',
                           linestyle='--',
                           marker='s',
                           markersize=5,
                           label='validation accuracy')),
    )
    for scores, line_props in curves:
        mean = np.mean(scores, axis=1)
        std = np.std(scores, axis=1)
        plt.plot(alpha_values, mean, **line_props)
        plt.fill_between(alpha_values,
                         mean + std,
                         mean - std,
                         alpha=0.15,
                         color=line_props['color'])

    plt.grid()
    plt.xscale('log')
    plt.legend(loc='lower right')
    plt.xlabel('Parameter alpha')
    plt.ylabel('Accuracy')
    plt.ylim([0.2, 1.0])
    plt.show()