Example No. 1
def test5():
    print("\n\nTest 5 - Algorithm Tweaks (Bias & Variance)")
    print("Expected / Actual:")

    print("\nRegularized Linear Regression: ")
    X, y = ut.read_mat('mat/ex5data1.mat')
    X = ut.create_design(X)
    theta = np.array([1, 1])
    print("303.993 / ", alg.SSD(theta, X, y, 1))
    grad = alg.SSD_gradient(theta, X, y, 1)
    print("-15.30 / ", grad[0])
    print("598.250 / ", grad[1])

    print("\nLearning Curve:")
    raw = ut.read_mat_raw('mat/ex5data1.mat')
    X = raw['X']
    y = raw['y'].reshape(-1)

    Xval = raw['Xval']
    yval = raw['yval'].reshape(-1)
    print("Check plot")
    # pt.plot_learning_curve(ut.create_design(X), y, ut.create_design(Xval), yval, 0)

    print("\nFitting polynomial regression:")
    p = 8
    X_poly = ut.poly_features(X, p)
    X_poly, mu, sigma = ut.normalize_features(X_poly)
    X_poly = ut.create_design(X_poly)

    Xval = ut.poly_features(Xval, p)
    Xval -= mu
    Xval /= sigma
    Xval = ut.create_design(Xval)

    l = 0.01
    theta = alg.parametrize_linear(X_poly, y, l)

    print("Check plot, l =", l)
    pt.fit_plot(X, y, mu, sigma, theta, p)
    pt.plot_learning_curve(X_poly, y, Xval, yval, l)

    print("\nOptimize regularization:")
    print("Check plot")

    l = pt.plot_validation_curve(X_poly, y, Xval, yval)

    Xtest = raw['Xtest']
    ytest = raw['ytest'].reshape(-1)
    Xtest = ut.poly_features(Xtest, p)
    Xtest -= mu
    Xtest /= sigma
    Xtest = ut.create_design(Xtest)

    theta = alg.parametrize_linear(X_poly, y, l)
    print("3.8599 / ", alg.SSD(theta, Xtest, ytest, 0))

    print("\nRandomized learning curve:")
    print("Check plot")
    pt.plot_randomized_learning_curve(X_poly, y, Xval, yval, 0.01)
    return
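The test above drives pt.plot_learning_curve and alg.parametrize_linear, which are not shown. Below is a minimal, self-contained sketch of the learning-curve computation such a helper presumably performs for regularized linear regression, written with plain numpy (the closed-form fit and error function are assumptions about what alg.parametrize_linear and alg.SSD do):

import numpy as np

def fit_ridge(X, y, lam):
    # Regularized normal equation; the bias column (first) is not penalized.
    # pinv keeps the smallest training sets from blowing up when lam == 0.
    n = X.shape[1]
    L = np.eye(n)
    L[0, 0] = 0.0
    return np.linalg.pinv(X.T @ X + lam * L) @ (X.T @ y)

def squared_error(theta, X, y):
    # Unregularized squared error / (2m), the quantity plotted on learning curves.
    m = X.shape[0]
    r = X @ theta - y
    return r @ r / (2 * m)

def learning_curve_points(X, y, Xval, yval, lam):
    train_err, val_err = [], []
    for m in range(1, X.shape[0] + 1):
        theta = fit_ridge(X[:m], y[:m], lam)            # train on the first m examples
        train_err.append(squared_error(theta, X[:m], y[:m]))
        val_err.append(squared_error(theta, Xval, yval))  # always the full validation set
    return np.array(train_err), np.array(val_err)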
Example No. 2
def plot_learning_curves_across_topics(n_runs, start_idx, stop_idx, estimators_dict, comment=None):
  """
  TODO Most probably buggy
  """
  for topic_id, data in texts_vote_lists_truths_by_topic_id.iteritems():
    print 'Loading topic %s' % topic_id
    texts, vote_lists, truths = data
    n_documents = len(texts)

    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(texts)
    text_similarity = cosine_similarity(tfidf)

    x = np.arange(start_idx, stop_idx)

    y_by_estimator = dict( (estimator, []) for estimator in estimators_dict.keys() )

    for estimator_name, estimator_and_args in estimators_dict.iteritems():
      print 'Calculating for %s' % estimator_name
      estimator, args, active_pars = estimator_and_args
      if active_pars is None:
        sequences = Parallel(n_jobs=4)( delayed(get_accuracy_sequence)(estimator, stop_idx, texts, 
          vote_lists, truths, text_similarity, idx, False, *args) for idx in xrange(n_runs) )
      else:
        sequences = Parallel(n_jobs=4)( delayed(get_accuracy_sequence_active)(estimator, stop_idx, texts, 
          vote_lists, truths, text_similarity, active_pars, idx, False, *args) for idx in xrange(n_runs) )      

      good_slices = [ s[start_idx:] for s in sequences if s is not None ]
      if good_slices:
        results = np.vstack(good_slices)

        begin_accuracies = results[:, 0]
        end_accuracies = results[:, -1]
        
        begin_accuracies.dump("pickles/%s-%s-begin-accuracies--.pkl" % (topic_id, estimator_name) )
        end_accuracies.dump("pickles/%s-%s-end-accuracies--.pkl" % (topic_id, estimator_name))

        # We will then need to vstack and avg though all the topic accuracies for each estimator
        y_by_estimator[estimator_name].append( np.mean(results, axis=0) )
      else:
        print 'Topic %s is not represented with estimator %s' % (topic_id, estimator_name)

    result_by_estimator = {}

    for estimator_name, mean_accuracy_sequences in y_by_estimator.iteritems():
      if mean_accuracy_sequences:
        to_avg = np.vstack(mean_accuracy_sequences)
        result_by_estimator[estimator_name] = np.mean(to_avg, axis=0)
      else:
        print "Nope"
  if comment:
    title = 'Across topics, %s runs, %s' % (n_runs, comment)
  else:
    title = 'Across topics, %s runs' % n_runs
  plot_learning_curve(title, x, result_by_estimator, 'Votes sampled', 'Accuracy')
Example No. 3
    def cross_validate(self, test_size=0.25, n_iter=100):
        cv = cross_validation.ShuffleSplit(self.dataset.matrix.shape[0],
                                           n_iter=n_iter, test_size=test_size)

        title = "Learning Curves (Logistic Regression)"
        plot_learning_curve(self.clf, title,
                            self.dataset.matrix, self.dataset.labels,
                            cv=cv, n_jobs=4)

        scores = cross_validation.cross_val_score(self.clf,
                                                  self.dataset.matrix,
                                                  self.dataset.labels,
                                                  cv=cv)
        print scores
        print 'Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2)
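Examples No. 3 and No. 5 use the legacy sklearn.cross_validation module and Python 2 print statements. A hedged sketch of the same method against the current sklearn.model_selection API follows; plot_learning_curve is the same helper these examples assume (a minimal sketch of it appears after Example No. 6):

from sklearn.model_selection import ShuffleSplit, cross_val_score

def cross_validate(clf, X, labels, test_size=0.25, n_iter=100):
    # ShuffleSplit now takes n_splits instead of n_iter and no longer needs
    # the number of samples up front.
    cv = ShuffleSplit(n_splits=n_iter, test_size=test_size)

    plot_learning_curve(clf, "Learning Curves (Logistic Regression)",
                        X, labels, cv=cv, n_jobs=4)

    scores = cross_val_score(clf, X, labels, cv=cv)
    print(scores)
    print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))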
Example No. 4
def plot_learning_curves_for_topic(topic_id, n_runs, votes_per_doc, estimators_dict, comment=None):
  texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id]
  n_documents = len(texts)

  vectorizer = TfidfVectorizer()
  X = vectorizer.fit_transform(texts)
  text_similarity = cosine_similarity(X)

  min_votes_per_doc, max_votes_per_doc = votes_per_doc
  start_idx, stop_idx = int(min_votes_per_doc * n_documents), int(max_votes_per_doc * n_documents)
  x = np.arange(float(start_idx), float(stop_idx)) / n_documents

  estimator_y = {}

  for estimator_name, estimator_and_args in estimators_dict.iteritems():
    print 'Calculating for %s' % estimator_name
    estimator, args, active_pars = estimator_and_args
    if active_pars is None:
      sequences = Parallel(n_jobs=N_CORES)( delayed(get_accuracy_sequence)(estimator, stop_idx, texts, 
        vote_lists, truths, X, text_similarity, idx, False, *args) for idx in xrange(n_runs) )
    else:
      sequences = Parallel(n_jobs=N_CORES)( delayed(get_accuracy_sequence_active)(estimator, stop_idx, texts, 
        vote_lists, truths, X, text_similarity, active_pars, idx, False, *args) for idx in xrange(n_runs) )      

    good_slices = [ s[start_idx:] for s in sequences if s is not None ]
    if good_slices:
      results = np.vstack(good_slices)

      # Pickling is not necessary yet
      '''
      begin_accuracies = results[:, 0]
      middle_accuracies = results[:, int(results.shape[1] / 2)]
      end_accuracies = results[:, -1]

      begin_accuracies.dump("pickles/%s-%s-begin-accuracies---.pkl" % (topic_id, estimator_name) )
      '''

      estimator_y[estimator_name] = np.mean(results, axis=0)
    else:
      print 'Query %s is not represented with estimator %s' % (topic_id, estimator_name)

  if comment:
    title = 'Query %s, %s runs, %s' % (topic_id, n_runs, comment)
  else:
    title = 'Query %s, %s runs' % (topic_id, n_runs)
  plot_learning_curve(title, x, estimator_y, 'Votes per document', 'Accuracy')
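Examples No. 2 and No. 4 both fan the per-run accuracy simulations out with joblib. Stripped of the crowdsourcing specifics, the pattern reduces to the sketch below (one_run is a stand-in for get_accuracy_sequence):

from joblib import Parallel, delayed
import random

def one_run(seed):
    # Stand-in worker: any picklable function that returns a sequence works.
    random.seed(seed)
    return [random.random() for _ in range(5)]

# Runs one_run for each seed on 4 worker processes and collects results in order;
# None results can then be filtered out, as the examples above do with good_slices.
sequences = Parallel(n_jobs=4)(delayed(one_run)(seed) for seed in range(8))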
Example No. 5
    def cross_validate(self, test_size=0.25, n_iter=100):
        cv = cross_validation.ShuffleSplit(self.dataset.matrix.shape[0],
                                           n_iter=n_iter,
                                           test_size=test_size)

        title = "Learning Curves (Logistic Regression)"
        plot_learning_curve(self.clf,
                            title,
                            self.dataset.matrix,
                            self.dataset.labels,
                            cv=cv,
                            n_jobs=4)

        scores = cross_validation.cross_val_score(self.clf,
                                                  self.dataset.matrix,
                                                  self.dataset.labels,
                                                  cv=cv)
        print scores
        print 'Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2)
Example No. 6
    def run_experiment(self, dataset):
        # SVM
        if dataset.dataset_name == 'Diabetes Data Set':
            svm_simple = SVC()
            svm_simple.fit(dataset.train_x, dataset.train_y)
            y_pred = svm_simple.predict(dataset.test_x)
            print(classification_report(dataset.test_y, y_pred))
            # Create SVM classifier
            self.learner_model = SVC(C=15.0, kernel='linear', degree=3, gamma='scale',
                                     coef0=0.0, shrinking=True, probability=False,
                                     tol=1e-3, cache_size=200, class_weight=None,
                                     max_iter=-1, decision_function_shape='ovr',
                                     random_state=dataset.randomness
                                     )
            # Fit the classifier to the data
            self.learner_model.fit(dataset.train_x, dataset.train_y)
            scores = cross_val_score(self.learner_model, dataset.x, dataset.y, cv=10)
            print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                                      scores.std()),
                  end="\n\n")
            predictions = self.learner_model.predict(dataset.test_x)
            print("Classification Report")
            print(classification_report(predictions, dataset.test_y))
            curve = plots.plot_learning_curve(self.learner_model, "SVM Learning Curve", dataset.x, dataset.y,
                                              cv=5, n_jobs=4)
            curve.show(block=False)
            plt.show()

            ## TRAINING/TEST ACCURACY SCORE
            k_range = [10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
            train_accuracy = np.empty(len(k_range))
            test_accuracy = np.empty(len(k_range))
            for i, k in enumerate(k_range):
                svm = SVC(C=k, kernel='linear', degree=3, gamma='scale',
                          coef0=0.0, shrinking=True, probability=False,
                          tol=1e-3, cache_size=200, class_weight=None,
                          max_iter=-1, decision_function_shape='ovr',
                          random_state=dataset.randomness
                          )
                svm.fit(dataset.train_x, dataset.train_y)
                # Compute accuracy on the training set
                train_accuracy[i] = svm.score(dataset.train_x, dataset.train_y)

                # Compute accuracy on the testing set
                test_accuracy[i] = svm.score(dataset.test_x, dataset.test_y)

            # Visualization of k values vs accuracy

            plt.title('SVM: Varying C With Gamma as Scale and Kernel Linear')
            plt.plot(k_range, test_accuracy, label='Testing Accuracy')
            plt.plot(k_range, train_accuracy, label='Training Accuracy')
            plt.xticks(np.arange(min(k_range), max(k_range) + 1, 1.0))
            plt.legend()
            plt.xlabel('C')
            plt.ylabel('Accuracy')
            plt.show(block=False)
            plt.show()

            ## PLOT TIMINGS
            # Plot time taken for various sizes of database
            train_sizes = np.linspace(0.1, 0.9, 5)
            time_taken = np.empty(len(train_sizes))
            accuracy_scores = np.empty(len(train_sizes))
            for i, k in enumerate(train_sizes):
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset.x, dataset.y, test_size=k, random_state=42)
                start_time = time.time()

                self.learner_model.fit(X_train, y_train)
                y_pred = self.learner_model.predict(X_test)
                elapsed_time = time.time() - start_time
                time_taken[i] = elapsed_time
                accuracy_scores[i] = accuracy_score(y_pred, y_test)

            ## Plot Times taken by different models
            plt.title(f'SVM: Varying DataSet Sizes vs Time Taken for Dataset {dataset.dataset_name}')
            plt.plot(train_sizes, time_taken, label='Time Taken Vs TestData Size')
            plt.plot(train_sizes, accuracy_scores, label='Accuracy Vs TestData Size')
            plt.legend()
            plt.xlabel('TestData Size')
            plt.ylabel('Time Taken')
            plt.show(block=False)
            plt.show()

            print(f"Average time taken for SVM algorithms is {np.mean(time_taken)}")

            # Using Grid Search CV
            param_grid = {'C': [15, 20, 25, 30, 35, 40, 50],
                          'gamma': ['scale', 'auto'],
                          'kernel': ['linear', 'rbf']}

            grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, cv=3)
            grid.fit(dataset.train_x, dataset.train_y)
            # print best parameter after tuning
            print(f" Best params for the dataset is {grid.best_params_}")
            # print how our model looks after hyper-parameter tuning
            print(grid.best_estimator_)
            grid_predictions = grid.best_estimator_.predict(dataset.test_x)

            # print classification report
            print(
                f"The classification report for the best estimator is {classification_report(dataset.test_y, grid_predictions)}")
            learning_curve = plots.plot_learning_curve(grid.best_estimator_, "SVM Learning Curve For best estimator",
                                                       dataset.x, dataset.y,
                                                       cv=5, n_jobs=4)
            learning_curve.show(block=False)
            plt.show()
        else:

            svm_simple = SVC()
            svm_simple.fit(dataset.train_x, dataset.train_y)
            y_pred = svm_simple.predict(dataset.test_x)
            print(classification_report(dataset.test_y, y_pred))
            # Create SVM classifier
            self.learner_model = SVC(C=10.0, kernel='rbf', degree=3, gamma='scale',
                                     random_state=dataset.randomness)
            # Fit the classifier to the data
            self.learner_model.fit(dataset.train_x, dataset.train_y)
            scores = cross_val_score(self.learner_model, dataset.x, dataset.y, cv=10)
            print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                                      scores.std()),
                  end="\n\n")
            predictions = self.learner_model.predict(dataset.test_x)
            print("Classification Report")
            print(classification_report(predictions, dataset.test_y))
            curve = plots.plot_learning_curve(self.learner_model, "SVM Learning Curve", dataset.x, dataset.y,
                                              cv=5, n_jobs=4)
            curve.show(block=False)
            plt.show()

            ## TRAINING/TEST ACCURACY SCORE
            k_range = np.linspace(5, 15, 10)
            train_accuracy = np.empty(len(k_range))
            test_accuracy = np.empty(len(k_range))
            for i, k in enumerate(k_range):
                svm = SVC(C=k, kernel='rbf', degree=3, gamma='auto',
                          random_state=dataset.randomness
                          )
                svm.fit(dataset.train_x, dataset.train_y)
                # Compute accuracy on the training set
                train_accuracy[i] = svm.score(dataset.train_x, dataset.train_y)

                # Compute accuracy on the testing set
                test_accuracy[i] = svm.score(dataset.test_x, dataset.test_y)

            plt.title('SVM: Varying C With Gamma as Auto and Kernel RBF')
            plt.plot(k_range, test_accuracy, label='Testing Accuracy')
            plt.plot(k_range, train_accuracy, label='Training Accuracy')
            plt.xticks(np.arange(min(k_range), max(k_range) + 1, 1.0))
            plt.legend()
            plt.xlabel('C')
            plt.ylabel('Accuracy')
            plt.show(block=False)
            plt.show()

            ## PLOT TIMINGS
            # Plot time taken for various sizes of database
            train_sizes = np.linspace(0.1, 0.9, 5)
            time_taken = np.empty(len(train_sizes))
            accuracy_scores = np.empty(len(train_sizes))
            for i, k in enumerate(train_sizes):
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset.x, dataset.y, test_size=k, random_state=42)
                start_time = time.time()

                self.learner_model.fit(X_train, y_train)
                y_pred = self.learner_model.predict(X_test)
                elapsed_time = time.time() - start_time
                time_taken[i] = elapsed_time
                accuracy_scores[i] = accuracy_score(y_pred, y_test)

            ## Plot Times taken by different models
            plt.title(f'SVM: Varying DataSet Sizes vs Time Taken for Dataset {dataset.dataset_name}')
            plt.plot(train_sizes, time_taken, label='Time Taken Vs TestData Size')
            plt.plot(train_sizes, accuracy_scores, label='Accuracy Vs TestData Size')
            plt.legend()
            plt.xlabel('TestData Size')
            plt.ylabel('Time Taken')
            plt.show(block=False)
            plt.show()

            print(f"Average time taken for SVM algorithms is {np.mean(time_taken)}")

            # Using Grid Search CV
            param_grid = {'C': [10, 10.5, 10.6, 10.7, 10.8, 10.9, 11],
                          'gamma': ['scale', 'auto'],
                          'kernel': ['linear', 'rbf']}

            grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, cv=3)
            grid.fit(dataset.train_x, dataset.train_y)
            # print best parameter after tuning
            print(f" Best params for the dataset is {grid.best_params_}")
            # print how our model looks after hyper-parameter tuning
            print(grid.best_estimator_)
            grid_predictions = grid.best_estimator_.predict(dataset.test_x)

            # print classification report
            print(
                f"The classification report for the best estimator is {classification_report(dataset.test_y, grid_predictions)}")
            learning_curve = plots.plot_learning_curve(grid.best_estimator_, "SVM Learning Curve For best estimator",
                                                       dataset.x, dataset.y,
                                                       cv=5, n_jobs=4)
            learning_curve.show(block=False)
            plt.show()
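Nearly every snippet here calls a plots.plot_learning_curve(estimator, title, X, y, cv=..., n_jobs=...) helper that is never shown. The sketch below is an assumption about what such a helper looks like: it wraps sklearn.model_selection.learning_curve, matches the call signature used above, and returns plt so that .show() works:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None,
                        train_sizes=np.linspace(0.1, 1.0, 5), scoring=None):
    # Cross-validated train/validation scores for increasing training-set sizes.
    sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes, scoring=scoring)

    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.plot(sizes, train_scores.mean(axis=1), 'o-', label="Training score")
    plt.plot(sizes, test_scores.mean(axis=1), 'o-', label="Cross-validation score")
    plt.legend(loc="best")
    plt.grid()
    return plt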
Example No. 7
# create train-test splits
X, X_test, y, y_test = train_test_split(data_X,
                                        data_y,
                                        test_size=0.2,
                                        random_state=2018)

# plot learning curves with metrics of interest
lc_scoring = ['accuracy', 'precision', 'recall', 'roc_auc']

for scoring in lc_scoring:
    # load default model: Linear SVM with SGD
    clf_supervised = models.default_model()
    plt_handle = plots.plot_learning_curve(
        clf_supervised,
        'Supervised Learning Curve (Scorer: {})'.format(scoring),
        X,
        y,
        cv=5,
        scoring=scoring)
    plt_handle.show()

# train and report test results
clf_supervised = models.default_model()
clf_supervised.fit(X, y)
sup_y_test_preds = clf_supervised.predict(X_test)
supervised_results = {
    'accuracy': metrics.accuracy(y_test, sup_y_test_preds),
    'precision': metrics.precision(y_test, sup_y_test_preds),
    'recall': metrics.recall(y_test, sup_y_test_preds),
    'gmeans': metrics.g_means(y_test, sup_y_test_preds),
    'auc': metrics.auc(y_test, sup_y_test_preds),
}
Example No. 8
# Draw Histogram of errors on test and train
plots.dualHist(errors1 = rs.predict(X_test) - y_test, 
               errors2 = rs.predict(X_train) - y_train,
               label1 = 'test', 
               label2 = 'train', 
               title = 'Prediction Error: test vs train',
               xlabel = '$',
               hist_range = [-15,15])

#%%
title = 'Learning Curves (Random Forest Regression)'
# Cross-validation with 5 shuffle-split iterations to get smoother mean test and
# train score curves, each time with 20% of the data randomly held out for validation.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

plt = plots.plot_learning_curve(rf, title, X, y, cv=cv, n_jobs=5)

plt.show()
plt.close()

#%%
# Use classifier
from sklearn.ensemble import RandomForestClassifier

y = df['classification_y']

# Split df
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.8,
                                                    shuffle=False)
Example No. 9
    def run_experiment(self, dataset):
        # KNN WITH CROSS VALIDATION
        if dataset.dataset_name == 'Diabetes Data Set':
            k_range = np.arange(40, 50)
            train_accuracy = np.empty(len(k_range))
            test_accuracy = np.empty(len(k_range))
            for i, k in enumerate(k_range):
                knn = KNeighborsClassifier(n_neighbors=k,
                                           weights='uniform',
                                           metric='manhattan',
                                           n_jobs=4)
                knn.fit(dataset.train_x, dataset.train_y)
                # Compute accuracy on the training set
                train_accuracy[i] = knn.score(dataset.train_x, dataset.train_y)

                # Compute accuracy on the testing set
                test_accuracy[i] = knn.score(dataset.test_x, dataset.test_y)

            # Visualization of k values vs accuracy

            plt.title('k-NN: Varying Number of Neighbors')
            plt.plot(k_range, test_accuracy, label='Testing Accuracy')
            plt.plot(k_range, train_accuracy, label='Training Accuracy')
            plt.xticks(np.arange(min(k_range), max(k_range) + 1, 1.0))
            plt.legend()
            plt.xlabel('Number of Neighbors')
            plt.ylabel('Accuracy')
            plt.show(block=False)
            plt.show()
            # create new a knn model
            knn2 = KNeighborsClassifier(n_jobs=4)
            leaf_range = [1, 2, 3, 4]
            # create a dictionary of all values we want to test for n_neighbors
            param_grid = {
                'n_neighbors': k_range,
                'leaf_size': leaf_range,
                'weights': ['uniform', 'distance'],
                'metric': ['manhattan', 'euclidean'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
            }
            # use gridsearch to test all values for n_neighbors
            knn_gscv = GridSearchCV(knn2,
                                    param_grid,
                                    cv=3,
                                    n_jobs=-1,
                                    verbose=1)
            # fit model to data
            knn_gscv.fit(dataset.train_x, dataset.train_y)
            print(
                f"Best parameters for this knn algorithm are {knn_gscv.best_params_}"
            )
            print(
                f"Best score for this knn algorithm are {knn_gscv.best_score_}"
            )

            # Cross validation of model
            # scores = cross_val_score(knn_gscv.best_estimator_, dataset.x, dataset.y, scoring='accuracy')
            # print(f"CV Scores mean GSV Search : {scores.mean()} ")
            knn_gscv.best_estimator_.predict(dataset.test_x)
            print(
                f"Knn GSCV Score {knn_gscv.best_estimator_.score(dataset.test_x, dataset.test_y)}"
            )

            ## Learning Curve
            learning = plots.plot_learning_curve(knn_gscv.best_estimator_,
                                                 "Learning Curves (KNN)",
                                                 dataset.train_x,
                                                 dataset.train_y,
                                                 cv=5,
                                                 n_jobs=4)

            learning.show(block=False)
            plt.show()

            ## PLOT TIMINGS
            # Plot time taken for various sizes of database
            train_sizes = np.linspace(0.1, 0.9, 5)
            time_taken = np.empty(len(train_sizes))
            accuracy_scores = np.empty(len(train_sizes))
            for i, k in enumerate(train_sizes):
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset.x,
                    dataset.y,
                    test_size=k,
                    random_state=dataset.randomness)
                start_time = time.time()

                knn_gscv.best_estimator_.fit(X_train, y_train)
                y_pred = knn_gscv.best_estimator_.predict(X_test)
                elapsed_time = time.time() - start_time
                time_taken[i] = elapsed_time
                accuracy_scores[i] = accuracy_score(y_pred, y_test)

            ## Plot Times taken by different models
            plt.title(
                f'KNN:Time Taken & Accuracy vs TestSet Sizes for {dataset.dataset_name}'
            )
            plt.plot(train_sizes,
                     time_taken,
                     label='Time Taken Vs TestData Size')
            plt.plot(train_sizes,
                     accuracy_scores,
                     label='Accuracy Vs TestData Size')
            plt.legend()
            plt.xlabel('TestData Size')
            plt.ylabel('Time Taken')
            plt.show(block=False)
            plt.show()

            print(
                f"Average time taken for KNN algorithms is {np.mean(time_taken)}"
            )

        else:
            k_range = np.arange(6, 15)
            train_accuracy = np.empty(len(k_range))
            test_accuracy = np.empty(len(k_range))
            for i, k in enumerate(k_range):
                knn = KNeighborsClassifier(n_neighbors=k,
                                           weights='uniform',
                                           metric='manhattan',
                                           n_jobs=4)
                knn.fit(dataset.train_x, dataset.train_y)
                # Compute accuracy on the training set
                train_accuracy[i] = knn.score(dataset.train_x, dataset.train_y)

                # Compute accuracy on the testing set
                test_accuracy[i] = knn.score(dataset.test_x, dataset.test_y)

            # Visualization of k values vs accuracy

            plt.title('k-NN: Varying Number of Neighbors')
            plt.plot(k_range, test_accuracy, label='Testing Accuracy')
            plt.plot(k_range, train_accuracy, label='Training Accuracy')
            plt.xticks(np.arange(min(k_range), max(k_range) + 1, 1.0))
            plt.legend()
            plt.xlabel('Number of Neighbors')
            plt.ylabel('Accuracy')
            plt.show(block=False)
            plt.show()
            # create new a knn model
            knn2 = KNeighborsClassifier(n_jobs=4)
            # create a dictionary of all values we want to test for n_neighbors
            param_grid = {
                'n_neighbors': k_range,
                'weights': ['uniform', 'distance'],
                'metric': ['manhattan', 'euclidean']
            }
            # use gridsearch to test all values for n_neighbors
            knn_gscv = GridSearchCV(knn2,
                                    param_grid,
                                    cv=3,
                                    n_jobs=-1,
                                    verbose=1)
            # fit model to data
            knn_gscv.fit(dataset.train_x, dataset.train_y)
            print(
                f"Best parameters for this knn algorithm are {knn_gscv.best_params_}"
            )
            print(
                f"Best score for this knn algorithm are {knn_gscv.best_score_}"
            )

            # Cross validation of model
            scores = cross_val_score(knn_gscv.best_estimator_,
                                     dataset.x,
                                     dataset.y,
                                     scoring='accuracy')
            print(f"CV Scores mean GSV Search : {scores.mean()} ")
            knn_gscv.best_estimator_.predict(dataset.test_x)
            print(
                f"Knn GSCV Score {knn_gscv.best_estimator_.score(dataset.test_x, dataset.test_y)}"
            )

            ## Learning Curve
            learning = plots.plot_learning_curve(knn_gscv.best_estimator_,
                                                 "Learning Curves (KNN)",
                                                 dataset.train_x,
                                                 dataset.train_y,
                                                 cv=5,
                                                 n_jobs=4)

            learning.show(block=False)
            plt.show()

            ## PLOT TIMINGS
            # Plot time taken for various sizes of database
            train_sizes = np.linspace(0.1, 0.9, 5)
            time_taken = np.empty(len(train_sizes))
            accuracy_scores = np.empty(len(train_sizes))
            for i, k in enumerate(train_sizes):
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset.x,
                    dataset.y,
                    test_size=k,
                    random_state=dataset.randomness)
                start_time = time.time()

                knn_gscv.best_estimator_.fit(X_train, y_train)
                y_pred = knn_gscv.best_estimator_.predict(X_test)
                elapsed_time = time.time() - start_time
                time_taken[i] = elapsed_time
                accuracy_scores[i] = accuracy_score(y_pred, y_test)

            ## Plot Times taken by different models
            plt.title(
                f'KNN:Time Taken & Accuracy vs TestSet Sizes for {dataset.dataset_name}'
            )
            plt.plot(train_sizes,
                     time_taken,
                     label='Time Taken Vs TestData Size')
            plt.plot(train_sizes,
                     accuracy_scores,
                     label='Accuracy Vs TestData Size')
            plt.legend()
            plt.xlabel('TestData Size')
            plt.ylabel('Time Taken')
            plt.show(block=False)
            plt.show()

            print(
                f"Average time taken for KNN algorithms is {np.mean(time_taken)}"
            )
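The hand-rolled loops that sweep C (Example No. 6) or n_neighbors (Example No. 9) and record train/test accuracy on a single split can also be produced with cross-validation via sklearn.model_selection.validation_curve. A short sketch follows; the breast-cancer dataset and parameter range are illustrative stand-ins for dataset.x / dataset.y:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import validation_curve
from sklearn.neighbors import KNeighborsClassifier

X, y = load_breast_cancer(return_X_y=True)  # stand-in for dataset.x, dataset.y
k_range = np.arange(1, 16)

# Cross-validated scores for each candidate n_neighbors instead of a single
# train/test split, which gives a less noisy picture of over/underfitting.
train_scores, val_scores = validation_curve(
    KNeighborsClassifier(), X, y,
    param_name="n_neighbors", param_range=k_range, cv=5)

plt.plot(k_range, train_scores.mean(axis=1), label="Training accuracy")
plt.plot(k_range, val_scores.mean(axis=1), label="Cross-validation accuracy")
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
plt.legend()
plt.show()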
Example No. 10
    def run_experiment(self, dataset):
        if dataset.dataset_name == 'Diabetes Data Set':
            # Decision Tree without Pruning
            self.learner_model = tree.DecisionTreeClassifier(
                random_state=dataset.randomness)
            self.learner_model.fit(dataset.train_x, dataset.train_y)
            y_test_pred = self.learner_model.predict(dataset.test_x)
            print(
                f'Test score for simple tree {accuracy_score(y_test_pred, dataset.test_y)}'
            )

            clf = self.learner_model
            # Decision Tree with Post-Pruning
            path = clf.cost_complexity_pruning_path(dataset.train_x,
                                                    dataset.train_y)
            ccp_alphas, impurities = path.ccp_alphas, path.impurities
            # print(ccp_alphas)
            clfs = []
            for ccp_alpha in ccp_alphas:
                clf = tree.DecisionTreeClassifier(
                    random_state=dataset.randomness, ccp_alpha=ccp_alpha)
                clf.fit(dataset.train_x, dataset.train_y)
                clfs.append(clf)

            clfs = clfs[:-1]
            train_acc = []
            test_acc = []
            for c in clfs:
                y_train_pred = c.predict(dataset.train_x)
                y_test_pred = c.predict(dataset.test_x)
                train_acc.append(accuracy_score(y_train_pred, dataset.train_y))
                test_acc.append(accuracy_score(y_test_pred, dataset.test_y))

            plt.title('Decision Trees: Varying CCP ALPHAS')
            plt.plot(ccp_alphas[:-1], test_acc, label='Testing Accuracy')
            plt.plot(ccp_alphas[:-1], train_acc, label='Training Accuracy')
            # plt.xticks(np.arange(min(ccp_alphas), max(ccp_alphas) + 1, 1.0))
            plt.legend()
            plt.xlabel('CCP Alpha Values')
            plt.ylabel('Accuracy')
            plt.show(block=False)
            plt.show()

            clf_ = tree.DecisionTreeClassifier(random_state=dataset.randomness,
                                               ccp_alpha=0.01)
            clf_.fit(dataset.train_x, dataset.train_y)
            y_test_pred = clf_.predict(dataset.test_x)

            print(
                f'Test score cost complexity pruning {accuracy_score(y_test_pred, dataset.test_y)}'
            )

            ###### WHOLE NEW TEST
            param_grid = {
                "criterion": ["gini", "entropy"],
                "min_samples_split": [6, 7, 8],
                "max_depth": [5, 10, 15, 18],
                "min_samples_leaf": [1, 2, 4],
                "max_leaf_nodes": [26, 28, 29, 30, 32],
            }

            dt = DecisionTreeClassifier()
            ts_gs = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5)
            ts_gs.fit(dataset.train_x, dataset.train_y)
            model = ts_gs.best_estimator_

            # test the returned best parameters
            print("\n\n-- Testing best parameters [Grid]...")
            print(ts_gs.best_params_)
            y_test_pred = model.predict(dataset.test_x)
            print(f'Test score {accuracy_score(y_test_pred, dataset.test_y)}')

            learning_curve = plots.plot_learning_curve(
                ts_gs.best_estimator_,
                "Learning Curves (Decision Trees)",
                dataset.x,
                dataset.y,
                cv=5,
                n_jobs=4)

            learning_curve.show()

            k_range = np.arange(15, 25)
            train_accuracy = np.empty(len(k_range))
            test_accuracy = np.empty(len(k_range))
            for i, k in enumerate(k_range):
                dt_clf = DecisionTreeClassifier(criterion='entropy',
                                                max_depth=k,
                                                min_samples_leaf=2,
                                                min_samples_split=6,
                                                max_leaf_nodes=28)
                dt_clf.fit(dataset.train_x, dataset.train_y)
                # Compute accuracy on the training set
                train_accuracy[i] = dt_clf.score(dataset.train_x, dataset.train_y)

                # Compute accuracy on the testing set
                test_accuracy[i] = dt_clf.score(dataset.test_x, dataset.test_y)

            # Visualization of k values vs accuracy

            print(
                f"Average accuracy for all algorithms {np.mean(test_accuracy)}"
            )

            plt.title('Decision Trees: Varying max_depth')
            plt.plot(k_range, test_accuracy, label='Testing Accuracy')
            plt.plot(k_range, train_accuracy, label='Training Accuracy')
            plt.xticks(np.arange(min(k_range), max(k_range) + 1, 1.0))
            plt.legend()
            plt.xlabel('max_depth')
            plt.ylabel('Accuracy')
            plt.show(block=False)
            plt.show()
            ## PLOT TIMINGS
            # Plot time taken for various sizes of database
            train_sizes = np.linspace(0.1, 0.9, 5)
            time_taken = np.empty(len(train_sizes))
            accuracy_scores = np.empty(len(train_sizes))
            for i, k in enumerate(train_sizes):
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset.x,
                    dataset.y,
                    test_size=k,
                    random_state=dataset.randomness)
                start_time = time.time()

                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                elapsed_time = time.time() - start_time
                time_taken[i] = elapsed_time
                accuracy_scores[i] = accuracy_score(y_pred, y_test)

            ## Plot Times taken by different models
            plt.title(
                f'Decision Trees: Time Taken & Accuracy vs TestData Sizes for {dataset.dataset_name}'
            )
            plt.plot(train_sizes,
                     time_taken,
                     label='Time Taken Vs TestData Size')
            plt.plot(train_sizes,
                     accuracy_scores,
                     label='Accuracy Vs TestData Size')
            plt.legend()
            plt.xlabel('TestData Size')
            plt.ylabel('Time Taken')
            plt.show(block=False)
            plt.show()

            print(
                f"Average time taken for Decision Trees algorithms is {np.mean(time_taken)}"
            )
        else:
            # Decision Tree without Pruning
            self.learner_model = tree.DecisionTreeClassifier(
                random_state=dataset.randomness)
            self.learner_model.fit(dataset.train_x, dataset.train_y)
            y_test_pred = self.learner_model.predict(dataset.test_x)
            print(
                f'Test score for simple tree {accuracy_score(y_test_pred, dataset.test_y)}'
            )

            clf = self.learner_model
            # Decision Tree with Post-Pruning
            path = clf.cost_complexity_pruning_path(dataset.train_x,
                                                    dataset.train_y)
            ccp_alphas, impurities = path.ccp_alphas, path.impurities
            # print(ccp_alphas)
            clfs = []
            for ccp_alpha in ccp_alphas:
                clf = tree.DecisionTreeClassifier(
                    random_state=dataset.randomness, ccp_alpha=ccp_alpha)
                clf.fit(dataset.train_x, dataset.train_y)
                clfs.append(clf)

            clfs = clfs[:-1]
            train_acc = []
            test_acc = []
            for c in clfs:
                y_train_pred = c.predict(dataset.train_x)
                y_test_pred = c.predict(dataset.test_x)
                train_acc.append(accuracy_score(y_train_pred, dataset.train_y))
                test_acc.append(accuracy_score(y_test_pred, dataset.test_y))

            plt.title('Decision Trees: Varying CCP ALPHAS')
            plt.plot(ccp_alphas[:-1], test_acc, label='Testing Accuracy')
            plt.plot(ccp_alphas[:-1], train_acc, label='Training Accuracy')
            # plt.xticks(np.arange(min(ccp_alphas), max(ccp_alphas) + 1, 1.0))
            plt.legend()
            plt.xlabel('CCP Alpha Values')
            plt.ylabel('Accuracy')
            plt.show(block=False)
            plt.show()

            clf_ = tree.DecisionTreeClassifier(random_state=dataset.randomness,
                                               ccp_alpha=0.00234)
            clf_.fit(dataset.train_x, dataset.train_y)
            y_test_pred = clf_.predict(dataset.test_x)

            print(
                f'Test score cost complexity pruning {accuracy_score(y_test_pred, dataset.test_y)}'
            )

            ###### WHOLE NEW TEST
            param_grid = {
                "criterion": ["gini", "entropy"],
                "min_samples_split": [2, 3, 4],
                "max_depth": [5, 6, 7, 8, 10, 12, 14],
                "min_samples_leaf": [2, 3, 4, 6, 8],
                "max_leaf_nodes": [30, 32, 34, 36, 38, 40, 42, 44],
            }

            dt = DecisionTreeClassifier()
            ts_gs = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5)
            ts_gs.fit(dataset.train_x, dataset.train_y)
            model = ts_gs.best_estimator_

            # test the returned best parameters
            print("\n\n-- Testing best parameters [Grid]...")
            print(ts_gs.best_params_)
            y_test_pred = model.predict(dataset.test_x)
            print(f'Test score {accuracy_score(y_test_pred, dataset.test_y)}')

            learning_curve = plots.plot_learning_curve(
                ts_gs.best_estimator_,
                "Learning Curves (Decision Trees)",
                dataset.x,
                dataset.y,
                cv=5,
                n_jobs=4)

            learning_curve.show()

            k_range = np.arange(5, 15)
            train_accuracy = np.empty(len(k_range))
            test_accuracy = np.empty(len(k_range))
            for i, k in enumerate(k_range):
                dt_clf = DecisionTreeClassifier(criterion='entropy',
                                                max_depth=k,
                                                min_samples_leaf=2,
                                                min_samples_split=6,
                                                max_leaf_nodes=28)
                dt_clf.fit(dataset.train_x, dataset.train_y)
                # Compute accuracy on the training set
                train_accuracy[i] = dt_clf.score(dataset.train_x, dataset.train_y)

                # Compute accuracy on the testing set
                test_accuracy[i] = dt_clf.score(dataset.test_x, dataset.test_y)

            # Visualization of k values vs accuracy

            print(
                f"Average accuracy for all algorithms {np.mean(test_accuracy)}"
            )

            plt.title('Decision Trees: Varying max_depth')
            plt.plot(k_range, test_accuracy, label='Testing Accuracy')
            plt.plot(k_range, train_accuracy, label='Training Accuracy')
            plt.xticks(np.arange(min(k_range), max(k_range) + 1, 1.0))
            plt.legend()
            plt.xlabel('max_depth')
            plt.ylabel('Accuracy')
            plt.show(block=False)
            plt.show()

            ## PLOT TIMINGS
            # Plot time taken for various sizes of database
            train_sizes = np.linspace(0.1, 0.9, 5)
            time_taken = np.empty(len(train_sizes))
            accuracy_scores = np.empty(len(train_sizes))
            for i, k in enumerate(train_sizes):
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset.x,
                    dataset.y,
                    test_size=k,
                    random_state=dataset.randomness)
                start_time = time.time()

                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                elapsed_time = time.time() - start_time
                time_taken[i] = elapsed_time
                accuracy_scores[i] = accuracy_score(y_pred, y_test)

            ## Plot Times taken by different models
            plt.title(
                f'Decision Trees:Time Taken & Accuracy vs TestData Size for {dataset.dataset_name}'
            )
            plt.plot(train_sizes,
                     time_taken,
                     label='Time Taken Vs TestData Size')
            plt.plot(train_sizes,
                     accuracy_scores,
                     label='Accuracy Vs TestData Size')
            plt.legend()
            plt.xlabel('TestData Size')
            plt.ylabel('Time Taken')
            plt.show(block=False)
            plt.show()

            print(
                f"Average time taken for SVM algorithms is {np.mean(time_taken)}"
            )
Example No. 11
    def run_experiment(self, dataset):
        # Fit classification models (a pruned decision tree and an AdaBoost ensemble)
        if dataset.dataset_name == 'Diabetes Data Set':
            regr_1 = tree.DecisionTreeClassifier(random_state=dataset.randomness,
                                                 ccp_alpha=0.01
                                                 )
            regr_2 = AdaBoostClassifier(regr_1)

            param_grid = {
                "base_estimator__splitter": ["best", "random"],
                "n_estimators": [50, 100, 1, 2, 10, 20, 30, 40]
            }

            regr_1.fit(dataset.train_x, dataset.train_y)
            regr_2.fit(dataset.train_x, dataset.train_y)

            # Predict
            y_1 = regr_1.predict(dataset.test_x)
            y_2 = regr_2.predict(dataset.test_x)

            # Plot the results
            print(f"Accuracy of the model regr_1 is {accuracy_score(y_1, dataset.test_y)}")
            print("Classification Report")
            print(classification_report(y_1, dataset.test_y))

            print(f"Accuracy of the model regr_2 is {accuracy_score(y_2, dataset.test_y)}")
            print("Classification Report")
            print(classification_report(y_2, dataset.test_y))

            # evaluate the model
            cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
            n_scores = cross_val_score(regr_2, dataset.x, dataset.y, scoring='accuracy', cv=cv, n_jobs=-1,
                                       error_score='raise')

            print(f"Cross Validation Score is {np.mean(n_scores)}")

            # Grid Search
            # run grid search
            grid_search = GridSearchCV(regr_2, param_grid=param_grid, scoring='accuracy', cv=5)
            # execute the grid search
            grid_result = grid_search.fit(dataset.train_x, dataset.train_y)
            # summarize the best score and configuration
            print(f"Best parameters are {grid_result.best_params_}")

            predictions = grid_result.best_estimator_.predict(dataset.test_x)
            print(f"Accuracy of the model is {accuracy_score(predictions, dataset.test_y)}")
            print("Classification Report of Model")
            print(classification_report(predictions, dataset.test_y))

            learning_curve = plots.plot_learning_curve(grid_result.best_estimator_, "Learning Curves (AdaBoost)",
                                                       dataset.x, dataset.y,
                                                       cv=5, n_jobs=4)

            learning_curve.show(block=False)
        else:
            regr_1 = tree.DecisionTreeClassifier(random_state=dataset.randomness,
                                                 ccp_alpha=0.00234
                                                 )
            regr_2 = AdaBoostClassifier(regr_1)

            param_grid = {
                "base_estimator__splitter": ["best", "random"],
                "n_estimators": [50, 100,150,200]
            }

            regr_1.fit(dataset.train_x, dataset.train_y)
            regr_2.fit(dataset.train_x, dataset.train_y)

            # Predict
            y_1 = regr_1.predict(dataset.test_x)
            y_2 = regr_2.predict(dataset.test_x)

            # Plot the results
            print(f"Accuracy of the model regr_1 is {accuracy_score(y_1, dataset.test_y)}")
            print("Classification Report")
            print(classification_report(y_1, dataset.test_y))

            print(f"Accuracy of the model regr_2 is {accuracy_score(y_2, dataset.test_y)}")
            print("Classification Report")
            print(classification_report(y_2, dataset.test_y))

            # evaluate the model
            cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
            n_scores = cross_val_score(regr_2, dataset.x, dataset.y, scoring='accuracy', cv=cv, n_jobs=-1,
                                       error_score='raise')

            print(f"Cross Validation Score is {np.mean(n_scores)}")

            # Grid Search
            # run grid search
            grid_search = GridSearchCV(regr_2, param_grid=param_grid, scoring='accuracy', cv=5)
            # execute the grid search
            grid_result = grid_search.fit(dataset.train_x, dataset.train_y)
            # summarize the best score and configuration
            print(f"Best parameters are {grid_result.best_params_}")

            predictions = grid_result.best_estimator_.predict(dataset.test_x)
            print(f"Accuracy of the model is {accuracy_score(predictions, dataset.test_y)}")
            print("Classification Report of Model")
            print(classification_report(predictions, dataset.test_y))

            learning_curve = plots.plot_learning_curve(grid_result.best_estimator_, "Learning Curves (AdaBoost)",
                                                       dataset.x, dataset.y,
                                                       cv=5, n_jobs=4)

            learning_curve.show(block=False)
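One portability note on Example No. 11: in scikit-learn 1.2+ the AdaBoostClassifier argument base_estimator was renamed to estimator, so the grid key base_estimator__splitter becomes estimator__splitter. A sketch of the same search under that assumption (the fit call is left commented since it depends on the dataset object above):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Same boosted-tree grid search as above, written for scikit-learn >= 1.2,
# where the base learner is passed as `estimator`.
boosted = AdaBoostClassifier(estimator=DecisionTreeClassifier(ccp_alpha=0.01))
param_grid = {
    "estimator__splitter": ["best", "random"],
    "n_estimators": [10, 20, 30, 40, 50, 100],
}
grid_search = GridSearchCV(boosted, param_grid=param_grid,
                           scoring='accuracy', cv=5)
# grid_search.fit(dataset.train_x, dataset.train_y) as in the example above.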