Example #1
def mnist_digit_recognition():
    train_set, test_set = load_mnist_dataset()
    n_labels = 10  # digits 0-9
    n_features = 28 * 28

    draw_ex_images(5, 4, train_set[0].shape[0], train_set[0])

    mnist_model = GaussianNaiveBayes(n_labels, n_features)
    start = time.time()
    mnist_model.train(train_set[0], train_set[1])
    end = time.time()
    print(f"training time: {end - start:.2f} s")
    mnist_model.save_model()
    mean, var, pi = mnist_model.get_parameters()
    print(f"Model parameters: mean {mean}, var {var}, pi {pi}")

    test_data, labels = test_set
    limit = 150
    test_data, labels = test_data[:limit], labels[:limit]
    results = np.zeros(limit, dtype=int)  # preallocate predicted labels
    for n in range(limit):
        results[n] = mnist_model.classify(test_data[n])
        print(f"{n} : predicted {results[n]}, correct {labels[n]}")

    print("recognition rate: ", (results == labels).mean())
Example #2
def test_bayes():

    iris = load_iris()
    # DataFrame input is required
    df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                      columns=iris['feature_names'] + ['target'])
    X, y = df.drop('target', axis=1), df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=1, stratify=y, shuffle=True)

    clas = GaussianNaiveBayes()
    clas.fit(X_train, y_train)

    pred2 = clas.predict(X_test)
    print("F1: ", f1_score(pred2, y_test, average='micro'))
Example #3
def figure_1(train_set_results, X_test, y_test):
    """Learning curves best SVMvs GNB"""
    best_SVM_params = max(
        [(statistics.mean(group_results['test_score']),
          group_results['estimator'][0].best_params_)
         for group_results in train_set_results['tf']['plain'].values()],
        key=lambda x: x[0])
    best_SVM = Pipeline([("estimator", SVC(max_iter=1000000))])
    best_SVM.set_params(**best_SVM_params[1])
    plot_learning_curves_macro(best_SVM, GaussianNaiveBayes(), "SVM", "GNB",
                               "SVM vs GNB learning curves", X_test, y_test)
Example #4
def naive_bayes_test():
    n_labels, n_features = 2, 2
    nb = GaussianNaiveBayes(n_labels, n_features)

    # prepare sample training data
    data1 = np.random.multivariate_normal([1,4], [[2,0],[0,2]], size=100)
    data2 = np.random.multivariate_normal([5,7], [[3,0],[0,1]], size=100)
    data = np.concatenate((data1, data2), axis=0)

    # prepare training label data
    labels = np.concatenate((np.array([0]*100), np.array([1]*100)), axis=0)
    print "correct labels"
    print labels

    # nb.load()
    nb.train(data, labels)
    # nb.save()

    results = nb.classify(data)
    print "predicted labels"
    print results
    
    print "recognition rate: ", (results == labels).mean()
Example #5
def mnist_digit_recognition():
    train_set, valid_set, test_set = load_mnist_dataset("mnist.pkl.gz")
    n_labels = 10  # digits 0-9
    n_features = 28*28
    mnist_model = GaussianNaiveBayes(n_labels, n_features)
    mnist_model.train(train_set[0], train_set[1])
    [mean, var], pi = mnist_model.get_parameters()

    # visualization of learned means
    create_2D_images_horizontal(mean, w=28, h=28)
    show()

    test_data, labels = test_set
    # slice
    #limit = len(test_data)
    limit = 50
    test_data, labels = test_data[:limit], labels[:limit]
    results = np.zeros(limit, dtype=int)  # preallocate predicted labels
    for n in range(limit):
        results[n] = mnist_model.classify(test_data[n])
        print "%d : predicted %s, correct %s" % (n, results[n], labels[n])
    # results = mnist_model.classify(test_data)

    print "recognition rate: ", (results == labels).mean()
Example #7
def main():
    """
    Loads the dataset and either computes or loads precomputed results and using them (or not) completes task
    designated by the task variable
    :return: Happiness
    """
    from pathlib import Path
    class_names = ["ham", "spam"]

    X, y = load_set()
    # X, _, y, _ = train_test_split(X,y,train_size=0.12) <- use to make dataset smaller for testing

    recompute = False

    pickled_train_set_results = "gridsearch.pickle"

    # load the precomputed results file if exists
    if Path(pickled_train_set_results).is_file() and not recompute:
        with open(pickled_train_set_results, 'rb') as file:
            results = pickle.load(file)
    else:
        # compute the test results (may take a long time)
        pipelines = {
            "plain":
            Pipeline([("estimator", SVC(max_iter=1000000))]),
            "normalized":
            Pipeline([("preprocessing", Normalizer()),
                      ("estimator", SVC(max_iter=1000000))]),
            "scaled":
            Pipeline([("preprocessing", MaxAbsScaler()),
                      ("estimator", SVC(max_iter=1000000))]),
        }

        datasets = {"tf": X, "tf_idf": transform_tf_to_tf_idf(X)}
        Cs = [10.0**x for x in range(-2, 5)]
        gammas = [10.0**x for x in range(-8, 1)] + ["scale"]
        param_grid = {
            "svm-rbf": {
                'estimator__kernel': ['rbf'],
                'estimator__gamma': gammas,
                'estimator__C': [10.0**x for x in range(-1, 10)]
            },
            "svm-linear": {
                'estimator__kernel': ['linear'],
                'estimator__C': Cs
            },
            "svm-poly-d2-c00": {
                'estimator__kernel': ['poly'],
                'estimator__C': Cs,
                'estimator__degree': [2]
            },
            "svm-poly-d2-c01": {
                'estimator__kernel': ['poly'],
                'estimator__C': Cs,
                'estimator__degree': [2],
                'estimator__coef0': [1]
            },
            "svm-poly-d3-c00": {
                'estimator__kernel': ['poly'],
                'estimator__C': Cs,
                'estimator__degree': [3]
            },
            "svm-poly-d3-c01": {
                'estimator__kernel': ['poly'],
                'estimator__C': Cs,
                'estimator__degree': [3],
                'estimator__coef0': [1]
            },
            "gnb-sk": {
                'estimator': [GaussianNB()]
            },
            "gnb-my": {
                'estimator': [GaussianNaiveBayes()]
            },
            "multinomialnb": {
                'estimator': [MultinomialNB()]
            }
        }

        results = {
            dataset: {
                pipeline: {group: {}
                           for group in param_grid.keys()}
                for pipeline in pipelines.keys()
            }
            for dataset in datasets.keys()
        }
        param_search_cv = StratifiedKFold(n_splits=5,
                                          shuffle=True,
                                          random_state=42)
        eval_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        # simple progress reporting over the dataset / pipeline / group loops
        for i, (dataset_name, dataset) in enumerate(datasets.items(), start=1):
            print(f"Processing dataset {i} of {len(datasets)}")
            for j, (pipeline_name, pipeline) in enumerate(pipelines.items(), start=1):
                print(f"Processing pipeline {j} of {len(pipelines)}")
                for k, (group_name, group) in enumerate(param_grid.items(), start=1):
                    print(f"Processing group {k} of {len(param_grid)}")
                    param_search = GridSearchCV(pipeline,
                                                group,
                                                n_jobs=-1,
                                                cv=param_search_cv,
                                                verbose=1)
                    cv_results = cross_validate(param_search,
                                                dataset,
                                                y,
                                                cv=eval_cv,
                                                return_estimator=True,
                                                verbose=1)

                    results[dataset_name][pipeline_name][
                        group_name] = cv_results
        # save the results for future reuse
        with open(pickled_train_set_results, 'wb') as file:
            pickle.dump(results, file)

    # select your task
    task = "table_7"  # <-----here

    if task == "table_1":
        table_1(results)
    elif task == "table_2":
        table_2(results)
    elif task == "table_3":
        table_3(results)
    elif task == "table_4":
        table_4(results)
    elif task == "table_5":
        table_5(results)
    elif task == "table_6":
        table_6(results)
    elif task == "table_7":
        table_7(results)
    elif task == "table_8":
        table_8(results)
    elif task == "figure_1":
        figure_1(results, X, y)
    elif task == "figure_2":
        figure_2(results, X, y)
    elif task == "figure_3":
        figure_3(results, X, y, class_names)
    elif task == "figure_4":
        figure_4(X, y, class_names)
    elif task == "figure_5":
        figure_5(X, y, class_names)
    elif task == "experiment":
        experiment(X, y)
    elif task == "histograms":
        plot_feature_histogram(minmax_scale(X), y, feature=7)
    else:
        print(f"Unknown task; {task}")