# Example #1 (score: 0)
def main():
    """Train via model selection and persist/report the best random forest.

    Builds the dataset, splits it, runs model selection, pickles the
    winning random-forest estimator, then prints its class name and
    reports quality and feature importances on the held-out test set.
    """
    df = create_dataset()
    X, y = feature_target_split(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    # cv_num=5 or higher would be preferable, but cv_num=2 keeps
    # local runs fast.
    rf_best = run_models(X_train, y_train, cv_num=2)

    with open('picklefiles/rf.pkl', 'wb') as output:
        pickle.dump(rf_best, output, pickle.HIGHEST_PROTOCOL)

    print(rf_best.__class__.__name__)
    quality(X_test, y_test, rf_best)
    # Bug fix: original passed undefined `gd_best` here (NameError);
    # only the random forest exists in this function.
    feature_importance(X_test, y_test, rf_best, df)
def main():
    """Train RF, gradient-boosting, and AdaBoost models; pickle and report each.

    Builds the dataset, splits it, runs 5-fold model selection, persists
    each fitted estimator, then prints each model's class name and its
    quality and feature importances on the held-out test set.
    """
    df = create_dataset()
    X, y = feature_target_split(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rf_best, gd_best, ada_best = run_models(X_train, y_train, cv_num=5)

    # Persist each fitted estimator.
    # Bug fix: gd.pkl previously dumped the feature matrix `X` instead
    # of the gradient-boosting model.
    for path, model in (('picklefiles/rf.pkl', rf_best),
                        ('picklefiles/gd.pkl', gd_best),
                        ('picklefiles/ada.pkl', ada_best)):
        with open(path, 'wb') as output:
            pickle.dump(model, output, pickle.HIGHEST_PROTOCOL)

    # Report each model consistently against itself.
    # Bug fix: the original swapped rf_best/gd_best in the
    # feature_importance calls and duplicated the whole reporting
    # section verbatim.
    for model in (rf_best, gd_best, ada_best):
        print(model.__class__.__name__)
        quality(X_test, y_test, model)
        feature_importance(X_test, y_test, model, df)


if __name__ == '__main__':
    # Script entry point: build the dataset, split it, then
    # cross-validate and report a random forest.
    df = create_dataset()
    X, y = feature_target_split(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Random Forest
    rf = RandomForestClassifier()
    # Bug fix: original passed undefined `train_x`/`train_y` — the
    # split above binds `X_train`/`y_train`.
    cross_val(rf, X_train, y_train, num_folds=5)
    print(rf.__class__.__name__)
    quality(X_test, y_test, rf)
    feature_importance(X_test, y_test, rf, df)
def create_not_change_angle_data():
    """Create the train and validation datasets with augmentation enabled.

    NOTE(review): the function name says "not_change_angle", yet
    ``is_change_angle=True`` is passed for both splits — confirm which
    is intended before relying on the name.
    """
    for split_name, sample_count in (("train", 1000), ("valid", 10)):
        create_data.create_dataset(split_name,
                                   sample_count,
                                   is_change_brightness=True,
                                   is_change_angle=True)
# Example #5 (score: 0)
def main(train_spectrum_path=r"dataset/train_spectrum.npy",
         test_spectrum_path=r"dataset/test_spectrum.npy",
         train_labels_path=r"dataset/train_labels.npy",
         test_labels_path=r"dataset/test_labels.npy",
         batch_size=1,
         learning_rate=0.01,
         num_epochs=20,
         kernel_size=(1, 2),
         padding=(0, 0),
         dropout=True,
         drop_prob=0.2,
         batch_normalization=True,
         weight_decay=True,
         weight_decay_amount=0.01,
         data_width=2100,
         model_save_path=r"model.pth",
         fruits=("apple", "banana", "mix"),
         create_dataset_now=False,
         root_dir="YOMIRAN",
         num_channels_layer1=3,
         num_channels_layer2=6,
         sample_time="after 5",
         sample_location="anal",
         sample_type="pos",
         tolerance=5,
         number_of_samples_to_alter=100,
         size_of_dataset=60000,
         train_data_percentage=0.8,
         train_now=False,
         show_statistics=True,
         predict_now=False,
         file_to_predict=r"apple neg.txt",
         confidence_threshold=0.7,
         validate_hierarchy=True,
         validate_filename_format=True,
         validate_empty_file=True,
         create_dataset_progress_bar_intvar=None,
         fc1_amount_output_nodes=1000,
         fc2_amount_output_nodes=500,
         fc3_amount_output_node=100,
         stretch_data=False,
         knn=False,
         cross_validation_iterations=1,
         n_components=2,
         k="auto"):
    """End-to-end spectrum-classification pipeline driver.

    Three independently-switchable stages, run in order:

    * ``create_dataset_now`` — validate the raw sample files under
      ``root_dir`` and write the train/test spectrum and label ``.npy``
      files to the ``*_path`` locations.
    * ``train_now`` — train a classifier over
      ``cross_validation_iterations`` rounds, reshuffling the data
      between rounds. ``knn=True`` selects a KNN model (with
      ``k="auto"`` resolved to ceil(sqrt(train-set size))); otherwise a
      CNN is built from the layer/kernel/regularization parameters and
      trained for ``num_epochs``. Mean/stdev test accuracy is printed
      and, for the CNN, plots are shown when ``show_statistics`` is set.
    * ``predict_now`` — fit a PCA on the existing data, reload the model
      from ``model_save_path``, and classify ``file_to_predict``.

    Returns:
        ``(confidence, prediction)`` when ``predict_now`` is true,
        otherwise ``None`` (implicit).

    NOTE(review): ``fruit_label_enum`` is only bound when ``fruits`` is
    truthy (or via the deprecated in-loop fallback in the CNN branch);
    with a falsy ``fruits``, the KNN and predict paths would raise
    NameError — confirm callers always pass a non-empty ``fruits``.
    """
    # create data set
    if create_dataset_now:
        # Keep only files that pass the enabled validations; invalid
        # files are discarded (second tuple element ignored).
        valid_files, _ = get_valid_and_invalid_files(
            root_dir=root_dir,
            validate_empty_file=validate_empty_file,
            validate_filename_format=validate_filename_format,
            validate_hierarchy=validate_hierarchy)
        create_dataset(data_files=valid_files,
                       fruits=fruits,
                       size_of_dataset=size_of_dataset,
                       train_data_percentage=train_data_percentage,
                       tolerance=tolerance,
                       number_of_samples_to_alter=number_of_samples_to_alter,
                       train_spectrum_path=Path(train_spectrum_path),
                       train_labels_path=Path(train_labels_path),
                       test_spectrum_path=Path(test_spectrum_path),
                       test_labels_path=Path(test_labels_path),
                       data_width=data_width,
                       sample_time=sample_time,
                       sample_location=sample_location,
                       create_dataset_progress_bar_intvar=
                       create_dataset_progress_bar_intvar,
                       stretch_data=stretch_data,
                       sample_type=sample_type,
                       n_components=n_components)

    # transformation of dataset: tensor conversion followed by min-max
    # scaling (`compose` is a project helper — presumably applies the
    # transforms in the given order; confirm against its definition).
    transform = compose(transforms.ToTensor(), minmax_scale)
    # get the labels enum
    if fruits:
        fruit_label_enum = create_fruit_labels(fruits=fruits)
    # transform = transforms.ToTensor()

    if train_now:
        # Get the dataset
        train_data_loader = DataLoader("train",
                                       train_spectrum_path=train_spectrum_path,
                                       train_labels_path=train_labels_path,
                                       batch_size=batch_size,
                                       transform=transform)
        test_data_loader = DataLoader("test",
                                      test_spectrum_path=test_spectrum_path,
                                      test_labels_path=test_labels_path,
                                      batch_size=batch_size,
                                      transform=transform)

        if knn:
            # Count training samples on a deep copy so the real loader's
            # iteration state is untouched.
            train_data_loader_size_calculator = copy.deepcopy(
                train_data_loader)
            amount_train_data = 0
            for spectrum, labels in train_data_loader_size_calculator.load_data(
            ):
                amount_train_data += spectrum.shape[0]

            # Common KNN heuristic: k = ceil(sqrt(N)).
            if k == "auto":
                k = math.ceil(math.sqrt(amount_train_data))

            cross_validation_accuracies = []
            cross_validation_true_labels = []
            cross_validation_predictions = []
            for i in range(cross_validation_iterations):
                print("Cross validation iteration: {}/{}".format(
                    i + 1, cross_validation_iterations))
                # Get the dataset (fresh loaders each round; note KNN
                # uses batch_size=1 regardless of the parameter).
                train_data_loader = DataLoader(
                    "train",
                    train_spectrum_path=train_spectrum_path,
                    train_labels_path=train_labels_path,
                    batch_size=1,
                    transform=transform)
                test_data_loader = DataLoader(
                    "test",
                    test_spectrum_path=test_spectrum_path,
                    test_labels_path=test_labels_path,
                    batch_size=1,
                    transform=transform)
                model = KNN(k=k,
                            train_data_loader=train_data_loader,
                            test_data_loader=test_data_loader)
                accuracy, true_labels, predictions = model.train()
                cross_validation_accuracies.append(accuracy * 100)
                cross_validation_true_labels.extend(true_labels)
                cross_validation_predictions.extend(predictions)
                print("k={}\tAccuracy: {:.3f}%".format(k, accuracy * 100))

                # Re-split train/test on disk for the next round.
                shuffle_data_for_cross_validation(
                    train_data_loader=train_data_loader,
                    test_data_loader=test_data_loader,
                    train_spectrum_path=train_spectrum_path,
                    train_labels_path=train_labels_path,
                    test_spectrum_path=test_spectrum_path,
                    test_labels_path=test_labels_path)

            # NOTE: stat.stdev raises StatisticsError when
            # cross_validation_iterations == 1 (needs >= 2 data points).
            accuracies_mean = stat.mean(cross_validation_accuracies)
            accuracies_std = stat.stdev(cross_validation_accuracies)
            print("Test accuracies mean: {}".format(accuracies_mean))
            print("Test accuracies standard deviation: {}".format(
                accuracies_std))
            plot_data.plot_box_plot(
                test_accuracies=cross_validation_accuracies)
            plot_data.plot_confusion_matrix(
                true_labels=cross_validation_true_labels,
                predictions=cross_validation_predictions,
                fruits=fruits,
                show_null_values=True,
                show_plot=True)
            # for k in range(1, amount_train_data):
            #     model = KNN(k=k, train_data_loader=train_data_loader, test_data_loader=test_data_loader)
            #     accuracy = model.train()
            #     print("k={}\tAccuracy: {:.3f}%".format(k, accuracy * 100))

        else:
            # CNN branch. Count training samples (and, deprecated,
            # collect labels when no fruits were given) on a copy of
            # the loader.
            train_data_loader_size_calculator = copy.deepcopy(
                train_data_loader)
            amount_train_data = 0
            fruits_from_dataset = []
            for spectrum, labels in train_data_loader_size_calculator.load_data(
            ):
                amount_train_data += spectrum.shape[0]

                # deprecated
                if fruits is None:
                    for label in labels:
                        if label not in fruits_from_dataset:
                            fruits_from_dataset.append(label)

            # deprecated
            if fruits is None:
                fruit_label_enum = create_fruit_labels(
                    fruits=fruits_from_dataset)
                fruits = fruits_from_dataset

            cross_validation_losses = []
            cross_validation_accuracies_train = []
            cross_validation_accuracies_test = []
            cross_validation_true_labels = []
            cross_validation_predictions_of_last_epoch = []
            statistics_of_all_iterations = []
            for i in range(cross_validation_iterations):
                print("Cross validation iteration: {}/{}".format(
                    i + 1, cross_validation_iterations))
                # Get the dataset (fresh loaders each round)
                train_data_loader = DataLoader(
                    "train",
                    train_spectrum_path=train_spectrum_path,
                    train_labels_path=train_labels_path,
                    batch_size=batch_size,
                    transform=transform)
                test_data_loader = DataLoader(
                    "test",
                    test_spectrum_path=test_spectrum_path,
                    test_labels_path=test_labels_path,
                    batch_size=batch_size,
                    transform=transform)

                # initialize the neural net (data_height=1: spectra are
                # treated as 1-pixel-tall images)
                model = CNN(amount_of_labels=len(fruit_label_enum),
                            batch_normalization=batch_normalization,
                            dropout=dropout,
                            drop_prob=drop_prob,
                            kernel_size=kernel_size,
                            padding=padding,
                            data_width=data_width,
                            data_height=1,
                            num_channels_layer1=num_channels_layer1,
                            num_channels_layer2=num_channels_layer2,
                            fc1_amount_output_nodes=fc1_amount_output_nodes,
                            fc2_amount_output_nodes=fc2_amount_output_nodes,
                            fc3_amount_output_node=fc3_amount_output_node,
                            n_components=n_components)

                # train the model
                statistics = train_model(
                    model=model,
                    fruit_label_enum=fruit_label_enum,
                    train_data_loader=train_data_loader,
                    test_data_loader=test_data_loader,
                    num_epochs=num_epochs,
                    learning_rate=learning_rate,
                    batch_size=batch_size,
                    weight_decay=weight_decay,
                    weight_decay_amount=weight_decay_amount,
                    model_save_path=model_save_path,
                    train_dataset_size=amount_train_data)

                # Keep only each round's final-epoch numbers for the
                # cross-validation summary.
                statistics_of_all_iterations.append(statistics)
                losses, accuracies_train, accuracies_test, true_labels, predictions_of_last_epoch = statistics
                cross_validation_losses.extend([losses[-1]])
                cross_validation_accuracies_train.extend(
                    [accuracies_train[-1]])
                cross_validation_accuracies_test.extend([accuracies_test[-1]])
                # NOTE(review): the trailing comma below makes this
                # statement a discarded 1-tuple — harmless, but likely a
                # typo.
                cross_validation_true_labels.extend(true_labels),
                cross_validation_predictions_of_last_epoch.extend(
                    list(predictions_of_last_epoch))

                # Re-split train/test on disk for the next round.
                shuffle_data_for_cross_validation(
                    train_data_loader=train_data_loader,
                    test_data_loader=test_data_loader,
                    train_spectrum_path=train_spectrum_path,
                    train_labels_path=train_labels_path,
                    test_spectrum_path=test_spectrum_path,
                    test_labels_path=test_labels_path)

            # NOTE: stat.stdev raises StatisticsError when
            # cross_validation_iterations == 1 (needs >= 2 data points).
            accuracies_test_mean = stat.mean(cross_validation_accuracies_test)
            accuracies_test_std = stat.stdev(cross_validation_accuracies_test)
            print("Test accuracies mean: {}".format(accuracies_test_mean))
            print("Test accuracies standard deviation: {}".format(
                accuracies_test_std))
            # plot_data.plot_box_plot(test_accuracies=cross_validation_accuracies_test, show_plot=True)

            # plot the statistics
            if show_statistics:
                # plot_data.plot_train_statistics(x_values=range(len(losses)), y_values=losses, x_label="Epoch",
                #                                 y_label="Loss")
                # plot_data.plot_train_statistics(x_values=range(len(accuracies_train)), y_values=accuracies_train,
                #                                 x_label="Epoch", y_label="Train accuracy")
                # plot_data.plot_train_statistics(x_values=range(len(accuracies_test)), y_values=accuracies_test,
                #                                 x_label="Epoch", y_label="Test accuracy")

                # max_test_accuracy = max(cross_validation_accuracies_test)
                # statistics = statistics_of_all_iterations[cross_validation_accuracies_test.index(max_test_accuracy)]
                # losses, accuracies_train, accuracies_test, true_labels, predictions_of_last_epoch = statistics
                # plot_data.plot_train_statistics1(losses=losses, train_accuracy=accuracies_train,
                #                                  test_accuracy=accuracies_test)

                plot_data.plot_box_plot(
                    test_accuracies=cross_validation_accuracies_test)
                plot_data.plot_confusion_matrix(
                    true_labels=cross_validation_true_labels,
                    predictions=cross_validation_predictions_of_last_epoch,
                    fruits=fruits,
                    show_null_values=True)
                plot_data.plot_classification_report(
                    true_labels=cross_validation_true_labels,
                    predictions=cross_validation_predictions_of_last_epoch,
                    show_plot=True)

                # Plot per-epoch curves for the best round (highest
                # final test accuracy).
                max_test_accuracy = max(cross_validation_accuracies_test)
                statistics = statistics_of_all_iterations[
                    cross_validation_accuracies_test.index(max_test_accuracy)]
                losses, accuracies_train, accuracies_test, true_labels, predictions_of_last_epoch = statistics
                plot_data.plot_train_statistics1(
                    losses=losses,
                    train_accuracy=accuracies_train,
                    test_accuracy=accuracies_test,
                    show_plot=True)

    if predict_now:

        # fit the pca: re-validate the raw files to rebuild the data the
        # PCA was originally fitted on
        valid_files, _ = get_valid_and_invalid_files(
            root_dir=root_dir,
            validate_empty_file=validate_empty_file,
            validate_filename_format=validate_filename_format,
            validate_hierarchy=validate_hierarchy)

        # Get existing data
        existing_data, _ = get_existing_data(fruits=fruits,
                                             data_files=valid_files,
                                             sample_time=sample_time,
                                             sample_location=sample_location,
                                             data_width=data_width,
                                             sample_type=sample_type)

        # fit pca
        pca = PCA(n_components=n_components)
        pca = pca.fit(existing_data)

        # Rebuild the CNN with the same architecture parameters used at
        # training time and load the saved weights.
        model = load_model(model_save_path,
                           amount_of_labels=len(fruit_label_enum),
                           batch_normalization=batch_normalization,
                           dropout=dropout,
                           drop_prob=drop_prob,
                           kernel_size=kernel_size,
                           padding=padding,
                           data_width=data_width,
                           data_height=1,
                           num_channels_layer1=num_channels_layer1,
                           num_channels_layer2=num_channels_layer2,
                           fc1_amount_output_nodes=fc1_amount_output_nodes,
                           fc2_amount_output_nodes=fc2_amount_output_nodes,
                           fc3_amount_output_node=fc3_amount_output_node,
                           n_components=n_components)

        confidence, prediction = predict(
            model=model,
            data_file=file_to_predict,
            pca=pca,
            transform=transform,
            fruit_label_enum=fruit_label_enum,
            data_width=data_width,
            confidence_threshold=confidence_threshold)

        return confidence, prediction