def main():
    """Build the dataset, fit the best random forest, pickle it, and report quality.

    Side effects: writes the fitted model to 'picklefiles/rf.pkl' and prints
    the model class name plus quality / feature-importance reports.
    """
    df = create_dataset()
    X, y = feature_target_split(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    # I would use cv_num = 5 or higher, but it takes too long to run
    # locally, so cv_num=2 is used here.
    rf_best = run_models(X_train, y_train, cv_num=2)
    with open('picklefiles/rf.pkl', 'wb') as output:
        pickle.dump(rf_best, output, pickle.HIGHEST_PROTOCOL)
    print(rf_best.__class__.__name__)
    quality(X_test, y_test, rf_best)
    # BUG FIX: original passed `gd_best`, which is undefined in this variant —
    # run_models returns only rf_best here.
    feature_importance(X_test, y_test, rf_best, df)
def main():
    """Fit the best RF, GB and AdaBoost models, pickle each, and report quality.

    Side effects: writes the three fitted models to 'picklefiles/{rf,gd,ada}.pkl'
    and prints, per model, its class name plus quality / feature-importance
    reports on the held-out test split.
    """
    df = create_dataset()
    X, y = feature_target_split(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rf_best, gd_best, ada_best = run_models(X_train, y_train, cv_num=5)
    with open('picklefiles/rf.pkl', 'wb') as output:
        pickle.dump(rf_best, output, pickle.HIGHEST_PROTOCOL)
    # BUG FIX: original pickled the feature matrix `X` into gd.pkl instead of
    # the fitted gradient-boosting model.
    with open('picklefiles/gd.pkl', 'wb') as output:
        pickle.dump(gd_best, output, pickle.HIGHEST_PROTOCOL)
    with open('picklefiles/ada.pkl', 'wb') as output:
        pickle.dump(ada_best, output, pickle.HIGHEST_PROTOCOL)

    # BUG FIX: feature_importance calls were cross-wired in the original
    # (rf section passed gd_best and vice versa); each model now reports on itself.
    print(rf_best.__class__.__name__)
    quality(X_test, y_test, rf_best)
    feature_importance(X_test, y_test, rf_best, df)

    print(gd_best.__class__.__name__)
    quality(X_test, y_test, gd_best)
    feature_importance(X_test, y_test, gd_best, df)

    print(ada_best.__class__.__name__)
    quality(X_test, y_test, ada_best)
    feature_importance(X_test, y_test, ada_best, df)
# NOTE(review): removed the orphaned module-level print/quality/feature_importance
# calls that preceded this guard — they referenced names (rf_best, gd_best,
# ada_best, X_test, ...) that are undefined at module scope and would raise
# NameError on import; they duplicate the reporting already done inside main().
if __name__ == '__main__':
    df = create_dataset()
    X, y = feature_target_split(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Random Forest
    rf = RandomForestClassifier()
    # BUG FIX: original called cross_val(rf, train_x, train_y, ...) with
    # undefined names; the split above produced X_train / y_train.
    cross_val(rf, X_train, y_train, num_folds=5)
    print(rf.__class__.__name__)
    quality(X_test, y_test, rf)
    feature_importance(X_test, y_test, rf, df)

    # # Gradient Boosting
    # gd = GradientBoostingClassifier()
    # cross_val(gd, X_train, y_train, num_folds=5)
    # print(gd.__class__.__name__)
    # quality(X_test, y_test, gd)
def create_not_change_angle_data():
    """Create train (1000 samples) and valid (10 samples) datasets with
    brightness augmentation but WITHOUT angle augmentation.

    BUG FIX: the original passed is_change_angle=True, contradicting the
    function's name; angle augmentation is now disabled as the name promises.
    """
    create_data.create_dataset("train", 1000, is_change_brightness=True,
                               is_change_angle=False)
    create_data.create_dataset("valid", 10, is_change_brightness=True,
                               is_change_angle=False)
def main(train_spectrum_path=r"dataset/train_spectrum.npy",
         test_spectrum_path=r"dataset/test_spectrum.npy",
         train_labels_path=r"dataset/train_labels.npy",
         test_labels_path=r"dataset/test_labels.npy",
         batch_size=1,
         learning_rate=0.01,
         num_epochs=20,
         kernel_size=(1, 2),
         padding=(0, 0),
         dropout=True,
         drop_prob=0.2,
         batch_normalization=True,
         weight_decay=True,
         weight_decay_amount=0.01,
         data_width=2100,
         model_save_path=r"model.pth",
         fruits=("apple", "banana", "mix"),
         create_dataset_now=False,
         root_dir="YOMIRAN",
         num_channels_layer1=3,
         num_channels_layer2=6,
         sample_time="after 5",
         sample_location="anal",
         sample_type="pos",
         tolerance=5,
         number_of_samples_to_alter=100,
         size_of_dataset=60000,
         train_data_percentage=0.8,
         train_now=False,
         show_statistics=True,
         predict_now=False,
         file_to_predict=r"apple neg.txt",
         confidence_threshold=0.7,
         validate_hierarchy=True,
         validate_filename_format=True,
         validate_empty_file=True,
         create_dataset_progress_bar_intvar=None,
         fc1_amount_output_nodes=1000,
         fc2_amount_output_nodes=500,
         fc3_amount_output_node=100,
         stretch_data=False,
         knn=False,
         cross_validation_iterations=1,
         n_components=2,
         k="auto"):
    """Driver for the spectrum fruit-classifier pipeline.

    Performs up to three optional stages, each gated by a boolean flag:

    * ``create_dataset_now`` — validate the raw files under ``root_dir`` and
      build the train/test .npy dataset files.
    * ``train_now`` — train either a KNN (``knn=True``) or a CNN
      (``knn=False``) with ``cross_validation_iterations`` rounds, reshuffling
      the train/test split between rounds; optionally plot statistics.
    * ``predict_now`` — fit a PCA on existing data, load the saved model and
      classify ``file_to_predict``; only this path produces a return value
      (``(confidence, prediction)``), otherwise the function returns None.

    NOTE(review): with ``cross_validation_iterations=1`` (the default),
    ``stat.stdev`` below is called on a single value and raises — confirm
    callers always pass >= 2 when training.
    """
    # create data set
    if create_dataset_now:
        valid_files, _ = get_valid_and_invalid_files(
            root_dir=root_dir,
            validate_empty_file=validate_empty_file,
            validate_filename_format=validate_filename_format,
            validate_hierarchy=validate_hierarchy)
        create_dataset(data_files=valid_files,
                       fruits=fruits,
                       size_of_dataset=size_of_dataset,
                       train_data_percentage=train_data_percentage,
                       tolerance=tolerance,
                       number_of_samples_to_alter=number_of_samples_to_alter,
                       train_spectrum_path=Path(train_spectrum_path),
                       train_labels_path=Path(train_labels_path),
                       test_spectrum_path=Path(test_spectrum_path),
                       test_labels_path=Path(test_labels_path),
                       data_width=data_width,
                       sample_time=sample_time,
                       sample_location=sample_location,
                       create_dataset_progress_bar_intvar=create_dataset_progress_bar_intvar,
                       stretch_data=stretch_data,
                       sample_type=sample_type,
                       n_components=n_components)

    # transformation of dataset
    # NOTE(review): `compose` and `minmax_scale` are project helpers defined
    # elsewhere — presumably chains ToTensor with min-max scaling; confirm.
    transform = compose(transforms.ToTensor(), minmax_scale)

    # get the labels enum
    # NOTE(review): if `fruits` is falsy here, `fruit_label_enum` stays
    # undefined and the predict path below would raise NameError — confirm
    # callers always supply fruits when predicting.
    if fruits:
        fruit_label_enum = create_fruit_labels(fruits=fruits)
    # transform = transforms.ToTensor()

    if train_now:
        # Get the dataset
        train_data_loader = DataLoader("train",
                                       train_spectrum_path=train_spectrum_path,
                                       train_labels_path=train_labels_path,
                                       batch_size=batch_size,
                                       transform=transform)
        test_data_loader = DataLoader("test",
                                      test_spectrum_path=test_spectrum_path,
                                      test_labels_path=test_labels_path,
                                      batch_size=batch_size,
                                      transform=transform)

        if knn:
            # Count the training samples on a deep copy so the real loader's
            # iteration state is untouched.
            train_data_loader_size_calculator = copy.deepcopy(train_data_loader)
            amount_train_data = 0
            for spectrum, labels in train_data_loader_size_calculator.load_data():
                amount_train_data += spectrum.shape[0]

            # Rule-of-thumb default: k = ceil(sqrt(N)).
            if k == "auto":
                k = math.ceil(math.sqrt(amount_train_data))

            cross_validation_accuracies = []
            cross_validation_true_labels = []
            cross_validation_predictions = []
            for i in range(cross_validation_iterations):
                print("Cross validation iteration: {}/{}".format(
                    i + 1, cross_validation_iterations))
                # Get the dataset (fresh loaders each round; batch_size=1 for KNN)
                train_data_loader = DataLoader(
                    "train",
                    train_spectrum_path=train_spectrum_path,
                    train_labels_path=train_labels_path,
                    batch_size=1,
                    transform=transform)
                test_data_loader = DataLoader(
                    "test",
                    test_spectrum_path=test_spectrum_path,
                    test_labels_path=test_labels_path,
                    batch_size=1,
                    transform=transform)
                model = KNN(k=k,
                            train_data_loader=train_data_loader,
                            test_data_loader=test_data_loader)
                accuracy, true_labels, predictions = model.train()
                cross_validation_accuracies.append(accuracy * 100)
                cross_validation_true_labels.extend(true_labels)
                cross_validation_predictions.extend(predictions)
                print("k={}\tAccuracy: {:.3f}%".format(k, accuracy * 100))
                # Re-split the data on disk so the next round sees a
                # different train/test partition.
                shuffle_data_for_cross_validation(
                    train_data_loader=train_data_loader,
                    test_data_loader=test_data_loader,
                    train_spectrum_path=train_spectrum_path,
                    train_labels_path=train_labels_path,
                    test_spectrum_path=test_spectrum_path,
                    test_labels_path=test_labels_path)

            # Aggregate statistics over all cross-validation rounds.
            accuracies_mean = stat.mean(cross_validation_accuracies)
            accuracies_std = stat.stdev(cross_validation_accuracies)
            print("Test accuracies mean: {}".format(accuracies_mean))
            print("Test accuracies standard deviation: {}".format(
                accuracies_std))
            plot_data.plot_box_plot(
                test_accuracies=cross_validation_accuracies)
            plot_data.plot_confusion_matrix(
                true_labels=cross_validation_true_labels,
                predictions=cross_validation_predictions,
                fruits=fruits,
                show_null_values=True,
                show_plot=True)
            # for k in range(1, amount_train_data):
            #     model = KNN(k=k, train_data_loader=train_data_loader, test_data_loader=test_data_loader)
            #     accuracy = model.train()
            #     print("k={}\tAccuracy: {:.3f}%".format(k, accuracy * 100))
        else:
            # CNN branch: count training samples (and, in a deprecated mode,
            # collect label names from the data when `fruits` is None).
            train_data_loader_size_calculator = copy.deepcopy(train_data_loader)
            amount_train_data = 0
            fruits_from_dataset = []
            for spectrum, labels in train_data_loader_size_calculator.load_data():
                amount_train_data += spectrum.shape[0]
                # deprecated
                if fruits is None:
                    for label in labels:
                        if label not in fruits_from_dataset:
                            fruits_from_dataset.append(label)
            # deprecated
            if fruits is None:
                fruit_label_enum = create_fruit_labels(
                    fruits=fruits_from_dataset)
                fruits = fruits_from_dataset

            cross_validation_losses = []
            cross_validation_accuracies_train = []
            cross_validation_accuracies_test = []
            cross_validation_true_labels = []
            cross_validation_predictions_of_last_epoch = []
            statistics_of_all_iterations = []
            for i in range(cross_validation_iterations):
                print("Cross validation iteration: {}/{}".format(
                    i + 1, cross_validation_iterations))
                # Get the dataset (fresh loaders each round)
                train_data_loader = DataLoader(
                    "train",
                    train_spectrum_path=train_spectrum_path,
                    train_labels_path=train_labels_path,
                    batch_size=batch_size,
                    transform=transform)
                test_data_loader = DataLoader(
                    "test",
                    test_spectrum_path=test_spectrum_path,
                    test_labels_path=test_labels_path,
                    batch_size=batch_size,
                    transform=transform)
                # initialize the neural net
                model = CNN(amount_of_labels=len(fruit_label_enum),
                            batch_normalization=batch_normalization,
                            dropout=dropout,
                            drop_prob=drop_prob,
                            kernel_size=kernel_size,
                            padding=padding,
                            data_width=data_width,
                            data_height=1,
                            num_channels_layer1=num_channels_layer1,
                            num_channels_layer2=num_channels_layer2,
                            fc1_amount_output_nodes=fc1_amount_output_nodes,
                            fc2_amount_output_nodes=fc2_amount_output_nodes,
                            fc3_amount_output_node=fc3_amount_output_node,
                            n_components=n_components)
                # train the model
                statistics = train_model(
                    model=model,
                    fruit_label_enum=fruit_label_enum,
                    train_data_loader=train_data_loader,
                    test_data_loader=test_data_loader,
                    num_epochs=num_epochs,
                    learning_rate=learning_rate,
                    batch_size=batch_size,
                    weight_decay=weight_decay,
                    weight_decay_amount=weight_decay_amount,
                    model_save_path=model_save_path,
                    train_dataset_size=amount_train_data)
                statistics_of_all_iterations.append(statistics)
                losses, accuracies_train, accuracies_test, true_labels, predictions_of_last_epoch = statistics
                # Keep only the final-epoch value from each round.
                cross_validation_losses.extend([losses[-1]])
                cross_validation_accuracies_train.extend(
                    [accuracies_train[-1]])
                cross_validation_accuracies_test.extend([accuracies_test[-1]])
                # NOTE(review): stray trailing comma below makes this a
                # 1-tuple expression statement — harmless but likely a typo.
                cross_validation_true_labels.extend(true_labels),
                cross_validation_predictions_of_last_epoch.extend(
                    list(predictions_of_last_epoch))
                # Re-split the data on disk for the next round.
                shuffle_data_for_cross_validation(
                    train_data_loader=train_data_loader,
                    test_data_loader=test_data_loader,
                    train_spectrum_path=train_spectrum_path,
                    train_labels_path=train_labels_path,
                    test_spectrum_path=test_spectrum_path,
                    test_labels_path=test_labels_path)

            accuracies_test_mean = stat.mean(cross_validation_accuracies_test)
            accuracies_test_std = stat.stdev(cross_validation_accuracies_test)
            print("Test accuracies mean: {}".format(accuracies_test_mean))
            print("Test accuracies standard deviation: {}".format(
                accuracies_test_std))
            # plot_data.plot_box_plot(test_accuracies=cross_validation_accuracies_test, show_plot=True)

            # plot the statistics
            if show_statistics:
                # plot_data.plot_train_statistics(x_values=range(len(losses)), y_values=losses,
                #                                 x_label="Epoch", y_label="Loss")
                # plot_data.plot_train_statistics(x_values=range(len(accuracies_train)), y_values=accuracies_train,
                #                                 x_label="Epoch", y_label="Train accuracy")
                # plot_data.plot_train_statistics(x_values=range(len(accuracies_test)), y_values=accuracies_test,
                #                                 x_label="Epoch", y_label="Test accuracy")
                # max_test_accuracy = max(cross_validation_accuracies_test)
                # statistics = statistics_of_all_iterations[cross_validation_accuracies_test.index(max_test_accuracy)]
                # losses, accuracies_train, accuracies_test, true_labels, predictions_of_last_epoch = statistics
                # plot_data.plot_train_statistics1(losses=losses, train_accuracy=accuracies_train,
                #                                  test_accuracy=accuracies_test)
                plot_data.plot_box_plot(
                    test_accuracies=cross_validation_accuracies_test)
                plot_data.plot_confusion_matrix(
                    true_labels=cross_validation_true_labels,
                    predictions=cross_validation_predictions_of_last_epoch,
                    fruits=fruits,
                    show_null_values=True)
                plot_data.plot_classification_report(
                    true_labels=cross_validation_true_labels,
                    predictions=cross_validation_predictions_of_last_epoch,
                    show_plot=True)
                # Re-plot the per-epoch curves of the best round.
                max_test_accuracy = max(cross_validation_accuracies_test)
                statistics = statistics_of_all_iterations[
                    cross_validation_accuracies_test.index(max_test_accuracy)]
                losses, accuracies_train, accuracies_test, true_labels, predictions_of_last_epoch = statistics
                plot_data.plot_train_statistics1(
                    losses=losses,
                    train_accuracy=accuracies_train,
                    test_accuracy=accuracies_test,
                    show_plot=True)

    if predict_now:
        # fit the pca
        valid_files, _ = get_valid_and_invalid_files(
            root_dir=root_dir,
            validate_empty_file=validate_empty_file,
            validate_filename_format=validate_filename_format,
            validate_hierarchy=validate_hierarchy)
        # Get existing data
        existing_data, _ = get_existing_data(fruits=fruits,
                                             data_files=valid_files,
                                             sample_time=sample_time,
                                             sample_location=sample_location,
                                             data_width=data_width,
                                             sample_type=sample_type)
        # fit pca
        pca = PCA(n_components=n_components)
        pca = pca.fit(existing_data)
        # Rebuild the network with the same hyperparameters and load the
        # saved weights from model_save_path.
        model = load_model(model_save_path,
                           amount_of_labels=len(fruit_label_enum),
                           batch_normalization=batch_normalization,
                           dropout=dropout,
                           drop_prob=drop_prob,
                           kernel_size=kernel_size,
                           padding=padding,
                           data_width=data_width,
                           data_height=1,
                           num_channels_layer1=num_channels_layer1,
                           num_channels_layer2=num_channels_layer2,
                           fc1_amount_output_nodes=fc1_amount_output_nodes,
                           fc2_amount_output_nodes=fc2_amount_output_nodes,
                           fc3_amount_output_node=fc3_amount_output_node,
                           n_components=n_components)
        confidence, prediction = predict(
            model=model,
            data_file=file_to_predict,
            pca=pca,
            transform=transform,
            fruit_label_enum=fruit_label_enum,
            data_width=data_width,
            confidence_threshold=confidence_threshold)
        return confidence, prediction