def test_vc():
    """Smoke-test the voting classifier on random data.

    Draws 51 random 32-dimensional integer feature vectors (values 0..3) and
    random binary labels, holds out 10% for validation, applies a
    variance-threshold feature selector, fits the classifier returned by
    ``create_voting_classifier()`` and reports the validation accuracy and
    confusion matrix.
    """
    print('###')
    print('test_vc')
    print('###')

    n_samples, n_features = 51, 32
    # One draw for the whole matrix instead of stacking row by row.
    features = np.random.randint(4, size=(n_samples, n_features))
    labels = np.random.randint(2, size=n_samples)
    print('features.shape: ', features.shape)
    print('labels.shape: ', labels.shape)

    x_train, x_valid, y_train, y_valid = train_test_split(
        features, labels, shuffle=True, stratify=labels,
        test_size=0.1, random_state=42)

    # Fit the selector on the training split only, then apply the very same
    # column selection to the validation split so that the classifier sees
    # identically shaped inputs at fit and predict time.
    ftrs_sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    x_train_sel = ftrs_sel.fit_transform(x_train)
    x_valid_sel = ftrs_sel.transform(x_valid)

    vc = create_voting_classifier()
    vc.fit(x_train_sel, y_train)

    y_pred = vc.predict(x_valid_sel)
    acc = accuracy_score(y_valid, y_pred)
    print('accuracy: ', acc)
    # NOTE(review): plot_cm is defined elsewhere; confirm that
    # (y_true, y_pred, path, threshold) is an accepted argument list.
    plot_cm(y_valid, y_pred, 'images/test', 0.5)
def run_experiment(dataset_df,
                   train_cols,
                   option='validation',
                   method='KNN',
                   n_neighbors=3,
                   hidden_layer_sizes=(100, ),
                   n_trees=100,
                   bagging=False,
                   bagging_kwargs=DEFAULT_BAGGING_KWARGS,
                   show_cm=True,
                   **kwargs):
    """Train one classifier on ``dataset_df`` and evaluate it.

    Args:
        dataset_df: dataset handed to ``split_train_val_test``; each resulting
            split is indexed by ``train_cols`` and by the ``'label'`` column.
        train_cols: non-empty collection of feature column names.
        option: ``'validation'`` / ``'val'`` to evaluate on the validation
            split, ``'test'`` to evaluate on the test split.
        method: one of ``'KNN'``, ``'SVM'``, ``'MLP'``, ``'RF'``, ``'LDA'``.
        n_neighbors: forwarded to the KNN model only.
        hidden_layer_sizes: forwarded to the MLP model only.
        n_trees: number of estimators for the random forest only.
        bagging: when true, wrap the model in a ``BaggingClassifier``.
        bagging_kwargs: keyword arguments for the ``BaggingClassifier``.
        show_cm: when true, plot the evaluation confusion matrix.
        **kwargs: extra keyword arguments for the selected model constructor.

    Returns:
        Tuple ``(accuracy, confusion_matrix)`` for the evaluated split.

    Raises:
        ValueError: if ``train_cols`` is empty, or ``option`` / ``method`` is
            not recognized.
    """
    # Cheap argument checks first, so a typo fails before any data is split
    # or any model is trained.
    if len(train_cols) == 0:
        raise ValueError('No columns provided to run_experiment()')
    if option not in ('validation', 'val', 'test'):
        raise ValueError(f'Option not recognized: {option}')

    # Choose the classifier
    if method == 'KNN':
        model = KNN(n_neighbors=n_neighbors, **kwargs)
    elif method == 'SVM':
        model = SVC(**kwargs)
    elif method == 'MLP':
        model = MLP(hidden_layer_sizes=hidden_layer_sizes, **kwargs)
    elif method == 'RF':
        model = RandomForestClassifier(n_estimators=n_trees, **kwargs)
    elif method == 'LDA':
        model = LDA(**kwargs)
    else:
        raise ValueError(f'Unknown model: {method}')

    if bagging:
        model = BaggingClassifier(model, **bagging_kwargs)

    # Split train, val, test
    train, val, test = split_train_val_test(dataset_df)

    # Select the columns used for training
    x_train = train[train_cols]
    y_train = train['label']

    eval_split = test if option == 'test' else val
    x_val = eval_split[train_cols]
    y_val = eval_split['label']

    # Train the classifier
    print('Training...')
    model.fit(x_train, y_train)

    # Evaluate the model (the training confusion matrix is not used)
    train_accuracy, _ = evaluate(model, x_train, y_train)
    val_accuracy, val_cm = evaluate(model, x_val, y_val)

    print(f'Accuracy: train: {train_accuracy}, {option}: {val_accuracy}')

    if show_cm:
        plot_cm(val_cm, title=f'{option} CM')

    return val_accuracy, val_cm
def classification(path_pos_class, path_neg_class, path_images):
    """
    # Notes
        Selects features with a tuned extra-trees model, then trains and
        evaluates a tuned voting classifier on an 80/20 stratified split.
        Prints accuracies and plots confusion matrices for both the
        validation and the training split.
    # Arguments
        - path_pos_class: string, location of the keras checkpoints holding
        the weights for the positive class images.
        - path_neg_class: string, location of the keras checkpoints holding
        the weights for the negative class images.
        - path_images: string, directory where the plots are written
        (eg: data/images).
    """
    features, labels = assemble_features_found(path_pos_class, path_neg_class,
                                               256, 256, 4, 2)
    print('Number of features: ', features.shape)
    print('Number of labels: ', labels.shape)

    # Tune an extra-trees model, then rebuild it with the tuned settings.
    # NOTE(review): tuning and fitting of the selector use every sample, i.e.
    # they run before the train/validation split below, so validation rows
    # influence the feature selection. Confirm this is intended.
    tuned = grid_search_for_extra_tree(ExtraTreesClassifier(n_estimators=50),
                                       features, labels)
    tuned_keys = ('criterion', 'max_depth', 'min_samples_split',
                  'min_samples_leaf', 'max_features')
    selector_forest = ExtraTreesClassifier(
        n_estimators=50, **{key: tuned[key] for key in tuned_keys})
    selector_forest = selector_forest.fit(features, labels)

    selector = SelectFromModel(selector_forest, prefit=True)
    x_transformed = selector.transform(features)
    print('Number of features after selection: ', x_transformed.shape)

    x_train, x_valid, y_train, y_valid = train_test_split(
        x_transformed, labels,
        shuffle=True, stratify=labels, test_size=0.2, random_state=42)

    # Names under which the three estimators are registered in the ensemble.
    name_dt, name_svm, name_knn = 'dt', 'svm', 'knn'

    untuned_vc = create_voting_classifier(name_dt, name_knn, name_svm)
    params_grid = grid_search_for_vc(untuned_vc, x_train, y_train,
                                     name_dt, name_knn, name_svm)
    vc = create_voting_classifier(name_dt, name_knn, name_svm,
                                  params_grid).fit(x_train, y_train)

    # Validation split
    print('x_valid.shape: ', x_valid.shape)
    y_pred = vc.predict(x_valid)
    acc = accuracy_score(y_valid, y_pred)
    print('Voting classifier accuracy(y_valid, y_pred): ', acc)
    plot_cm(y_valid, y_pred, path_images, 'valid', 0.5)

    # Training split
    y_pred_train = vc.predict(x_train)
    plot_cm(y_train, y_pred_train, path_images, 'train', 0.5)
    train_acc = accuracy_score(y_train, y_pred_train)
    print('Voting classifier accuracy(y_train, y_pred_train): ', train_acc)
def eval_saved_model(args):
    """Evaluate a previously saved model on the validation or test split.

    Reads from ``args``: ``feature_extractor``, ``is_test``, ``model_name``,
    ``exclude_mislabeled``, ``top_n``, ``errors`` and ``confusion_matrix``.
    Top-1 accuracy is always computed; top-n accuracy, the misclassification
    listing and the confusion-matrix plot are produced only when their flag
    is set.
    """
    print('Loading data...')
    # Only the validation and test splits are needed here.
    _, X_val, X_test, _, y_val, y_test = data.load_data(
        args.feature_extractor)
    if args.is_test:
        X, y = X_test, y_test
    else:
        X, y = X_val, y_val

    print('Loading model...')
    model = models.load_model(args.model_name)
    # The model name starts with its type: svm or mlp.
    model_type = args.model_name[:3]

    predictions = models.top_1_accuracy(model, X, y, args.exclude_mislabeled)

    if args.top_n:
        models.top_n_accuracy(model, X, y, args.top_n, model_type,
                              args.exclude_mislabeled)

    if args.errors:
        print('Finding misclassifications...')
        normalized_cm = utils.normalize_cm(utils.calculate_cm(predictions, y))
        misclassifications(normalized_cm)

    if args.confusion_matrix:
        print('Creating confusion matrix...')
        utils.plot_cm(predictions, y, f'images/cm_{args.model_name}.png')
def train(train_dataset, valid_dataset, validation_bool, test_dataset,
          fam_dict_path, num_column, num_trains, num_tests, test_file_path,
          args):
    """Train a DeepRfam model, then save the model, plots and reports.

    Args:
        train_dataset: generator passed to ``fit_generator``.
        valid_dataset: validation generator; used only if ``validation_bool``.
        validation_bool: whether to validate during training.
        test_dataset: generator passed to ``predict_generator``.
        fam_dict_path: dictionary file forwarded to the ``utils`` reporters.
        num_column: forwarded to the model as ``num_c``.
        num_trains: unused here.
        num_tests: unused here.
        test_file_path: text file whose first comma-separated field on each
            line is the integer true label; lines with an empty first field
            are skipped.
        args: parsed options (model hyper-parameters, ``log_dir``,
            ``log_name``, ``dataset``, ...).

    Side effects:
        Creates ``{args.log_dir}/{timestamp[_log_name]}`` and writes the model
        (.h5), the model structure image, the confusion matrix, the history
        plot and the classification report into it; appends one line to
        ``{args.log_dir}/history.csv``.
    """
    # load model
    # NOTE(review): args.keep_prob is forwarded as ``dropout_rate`` — confirm
    # which of the two quantities the option actually holds.
    model = rna_model.DeepRfam(seq_length=args.seq_length,
                               num_c=num_column,
                               num_filters=args.num_filters,
                               filter_sizes=args.filter_sizes,
                               dropout_rate=args.keep_prob,
                               num_classes=args.num_classes,
                               num_hidden=args.num_hidden)
    print(model.summary())

    # model compile
    # Look the optimizer class up by name instead of eval()-ing a string
    # that comes from the command line.
    optimizer_cls = getattr(optimizers, args.optimizer)
    model.compile(loss=args.loss_function,
                  optimizer=optimizer_cls(lr=args.learning_rate),
                  metrics=['accuracy'])

    # start and record training history
    fit_kwargs = dict(epochs=args.num_epochs,
                      verbose=1,
                      use_multiprocessing=True,
                      workers=6)
    if validation_bool:
        fit_kwargs['validation_data'] = valid_dataset
    train_history = model.fit_generator(train_dataset, **fit_kwargs)

    # =================================logging=================================
    local_time = time.strftime("%m-%d_%H-%M", time.localtime())

    # determine log file name and create the run directory
    if args.log_name is None:
        log_file_name = local_time
    else:
        log_file_name = local_time + '_' + args.log_name
    run_dir = f"{args.log_dir}/{log_file_name}"
    # exist_ok mirrors `mkdir -p`: a second run within the same minute must
    # not crash after training has already finished.
    os.makedirs(run_dir, exist_ok=True)

    # save model to .h5 file
    model.save(f"{run_dir}/{log_file_name}.h5")

    # save the image of model structure
    plot_model(model,
               to_file=f"{run_dir}/model_structure.png",
               show_shapes=True)

    # predict on the test set (deliberately without multiprocessing)
    prediction = model.predict_generator(test_dataset)

    # get the list of true labels
    label_list = []
    with open(test_file_path) as f:
        for line in f:
            seq_index = line.strip().split(',', 1)[0]
            if seq_index != '':
                label_list.append(int(seq_index))

    # Drop any surplus prediction rows so both sequences have equal length
    # (presumably the generator pads its final batch — TODO confirm).
    prediction = prediction[:len(label_list)]
    prediction_1d = np.array([np.argmax(row) for row in prediction])

    # save confusion matrix into .csv file
    utils.cm2csv(true_labels=label_list,
                 predicted_labels=prediction_1d,
                 dict_file=fam_dict_path,
                 save_dir=run_dir)
    accuracy = accuracy_score(label_list, prediction_1d)
    print('Accuracy:', accuracy)

    # generate the confusion matrix plot (only for 20 classes or fewer)
    if args.num_classes <= 20:
        utils.plot_cm(true_labels=label_list,
                      predicted_labels=prediction_1d,
                      dict_file=fam_dict_path,
                      title='Confusion Matrix',
                      save_dir=run_dir)

    # draw and save history plot
    utils.plot_history(train_history, run_dir)

    # save the classification report
    utils.classification_report(true_labels=label_list,
                                predicted_labels=prediction_1d,
                                dict_file=fam_dict_path,
                                save_dir=run_dir,
                                std_out=True)

    # append this run to the shared history .csv file
    with open(f"{args.log_dir}/history.csv", 'a') as history_file:
        print(
            f'{local_time},{log_file_name},{args.dataset},{accuracy},{str(args.filter_sizes).replace(","," ")},{args.num_filters},{args.batch_size},{args.num_epochs},{args.keep_prob},{args.num_hidden},{args.learning_rate},{args.loss_function},{args.optimizer}, ',
            file=history_file)
# Entry-point script: read the command-line options, train VGG_lite on the
# loaders returned by `get_data`, save the weights and the training/evaluation
# figures, then print the test accuracy.
args = parser.parse_args()
cifar_dir = args.cifar_root
fig_path = args.fig_path
validation_split = args.val_split
batch_size = args.batch_size
epochs = args.epochs
weight_path = args.weight_path
weight_decay = args.weight_decay
lr = args.lr
# NOTE(review): SEED is only stored here; nothing in this section passes it
# to a random number generator. Confirm that it is applied elsewhere.
SEED = args.seed  # set random seed (default as 1234)

# split train, val, test from `get_data` function
train_loader, val_loader, test_loader = get_data(cifar_dir=cifar_dir, batch_size=batch_size, augment=True, validation_split=validation_split)

# load model
model = VGG_lite()
# define loss
loss = nn.CrossEntropyLoss()
# train the model; `train` returns the trained model and its history
model, history = train(model, train_loader, val_loader, epochs, loss, batch_size, optimizer='adam', weight_decay=weight_decay, lr=lr)

# save the model according to `weight_path` from parser (default to './weights/final.pth')
torch.save(model.state_dict(), weight_path)

plot_history(history, fig_path)  # save figures

acc, cm, cm_norm = evaluate(model, test_loader)  # evaluate trained model on the test loader

plot_cm(cm, cm_norm, fig_path)  # save confusion matrix figures

# `acc` is multiplied by 100 for display and rounded to 4 decimals
print('Test Accuracy: {}%'.format(round(acc*100, 4)))  # print the model test accuracy
### Training ### model, history_training = train_model(model=model, hist=history_training, criterion=criterion, optimizer=optimizer, dataloaders=dataloaders, dataset_sizes=dataset_sizes, data_augment=DATA_AUGMENT, scheduler=lr_sched, num_epochs=EPOCHS, patience_es= 15) ### Testing ### history_training = test_model(model=model, hist=history_training, criterion=criterion, dataloaders=dataloaders, dataset_sizes=dataset_sizes) ### Save the model ### save_model(model=model, hist=history_training, trained_models_path=MODEL_PATH, model_type=MODEL_TYPE, do_save=SAVING) ### Plotting the losses ### plot_training(hist=history_training, graphs_path=GRAPHS_PATH, model_type=MODEL_TYPE, do_save=SAVING) ### Plotting the CM ### plot_cm(hist=history_training, graphs_path=GRAPHS_PATH, model_type=MODEL_TYPE, do_save=SAVING) ### Give the classification report ### classif_report(hist=history_training)
def task2():
    """Fine-tune MobileNet on the 7-class Task 2 images and report metrics.

    Replaces the top of an ImageNet-pretrained MobileNet with a dropout and a
    7-way softmax layer, trains the last 23 layers with balanced class
    weights, then evaluates on the test directory.

    Side effects:
        Prints model summaries and metrics, writes the raw softmax outputs to
        ``results2.csv`` and the confusion-matrix plot to ``second.png``.
    """
    # Create a MobileNet model
    mobile = MobileNet(weights='imagenet')

    # See a summary of the layers in the model
    mobile.summary()

    # Modify the model
    # Exclude the last 5 layers of the model
    x = mobile.layers[-6].output
    # Add a dropout and dense layer for predictions
    x = Dropout(0.25)(x)
    predictions = Dense(7, activation='softmax')(x)

    # Create a new model with the new outputs
    model = Model(inputs=mobile.input, outputs=predictions)

    # See a summary of the new layers in the model
    model.summary()

    # Freeze the weights of the layers that we aren't training
    # (training the last 23)
    for layer in model.layers[:-23]:
        layer.trainable = False

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    # Useful variables
    data_folder = '../res/Task 2/Training'
    test_folder = '../res/Task 2/Test'
    total_train = 8012
    total_test = 2003
    # NOTE(review): used as target names for class indices 0..6; confirm the
    # order matches the generator's class indices.
    labels = ["AK", "BCC", "BK", "D", "MN", "M", "VL"]
    batch_size = 100
    epochs = 10
    # Ceiling division: number of batches needed to see every test sample
    # exactly once, including the final partial batch.
    test_steps = (total_test + batch_size - 1) // batch_size

    # this is the augmentation configuration we will use for training
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

    # this is the augmentation configuration we will use for testing:
    # only rescaling
    test_datagen = ImageDataGenerator(rescale=1./255)

    train_generator = train_datagen.flow_from_directory(
        data_folder,
        class_mode='categorical',
        batch_size=batch_size,
        target_size=(224, 224))

    # shuffle=False keeps the batches in the same order as
    # `test_generator.classes`; with shuffling the predictions below could
    # not be matched to the true labels.
    test_generator = test_datagen.flow_from_directory(
        test_folder,
        class_mode='categorical',
        batch_size=batch_size,
        target_size=(224, 224),
        shuffle=False)

    # Try to deal with class imbalance: calculate class_weights so that the
    # minority classes have a larger weight than the majority classes.
    # Keyword arguments work with both old and new scikit-learn releases.
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_generator.classes),
        y=train_generator.classes)
    class_weights = dict(enumerate(class_weights))

    # Train the model
    model.fit_generator(
        train_generator,
        steps_per_epoch=total_train // batch_size,
        epochs=epochs,
        class_weight=class_weights
    )

    # Evaluate the model accuracy with the testing dataset
    scores = model.evaluate_generator(test_generator, test_steps)
    print("Test accuracy = ", scores[1])

    # Generate predictions with the test dataset
    # softmax returns a value for each class
    # the predicted class for a given sample will be the one that has the
    # maximum value
    # Rewind first so prediction starts at the first test sample even though
    # the evaluation above already advanced the generator.
    test_generator.reset()
    predictions = model.predict_generator(test_generator, test_steps)
    y_pred = np.argmax(predictions, axis=1)

    # Save the predictions in a csv file
    # newline='' lets the csv module control line endings (no blank rows on
    # Windows).
    with open('results2.csv', mode="w", newline='') as results_file:
        writer = csv.writer(results_file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerows(predictions)

    # Generate confusion matrix and classification report
    # Helps to evaluate metrics such as accuracy, precision, recall
    print('Confusion Matrix')
    cm = confusion_matrix(test_generator.classes, y_pred)
    print(cm)
    plot_cm(cm, labels, "second.png")
    print('Classification Report')
    print(classification_report(test_generator.classes, y_pred,
                                target_names=labels))
def task1_CNN():
    """Train a small binary CNN on the Task 1 images and report metrics.

    Builds a three-convolution network with a sigmoid output, trains it with
    class weights inversely proportional to class frequency, then evaluates
    on the test directory.

    Side effects:
        Prints metrics, writes the raw sigmoid outputs to ``results.csv`` and
        the confusion-matrix plot to ``first.png``.
    """
    # Build the model
    # NOTE(review): the pooling layers use data_format="channels_first" while
    # the convolutions take a (150, 150, 3) input with their default data
    # format — confirm this mix is intended.
    model = Sequential()
    model.add(Conv2D(16, (3, 3), input_shape=(150, 150, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_first"))
    model.add(BatchNormalization())

    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_first"))
    model.add(BatchNormalization())

    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_first"))

    # this converts our 3D feature maps to 1D feature vectors
    # add 1 dropout layer in order to prevent overfitting
    model.add(Flatten())
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # compile the model
    model.compile(loss='binary_crossentropy', optimizer='rmsprop',
                  metrics=['accuracy'])

    # Useful variables
    data_folder = '../res/Task 1/Training'
    test_folder = '../res/Task 1/Test'
    total_train = 900
    total_test = 379
    # Higher batch size than usual in order to have a higher probability of
    # encountering malignant samples in each batch
    batch_size = 100
    epochs = 50
    labels = ["bening", "malignant"]
    # Ceiling division: number of batches needed to see every test sample
    # exactly once, including the final partial batch.
    test_steps = (total_test + batch_size - 1) // batch_size

    # this is the augmentation configuration we will use for training
    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       shear_range=0.2,
                                       zoom_range=0.2,
                                       horizontal_flip=True)

    train_generator = train_datagen.flow_from_directory(
        data_folder,
        class_mode='binary',
        batch_size=batch_size,
        target_size=(150, 150),
    )

    # this is the augmentation configuration we will use for testing:
    # only rescaling
    test_datagen = ImageDataGenerator(rescale=1. / 255)

    # shuffle=False keeps the batches in the same order as
    # `test_generator.classes`; with shuffling the predictions below could
    # not be matched to the true labels.
    test_generator = test_datagen.flow_from_directory(
        test_folder,
        class_mode='binary',
        batch_size=batch_size,
        target_size=(150, 150),
        shuffle=False,
    )

    # Try to deal with class imbalance: weight every class by
    # (size of the largest class) / (size of this class), so that the rarer
    # class gets the larger weight.
    counter = Counter(train_generator.classes)
    max_val = float(max(counter.values()))
    class_weights = {
        class_id: max_val / num_images
        for class_id, num_images in counter.items()
    }

    # Train the model
    model.fit_generator(train_generator,
                        steps_per_epoch=total_train // batch_size,
                        epochs=epochs,
                        class_weight=class_weights)

    # Evaluate the model accuracy with the testing dataset
    scores = model.evaluate_generator(test_generator, test_steps)
    print("Test accuracy = ", scores[1])

    # Generate predictions with the test dataset
    # sigmoid returns a value between 0 and 1:
    # below 0.5 the sample is assigned class 0, from 0.5 upwards class 1;
    # the further the value is from 0.5, the more confident the model is.
    # Rewind first so prediction starts at the first test sample even though
    # the evaluation above already advanced the generator.
    test_generator.reset()
    predictions = model.predict_generator(test_generator, test_steps)
    predicted_classes = [1 * (row[0] >= 0.5) for row in predictions]

    # Save the predictions in a csv file
    # newline='' lets the csv module control line endings (no blank rows on
    # Windows).
    with open('results.csv', mode="w", newline='') as results_file:
        writer = csv.writer(results_file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerows(predictions)

    # Generate confusion matrix and classification report
    # Helps to evaluate metrics such as accuracy, precision, recall
    true_classes = test_generator.classes
    class_labels = list(test_generator.class_indices.keys())
    print('Confusion Matrix')
    cm = confusion_matrix(true_classes, predicted_classes)
    print(cm)
    plot_cm(cm, labels, "first.png")
    print('Classification Report')
    print(
        classification_report(true_classes,
                              predicted_classes,
                              target_names=class_labels))
def train(train_dataset, valid_dataset, validation__bool, test_dataset,
          label_list, fam_path, num_channels, num_trains, num_valids,
          num_tests, args):
    """Train an L5CFam model, then save the model, plots and reports.

    Args:
        train_dataset: generator passed to ``fit_generator``.
        valid_dataset: validation generator; used only if
            ``validation__bool``.
        validation__bool: whether to validate during training.
        test_dataset: generator passed to ``predict_generator``.
        label_list: true labels compared against the test predictions.
        fam_path: dictionary file forwarded to the ``utils`` reporters.
        num_channels: forwarded to the model constructor.
        num_trains: unused here.
        num_valids: unused here.
        num_tests: unused here.
        args: parsed options (model hyper-parameters, ``log_dir``,
            ``log_name``, ``dataset``, ...).

    Side effects:
        Creates ``{args.log_dir}/{timestamp[_log_name]}`` and writes the model
        (.h5), the model structure image, the confusion matrix, the history
        plot and the classification report into it; appends one line to
        ``{args.log_dir}/history.csv``.
    """
    # load model
    # NOTE(review): args.keep_prob is forwarded as ``dropout_rate`` — confirm
    # which of the two quantities the option actually holds.
    model = rna_model.L5CFam(seq_length=args.seq_length,
                             num_filters=args.num_filters,
                             num_channels=num_channels,
                             filter_sizes=args.filter_sizes,
                             dropout_rate=args.keep_prob,
                             num_classes=args.num_classes,
                             num_hidden=args.num_hidden)
    print(model.summary())

    # model compile
    # The optimizer is passed by name, so args.learning_rate is not applied
    # here; it is only recorded in the history line at the end.
    model.compile(loss=args.loss_function,
                  optimizer=args.optimizer,
                  metrics=['accuracy'])

    # start and record training history
    fit_kwargs = dict(epochs=args.num_epochs,
                      verbose=1,
                      workers=6,
                      use_multiprocessing=True)
    if validation__bool:
        fit_kwargs['validation_data'] = valid_dataset
    train_history = model.fit_generator(train_dataset, **fit_kwargs)

    # =================================logging=================================
    local_time = time.strftime("%m-%d_%H-%M", time.localtime())

    # determine log file name and create the run directory
    if args.log_name is None:
        log_file_name = local_time
    else:
        log_file_name = local_time + '_' + args.log_name
    run_dir = f"{args.log_dir}/{log_file_name}"
    # exist_ok mirrors `mkdir -p`: a second run within the same minute must
    # not crash after training has already finished.
    os.makedirs(run_dir, exist_ok=True)

    # save model to .h5 file
    model.save(f"{run_dir}/{log_file_name}.h5")

    # save the image of model structure
    plot_model(model,
               to_file=f"{run_dir}/model_structure.png",
               show_shapes=True)

    # predict on the test set and take the arg-max class of every row
    prediction = model.predict_generator(test_dataset,
                                         workers=6,
                                         use_multiprocessing=True)
    prediction_1d = np.array([np.argmax(row) for row in prediction])

    # save confusion matrix into .csv file
    utils.cm2csv(true_labels=label_list,
                 predicted_labels=prediction_1d,
                 dict_file=fam_path,
                 save_dir=run_dir)
    accuracy = accuracy_score(label_list, prediction_1d)
    print('Accuracy:', accuracy)

    # draw and save history plot
    utils.plot_history(train_history, run_dir)

    # generate the confusion matrix plot (only for 20 classes or fewer)
    if args.num_classes <= 20:
        utils.plot_cm(true_labels=label_list,
                      predicted_labels=prediction_1d,
                      dict_file=fam_path,
                      title='Confusion Matrix',
                      save_dir=run_dir)

    # save the classification report
    utils.classification_report(true_labels=label_list,
                                predicted_labels=prediction_1d,
                                dict_file=fam_path,
                                save_dir=run_dir,
                                std_out=True)

    # append this run to the shared history .csv file
    with open(f"{args.log_dir}/history.csv", 'a') as history_file:
        print(
            f'{local_time},{log_file_name},{args.dataset},{accuracy},{str(args.filter_sizes).replace(","," ")},{str(args.num_filters).replace(",","")},{args.batch_size},{args.num_epochs},{args.keep_prob},{str(args.num_hidden).replace(",","")},{args.learning_rate},{args.loss_function},{args.optimizer}, ',
            file=history_file)