def fit(self, data, batch_size = 128, max_iterations = 100, save_interval = 10, path = "ann_weights.bin", return_cost = False):
    ''' Train neural network.

    Runs mini-batch gradient training over data.train_x / data.train_y for
    up to max_iterations epochs.  Every save_interval batches the weights
    are checkpointed to `path`; when validation data is present, validation
    AUC is monitored, the best weights are remembered, and training stops
    early once the AUC plateaus (after restoring the best weights).

    :param data: object exposing train_x, train_y and optionally
        validation_x, validation_y (validation_y one-hot encoded).
    :param batch_size: samples per gradient update.
    :param max_iterations: maximum number of epochs.
    :param save_interval: checkpoint every this many batches.
    :param path: file path the weights are saved to.
    :param return_cost: if True, record the last batch cost of each epoch.
    :return: np.array of recorded costs (empty unless return_cost is True).
    '''
    cost_history = []
    # Remember the starting weights as the "best so far".
    self.best_w_h = self.w_h.get_value()
    self.best_w_o = self.w_o.get_value()
    best_auc = 0
    cost = None  # defined even if the batch loop below never runs
    n_samples = len(data.train_x)
    for iteration in range(max_iterations):
        i = 0
        # End-range extended by one batch_size so the trailing (possibly
        # partial) batch is trained too instead of being silently dropped.
        for start, end in zip(range(0, n_samples, batch_size),
                              range(batch_size, n_samples + batch_size, batch_size)):
            cost = self._t_train(data.train_x[start:end], data.train_y[start:end])
            i = i + 1
            if i % save_interval == 0:
                self.save(path)
                if data.validation_y is not None:
                    predicted_labels = self.predict_proba(data.validation_x)[:, 1]
                    auc = compute_auc(np.argmax(data.validation_y, axis = 1), predicted_labels)
                    if auc > best_auc:
                        # New best model: keep a copy of its weights.
                        best_auc = auc
                        self.best_w_h = self.w_h.get_value()
                        self.best_w_o = self.w_o.get_value()
                    elif abs(best_auc - auc) < 0.000005:
                        # AUC plateaued: restore the best weights and stop
                        # early, returning whatever history was collected
                        # (previously this returned None unconditionally).
                        self.w_h.set_value(self.best_w_h)
                        self.w_o.set_value(self.best_w_o)
                        return np.array(cost_history)
        # End of epoch: checkpoint and optionally record the last cost.
        self.save(path)
        print(cost)
        if return_cost:
            cost_history.append(cost)
    print(cost)
    return np.array(cost_history)
# NOTE(review): this chunk reads several names defined outside the visible
# source (c, homesite, mean_fpr, mean_tpr, mean_acc, mean_recall,
# mean_precision, the *_history lists and confusion_matrix_history) --
# presumably it sits inside a loop over the classifier parameter `c`.
# Indentation below is reconstructed from a collapsed line; confirm the two
# save_np_array calls really follow the fold loop rather than run inside it.

# Stratified 5-fold split over the training labels.
cvs = StratifiedKFold(homesite.train_y, n_folds = 5)
clf = RandomForestClassifier(n_estimators = c, max_features = 100, n_jobs = 4)

# Train classifier.
print "\nTraining classifier param %d" % c
for i, (train, test) in enumerate(cvs):
    # Oversample the minority class of this fold's training split only;
    # the held-out fold is scored on the original distribution.
    sm = OverSampler(verbose = False, ratio = 2.5)
    train_oversampled_x, train_oversampled_train_y = sm.fit_transform(homesite.train_x[train], homesite.train_y[train])
    probas_ = clf.fit(train_oversampled_x, train_oversampled_train_y).predict_proba(homesite.train_x[test])
    fpr, tpr, thresholds = roc_curve(homesite.train_y[test], probas_[:, 1])
    # Accumulate an interpolated mean ROC curve across folds.
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = compute_auc(homesite.train_y[test], probas_[:, 1])
    # Confusion matrix at the 0.5 decision threshold (np.round on probabilities).
    fold_cm = confusion_matrix(homesite.train_y[test], np.round(probas_)[:, 1])
    confusion_matrix_history = np.dstack((confusion_matrix_history, fold_cm))
    accuracy, precision, recall = compute_performance_metrics(fold_cm)
    mean_acc += accuracy
    mean_recall += recall
    mean_precision += precision
    accuracy_history.append(accuracy)
    precision_history.append(precision)
    recall_history.append(recall)
    auc_history.append(roc_auc)

# Persist the per-fold metrics for this parameter value.
save_np_array("../../results/random_forests/rf_accuracy_" + str(c) + ".bin", np.array(accuracy_history))
save_np_array("../../results/random_forests/rf_precision_" + str(c) + ".bin", np.array(precision_history))
'''
# NOTE(review): the triple-quote above closes a string opened before this
# chunk -- the statements below are live code, reconstructed from a
# collapsed line.
oversampled_path = "resources/oversampled_normalized_data_ratio_2.bin"
homesite = Data()
# "load_sliptted_data" is a (misspelled) project helper -- presumably loads
# pre-split train/validation/test arrays; verify against Data's API.
homesite.load_sliptted_data(oversampled_path)
del homesite.test_x # Deleted to save memory.
print homesite.train_x.shape

# Creating classifier.
# clf = DecisionTreeClassifier()
clf = RandomForestClassifier(max_features=100)
# clf = AdaBoostClassifier(n_estimators = 10)
# clf = svm.SVC(gamma = 0.00005)
# clf = RandomForestClassifier()
# clf = MultiplePLS(n_classifiers = 10, n_samples = 5000, n_positive_samples = 2500, threshold = 0.9, acc = 0.999)
# clf = svm.LinearSVC()

# Train classifier.
print "Training classifier."
clf.fit(homesite.train_x, homesite.train_y)

# Test classifier.
print 'Testing classifier.'
# Probability of the positive class for each validation sample.
predicted_labels = clf.predict_proba(homesite.validation_x)[:, 1]

# Show final results.
# Confusion matrix at the 0.5 threshold (np.round on probabilities).
results = confusion_matrix(homesite.validation_y, np.round(predicted_labels))
accuracy, precision, recall = compute_performance_metrics(results)
auc = compute_auc(homesite.validation_y, predicted_labels)
if __name__ == '__main__': ''' Train neural network. ''' # oversampled_path = "../../homesite_data/resources/oversampled_normalized_data_ratio_2.5.bin" oversampled_path = "../../homesite_data/resources/oversampled_normalized_data_ratio_2.bin" homesite_data = Data() homesite_data.load_sliptted_data(oversampled_path, one_hot = True) # Train neural network. clf = NeuralNetwork(input_units = 644, hidden_units = 50, output_units = 2, \ lr = 0.00005, lamb = 0.) # clf.fit(homesite_data, batch_size = 128, # max_iterations = 100, save_interval = 10, # path = "../homesite_data/ann_weights.bin") # Test neural network. # clf = NeuralNetwork(path = "../../homesite_data/ann_weights.bin", lr = 0.05, lamb = 0.000005) # Test classifier. print 'Testing classifier.' predicted_labels = clf.predict_proba(homesite_data.validation_x)[:, 1] # Show final results. results = confusion_matrix(np.argmax(homesite_data.validation_y, axis = 1), np.round(predicted_labels)) accuracy, precision, recall = compute_performance_metrics(results) auc = compute_auc(np.argmax(homesite_data.validation_y, axis = 1), predicted_labels)