def performance(x_train, y_train, x_test, y_test, algorithm,
                n_neighbors=None, n_estimators=None, max_features=None,
                kernel=None, C=None, gamma=None, degree=None, coef0=None):
    # fit the model
    if algorithm == 'k-nn':
        model = KNeighborsClassifier(n_neighbors=int(n_neighbors))
        model.fit(x_train, y_train)
    elif algorithm == 'SVM':
        model = train_svm(x_train, y_train, kernel, C, gamma, degree, coef0)
    elif algorithm == 'naive-bayes':
        model = GaussianNB()
        model.fit(x_train, y_train)
    elif algorithm == 'random-forest':
        model = RandomForestClassifier(n_estimators=int(n_estimators),
                                       max_features=int(max_features))
        model.fit(x_train, y_train)
    else:
        # ArgumentError is not a builtin; raise ValueError instead
        raise ValueError('Unknown algorithm: %s' % algorithm)

    # predict the test set: SVMs expose decision values, the others class probabilities
    if algorithm == 'SVM':
        predictions = model.decision_function(x_test)
    else:
        predictions = model.predict_proba(x_test)[:, 1]

    return optunity.metrics.roc_auc(y_test, predictions, positive=True)
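# --- Hedged usage sketch (not part of the original snippet) -----------------
# performance() assumes KNeighborsClassifier, GaussianNB, RandomForestClassifier,
# a train_svm() helper and optunity are already imported. The call below only
# exercises the k-NN and random-forest branches on synthetic data; the dataset,
# split and hyperparameters are illustrative assumptions, not source values.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

auc_knn = performance(x_train, y_train, x_test, y_test,
                      algorithm='k-nn', n_neighbors=5)
auc_rf = performance(x_train, y_train, x_test, y_test,
                     algorithm='random-forest', n_estimators=30, max_features=3)
print('k-NN AUC: %.3f, random-forest AUC: %.3f' % (auc_knn, auc_rf))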
def KNN(file1, file2):
    feature1, label1 = file2matrix(file1)
    # warn_on_equidistant was removed from scikit-learn, so only valid arguments are passed
    neigh = KNeighborsClassifier(n_neighbors=1, weights="distance")
    neigh.fit(feature1, label1)
    feature2, label2 = file2matrix(file2)
    y_true = label2
    # KNeighborsClassifier has no decision_function; use the positive-class probability as score
    y_score = neigh.predict_proba(feature2)[:, 1]
    y_pred = neigh.predict(feature2)
    return y_true, y_score, y_pred
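# --- Hedged follow-up (file names are placeholders) --------------------------
# file2matrix() is assumed to return a feature matrix and binary labels; with
# the arrays returned by KNN(), summary metrics follow directly from scikit-learn.
from sklearn.metrics import roc_auc_score, accuracy_score

y_true, y_score, y_pred = KNN('train.txt', 'test.txt')
print('accuracy:', accuracy_score(y_true, y_pred))
print('ROC AUC :', roc_auc_score(y_true, y_score))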
def lat_log(data):
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    fire_class = ['B', 'C', 'D', 'E', 'F', 'G']
    data = ex.classify_tag(data)
    data = data.replace(fire_class, [1, 2, 3, 4, 5, 6])
    x = data[['latitude', 'longitude']].to_numpy()
    y = data['natural'].to_numpy()
    model = KNeighborsClassifier(3)

    # build a mesh over the latitude/longitude plane
    h = 0.02
    ax = plt.subplot()
    x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
    y_min, y_max = x[:, 1].min(), x[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)

    # k-NN has no decision_function, so this falls back to class probabilities
    if hasattr(model, "decision_function"):
        Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
    else:
        Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)

    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
    # Plot the training points
    ax.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
            size=15, horizontalalignment='right')
    plt.show()
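# --- Hypothetical call (not from the source) ---------------------------------
# lat_log() expects a pandas DataFrame with 'latitude', 'longitude', a binary
# 'natural' column and fire-class tags handled by ex.classify_tag(); the CSV
# name below is an assumption for illustration.
import pandas as pd

fires = pd.read_csv('fires.csv')
lat_log(fires)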
# model = SVC(kernel="linear", C=0.025)
# model = SVC(gamma=2, C=1)
model = KNeighborsClassifier(10)
# model = GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
# model = DecisionTreeClassifier(max_depth=5)
# model = RandomForestClassifier(max_depth=5, n_estimators=100, max_features=1)
# model = MLPClassifier(alpha=1)
# model = AdaBoostClassifier()
# model = GaussianNB()
# model = QuadraticDiscriminantAnalysis()

model.fit(X_train, Y_train)

# calculate predictions: decision values if available, otherwise positive-class probabilities
if hasattr(model, "decision_function"):
    Y_train_predicted = model.decision_function(X_train)
    Y_test_predicted = model.decision_function(X_test)
else:
    Y_train_predicted = model.predict_proba(X_train)[:, 1]
    Y_test_predicted = model.predict_proba(X_test)[:, 1]

fpr_train, tpr_train, _ = roc_curve(Y_train, Y_train_predicted)
roc_auc_train = auc(fpr_train, tpr_train)
fpr_test, tpr_test, _ = roc_curve(Y_test, Y_test_predicted)
roc_auc_test = auc(fpr_test, tpr_test)

# baseline: always predict the positive ("natural") class
natural_precision = precision_score(Y_test, [1] * len(Y_test))
natural_recall = recall_score(Y_test, [1] * len(Y_test))

threshold = numpy.linspace(0.01, 0.99, num=100)
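# --- Hypothetical continuation (an assumption, not the original code) --------
# The fragment above ends with a threshold grid but not the loop that uses it;
# one plausible continuation sweeps the grid and records precision/recall of
# the thresholded scores (requires the predict_proba branch above so that
# scores lie in [0, 1], and scikit-learn >= 0.22 for zero_division).
precisions, recalls = [], []
for t in threshold:
    Y_test_at_t = (Y_test_predicted >= t).astype(int)
    precisions.append(precision_score(Y_test, Y_test_at_t, zero_division=0))
    recalls.append(recall_score(Y_test, Y_test_at_t))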
class Classifier(): """ We assume that if no relevant_labels are supplied, single and double dot data have been extracted into the same file and labelled with labels specified in DOT_LABLE_MAPPING. """ def __init__( self, data_filenames: List[str], category: str, data_types: Optional[List[str]] = None, test_size: float = 0.2, classifier: Optional[str] = None, hyper_parameters: Dict[str, Union[str, float, int]] = {}, multi_class: bool = False, retained_variance: float = 0.99, name: Optional[str] = "", file_fractions: Optional[List[float]] = None, clf_params: Optional[Dict[str, Union[str, float, int]]] = None, feature_indexes: Optional[List[int]] = None, relevant_labels: Optional[List[int]] = None, ) -> None: if category not in ALLOWED_CATEGORIES: logger.error("Classifier category must be one " + "of {}".format(ALLOWED_CATEGORIES)) raise ValueError self.category = category if data_types is None: data_types = ["signal", "frequencies"] if "features" in data_types: if feature_indexes is None: self.feature_indexes = RELEVANT_FEATURE_INDEXES[category] else: self.feature_indexes = feature_indexes if relevant_labels is None: if category in DOT_LABLE_MAPPING.keys(): # logger.warning('Classifier: Assuming both data of dot' + # ' regimes are saved in ' + # '{}'.format(data_filenames)) relevant_labels = DOT_LABLE_MAPPING[category] else: relevant_labels = [0, 1] self.relevant_labels = sorted(relevant_labels) if file_fractions is None: file_fractions = [1.0] * len(data_filenames) self.file_fractions = file_fractions # Add default values to parameter dict in case some have not been # specified if classifier is None: default_params = {} else: default_params = DEFAULT_CLF_PARAMETERS[classifier] default_params.update(hyper_parameters) self.clf_params = default_params self.file_paths, name_addon = self._list_paths(data_filenames) self.name = name_addon + category + "_" if classifier is not None: self.name += classifier self.clf_type = classifier self.retained_variance = retained_variance self.data_types = data_types self.test_size = test_size self.multi_class = multi_class if classifier == "SVC": self.clf = svm.SVC(**self.clf_params) elif classifier == "LogisticRegression": self.clf = LogisticRegression(**self.clf_params) elif classifier == "LinearSVC": self.clf = svm.LinearSVC(**self.clf_params) elif classifier == "MLPClassifier": self.clf = MLPClassifier(**self.clf_params) elif classifier == "GaussianProcessClassifier": self.clf = GaussianProcessClassifier(**self.clf_params) elif classifier == "DecisionTreeClassifier": self.clf = DecisionTreeClassifier(**self.clf_params) elif classifier == "RandomForestClassifier": self.clf = RandomForestClassifier(**self.clf_params) elif classifier == "AdaBoostClassifier": self.clf = AdaBoostClassifier(**self.clf_params) elif classifier == "GaussianNB": self.clf = GaussianNB(**self.clf_params) elif classifier == "QuadraticDiscriminantAnalysis": self.clf = QuadraticDiscriminantAnalysis(**self.clf_params) elif classifier == "KNeighborsClassifier": self.clf = KNeighborsClassifier(**self.clf_params) else: self.clf = None (self.original_data, self.labels) = self.load_data(self.file_paths, self.data_types, file_fractions=self.file_fractions) def load_data( self, file_paths: List[str], data_types: List[str], file_fractions: Optional[List[float]] = [1.0], ) -> Tuple[np.ndarray, np.ndarray]: """ Load data from file and separate data from labels """ DATA_TYPE_MAPPING = dict(nt.config["core"]["data_types"]) len_2d = np.prod(nt.config["core"]["standard_shapes"]["2"]) + 1 all_the_stuff 
= np.empty([len(DATA_TYPE_MAPPING), 0, len_2d]) try: for ip in range(len(file_paths)): print(file_paths[ip]) sub_data = np.array(np.load(file_paths[ip], allow_pickle=True), dtype=np.float64) frac = file_fractions[ip] # type: ignore n_samples = int(round(sub_data.shape[1] * frac)) print("n_samples: {}".format(n_samples)) select = np.random.choice(sub_data.shape[1], n_samples, replace=False) sub_data = sub_data[:, select, :] all_the_stuff = np.concatenate([all_the_stuff, sub_data], axis=1) print("shape all_the_stuff: {}".format(all_the_stuff.shape)) except ValueError as v: len_1d = np.prod(nt.config["core"]["standard_shapes"]["1"]) + 1 all_the_stuff = np.empty([len(DATA_TYPE_MAPPING), 0, len_1d]) for ip in range(len(file_paths)): sub_data = np.array(np.load(file_paths[ip], allow_pickle=True), dtype=np.float64) frac = file_fractions[ip] # type: ignore n_samples = int(round(sub_data.shape[1] * frac)) select = np.random.choice(sub_data.shape[1], n_samples, replace=False) sub_data = sub_data[:, select, :] all_the_stuff = np.concatenate([all_the_stuff, sub_data], axis=1) labels = all_the_stuff[0, :, -1] data = all_the_stuff[:, :, :-1] relevant_data = np.empty((data.shape[1], 0)) for data_type in data_types: to_append = data[DATA_TYPE_MAPPING[data_type]] if data_type == "features": to_append[to_append == nt.config["core"] ["fill_value"]] = np.nan to_append = to_append[:, np.isfinite(to_append).any(axis=0)] try: to_append = to_append[:, self.feature_indexes] except IndexError as ie: logger.warning("Some data in {} ".format(file_paths) + "does not have the" + "feature requested. Make sure all data " + "has been fitted with appropriate" + " fit classes.") relevant_data = np.append(relevant_data, to_append, axis=1) # remove NaNs in labels mask = ~np.isnan(labels) relevant_data = relevant_data[mask, :] labels = labels[mask] # remove NaNs in data mask = np.isfinite(relevant_data).any(axis=-1) relevant_data = relevant_data[mask] labels = labels[mask].astype(int) relevant_data, labels = self.separate_data(relevant_data, labels) return relevant_data, labels def separate_data( self, data: np.ndarray, labels: np.ndarray, ) -> Tuple[np.ndarray, np.ndarray]: """ Extract sub data depending on which labels we would like to predict """ shape = data.shape relevant_data = np.empty((0, shape[1])) relevant_labels = np.empty([0]) for il, label in enumerate(self.relevant_labels): relevant_indx = np.where(labels == label)[0] relevant_data = np.concatenate( [relevant_data, data[relevant_indx, :]], axis=0) relevant_labels = np.concatenate( [relevant_labels, np.ones(len(relevant_indx)) * il], axis=0) return relevant_data, relevant_labels def train( self, data: Optional[np.ndarray] = None, labels: Optional[np.ndarray] = None, ) -> None: """""" if data is None: data = self.original_data labels = self.labels (data_to_use, labels_to_use) = self.select_equal_populations(data, labels) X_train, _ = self.prep_data(train_data=data_to_use) self.clf.fit(X_train, labels_to_use) def prep_data( self, train_data: Optional[np.ndarray] = None, test_data: Optional[np.ndarray] = None, perform_pca: Optional[bool] = False, scale_pc: Optional[bool] = False, ) -> Tuple[np.ndarray, np.ndarray]: """ scale and extract principle components """ (train_data, test_data) = self.scale_raw_data(train_data=train_data, test_data=test_data) if perform_pca: (train_data, test_data) = self.get_principle_components(train_data=train_data, test_data=test_data) if scale_pc: (train_data, test_data) = self.scale_compressed_data(train_data=train_data, 
test_data=test_data) return train_data, test_data def select_equal_populations( self, data: np.ndarray, labels: np.ndarray, ) -> Tuple[np.ndarray, np.ndarray]: """ Make sure we have 50% of one and 50% of other population """ # self.data_to_use = copy.deepcopy(self.original_data) populations_labels, population_counts = np.unique(labels, return_counts=True) n_each = int(np.min(population_counts)) new_data = np.empty([n_each * len(populations_labels), data.shape[-1]]) new_labels = np.empty(n_each * len(populations_labels), int) for ii, label in enumerate(populations_labels): idx = np.where(labels == int(label)) idx = np.random.choice(idx[0], n_each, replace=False) idx = idx.astype(int) dat = data[idx] new_data[ii * n_each:(ii + 1) * n_each] = dat label_array = np.ones(n_each, dtype=int) * int(label) new_labels[ii * n_each:(ii + 1) * n_each] = label_array p = np.random.permutation(len(new_labels)) return new_data[p], new_labels[p] def split_data( self, data: np.ndarray, labels: np.ndarray, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: (train_data, test_data, train_labels, test_labels) = train_test_split(data, labels, test_size=self.test_size, random_state=0) return train_data, test_data, train_labels, test_labels def scale_raw_data( self, train_data: Optional[np.ndarray] = None, test_data: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, np.ndarray]: """""" if train_data is not None: self.raw_scaler = StandardScaler() self.raw_scaler.fit(train_data) train_data = self.raw_scaler.transform(train_data) if test_data is not None: if hasattr(self, "raw_scaler"): test_data = self.raw_scaler.transform(test_data) else: logger.error("Scale train data before scaling test data.") raise AttributeError return train_data, test_data def scale_compressed_data( self, train_data: Optional[np.ndarray] = None, test_data: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, np.ndarray]: """""" if train_data is not None: self.compressed_scaler = StandardScaler() self.compressed_scaler.fit(train_data) train_data = self.compressed_scaler.transform(train_data) if test_data is not None: if hasattr(self, "compressed_scaler"): test_data = self.compressed_scaler.transform(test_data) else: logger.error("Train data principle components has not been" + " have to be scaled before scaling test PC.") raise AttributeError return train_data, test_data def get_principle_components( self, train_data: Optional[np.ndarray] = None, test_data: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, np.ndarray]: """""" if train_data is not None: self.pca = PCA(self.retained_variance) self.pca.fit(train_data) train_data = self.pca.transform(train_data) if test_data is not None: if hasattr(self, "pca"): test_data = self.pca.transform(test_data) else: logger.error("Compress train data before compressing test" + " data.") raise AttributeError return train_data, test_data def score(self, test_data: np.ndarray, test_labels: np.ndarray) -> float: """""" self.clf_score = self.clf.score(test_data, test_labels) return self.clf_score def predict( self, dataid: int, db_name: str, db_folder: Optional[str] = None, ) -> np.ndarray: """""" if db_folder is None: db_folder = nt.config["db_folder"] DATA_TYPE_MAPPING = dict(nt.config["core"]["data_types"]) df = Dataset(dataid, db_name) condensed_data_all = prep_data(df, self.category) predictions = [] for condensed_data in condensed_data_all: relevant_data = np.empty((1, 0)) for data_type in self.data_types: to_append = condensed_data[DATA_TYPE_MAPPING[data_type]] if data_type == "features": 
to_append[to_append == nt.config["core"] ["fill_value"]] = np.nan to_append = to_append[:, np.isfinite(to_append).any(axis=0)] try: to_append = to_append[:, self.feature_indexes] except IndexError as ie: logger.warning( "Some data in {} ".format(dataid) + "does not have the " + "feature requested. Make sure all data " + "has been " + "fitted with appropriate fit " + "classes.") relevant_data = np.append(relevant_data, to_append, axis=1) _, relevant_data = self.prep_data(test_data=relevant_data) predictions.append(self.clf.predict(relevant_data)) return predictions def compute_ROC( self, data_types: Optional[List[str]] = None, n_splits: int = 10, n_population_subselect: int = 10, save_to_file: bool = False, path: Optional[str] = None, ) -> Dict[str, Any]: """ Compute ROC for multiple train and test datasets """ if path is None: path = os.path.join(nt.config["db_folder"], "ROC") if data_types is None: data_types = self.data_types cv = StratifiedKFold(n_splits=n_splits) result: Dict[str, Any] = {} # save additional info result["metadata"] = {} result["metadata"]["n_splits"] = n_splits result["metadata"]["n_population_subselect"] = n_population_subselect result["metadata"]["file_paths"] = self.file_paths result["metadata"]["clf_params"] = self.clf.get_params() for data_type in data_types: result[data_type] = {} # this line below could be made more efficient and not load # from file every time (self.original_data, self.labels) = self.load_data(self.file_paths, [data_type], file_fractions=self.file_fractions) tprs: List[List[float]] = [] aucs: List[float] = [] for redraw_iter in range(n_population_subselect): # tprs = [] # aucs = [] mean_fpr = np.linspace(0, 1, 100) X, y = self.select_equal_populations(self.original_data, self.labels) for train, test in cv.split(X, y): # scale data as we would in real life X_train = X[train] y_train = y[train] X_test = X[test] y_test = y[test] X_train, X_test = self.prep_data(train_data=X_train, test_data=X_test) # probas = self.clf.fit(X[train], y[train]).predict_proba(X[test]) probas = self.clf.fit(X_train, y_train).predict_proba(X_test) # Compute ROC curve and area the curve fpr, tpr, thresholds = roc_curve(y_test, probas[:, 1]) if np.all(np.diff(fpr) >= 0): # interpolate curve to np.linspace(0, 1, 100) tprs.append(interp(mean_fpr, fpr, tpr)) tprs[-1][0] = 0.0 # compute area under the roc curve: roc_auc = auc(fpr, tpr) aucs.append(roc_auc) else: logger.warning("Averaging over less than the " + " desired number of train-test splits.") mean_tpr = np.mean(tprs, axis=0) mean_tpr[-1] = 1.0 mean_auc = auc(mean_fpr, mean_tpr) std_auc = np.std(aucs) std_tpr = np.std(tprs, axis=0) result[data_type]["mean_fpr"] = mean_fpr result[data_type]["mean_tpr"] = mean_tpr result[data_type]["std_tpr"] = std_tpr result[data_type]["mean_auc"] = mean_auc result[data_type]["std_auc"] = std_auc if save_to_file: name = "ROC_" + self.name if not os.path.exists(path): os.makedirs(path) path = os.path.join(path, name) np.save(path, result) return result def compute_ROC_different_source( self, train_files: List[str], test_files: List[str], data_types: Optional[List[str]] = None, save_to_file: Optional[bool] = False, path: Optional[str] = None, ) -> Dict[str, Any]: """ Load different data for training """ if path is None: path = os.path.join(nt.config["db_folder"], "ROC") if data_types is None: logger.warning("No data types specified. 
No ROC will be computed.") train_files_full, _ = self._list_paths(train_files) test_files_full, _ = self._list_paths(test_files) result: Dict[str, Any] = {} for data_type in data_types: # type: ignore result[data_type] = {} ff = self.file_fractions train_data, train_labels = self.load_data(train_files_full, [data_type], file_fractions=ff) self.train(train_data, train_labels) test_data, test_labels = self.load_data(test_files_full, [data_type], file_fractions=ff) (test_data, test_labels) = self.select_equal_populations( test_data, test_labels) _, test_data = self.prep_data(test_data=test_data) probas = self.clf.predict_proba(test_data) # Compute ROC curve and area the curve fpr, tpr, thresholds = roc_curve(test_labels, probas[:, 1]) mean_fpr = np.linspace(0, 1, 100) if np.all(np.diff(fpr) >= 0): roc_auc = auc(fpr, tpr) tpr = interp(mean_fpr, fpr, tpr) tpr[0] = 0.0 else: logger.error("Unable to compute ROC with different test " + "and train files.") result[data_type]["mean_fpr"] = mean_fpr result[data_type]["mean_tpr"] = tpr result[data_type]["std_tpr"] = 0 result[data_type]["mean_auc"] = roc_auc result[data_type]["std_auc"] = 0 result["metadata"] = {} result["metadata"]["n_splits"] = 1 result["metadata"]["n_population_subselect"] = 1 result["metadata"]["train_files"] = train_files_full result["metadata"]["test_files"] = test_files_full result["metadata"]["file_fractions"] = self.file_fractions result["metadata"]["clf_params"] = self.clf.get_params() if save_to_file: self.name = "ROC" + self.name + "_train" for train_file in train_files: self.name += os.path.splitext(train_file)[0] self.name += "_test" for test_file in test_files: self.name += os.path.splitext(test_file)[0] if not os.path.exists(path): os.makedirs(path) path = os.path.join(path, self.name) np.save(path, result) return result def plot_ROC( self, roc_result: Optional[Dict[str, Any]] = None, save_to_file: bool = False, path: Optional[str] = None, ) -> str: """ Plot ROC and return path where the figure was saved """ if path is None: path = os.path.join(nt.config["db_folder"], "ROC") if roc_result is None: logger.warning("No ROC result dict given. Computing one with " + "default values.") roc_result = self.compute_ROC(save_to_file=True) _ = plt.figure() for d_type, res in roc_result.items(): if d_type != "metadata": mean_fpr = res["mean_fpr"] mean_tpr = res["mean_tpr"] std_tpr = res["std_tpr"] mean_auc = res["mean_auc"] std_auc = res["std_auc"] label = r"Mean ROC (AUC = %0.2f $\pm$ %0.2f) " % (mean_auc, std_auc) label += d_type plt.plot(mean_fpr, mean_tpr, label=label, lw=2, alpha=0.8) tprs_upper = np.minimum(mean_tpr + std_tpr, 1) tprs_lower = np.maximum(mean_tpr - std_tpr, 0) plt.fill_between( mean_fpr, tprs_lower, tprs_upper, color="grey", alpha=0.2, ) # plot the last std twice to have a legend entry (and only one) plt.fill_between( mean_fpr, tprs_lower, tprs_upper, color="grey", alpha=0.2, label=r"$\pm$ 1 std. 
dev.", ) plt.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel("False Positive Rate") plt.ylabel("True Positive Rate") plt.title("Receiver Operating Characteristic " + self.name) plt.legend(loc="lower right", bbox_to_anchor=(1, 0)) filename = "ROC_" + self.name path1 = os.path.join(path, filename + ".eps") plt.savefig(path1, format="eps", dpi=600) path2 = os.path.join(path, filename + ".png") plt.savefig(path2, format="png", dpi=600) return path2 def compute_metrics( self, n_iter: Optional[int] = None, n_test: int = 100, save_to_file: bool = True, filename: str = "", supp_train_data: Optional[List[str]] = None, n_supp_train: Optional[int] = None, perform_pca: bool = False, scale_pc: bool = False, ) -> Tuple[Dict[str, Dict[str, Any]], np.ndarray]: """""" if n_iter is None: n_iter = DEFAULT_N_ITER[self.category] metrics = np.empty([len(METRIC_NAMES), n_iter]) conf_matrix = [] start_time = time.time() train_times = [] test_times = [] if supp_train_data is not None: supp_train_data, name_addon = self._list_paths(supp_train_data) f_frac = [1.0] * len(supp_train_data) (train_data_addon, train_labels_addon) = self.load_data(supp_train_data, self.data_types, file_fractions=f_frac) if n_supp_train is None: n_supp_train = train_data_addon.shape[0] mask = [1] * n_supp_train mask = mask + [0] * (train_data_addon.shape[0] - n_supp_train) mask = np.array(mask, dtype=bool) np.random.shuffle(mask) train_data_addon = train_data_addon[mask] train_labels_addon = train_labels_addon[mask] else: train_data_addon = None train_labels_addon = None for curr_iter in range(n_iter): start_time_inner = time.time() (data_to_use, labels_to_use) = self.select_equal_populations( self.original_data, self.labels) (train_data, test_data, train_labels, test_labels) = self.split_data(data_to_use, labels_to_use) if train_data_addon is not None: # print(train_data_addon.shape) # print(train_data.shape) # print(train_labels_addon.shape) # print(train_labels.shape) train_data = np.concatenate([train_data, train_data_addon], axis=0) train_labels = np.concatenate( [train_labels, train_labels_addon], axis=0) X_train, X_test = self.prep_data( train_data=train_data, test_data=test_data, perform_pca=perform_pca, scale_pc=scale_pc, ) probas = self.clf.fit(X_train, train_labels).predict_proba(X_test) train_times.append(time.time() - start_time_inner) fpr, tpr, thresholds = roc_curve(test_labels, probas[:, 1]) start_time_inner = time.time() for itt in range(n_test): pred_labels = self.clf.predict(X_test) test_times.append((time.time() - start_time_inner) / n_test) m_in = METRIC_NAMES.index("accuracy_score") metrics[m_in, curr_iter] = accuracy_score(test_labels, pred_labels) m_in = METRIC_NAMES.index("brier_score_loss") metrics[m_in, curr_iter] = brier_score_loss(test_labels, pred_labels) m_in = METRIC_NAMES.index("auc") metrics[m_in, curr_iter] = auc(fpr, tpr) if hasattr(self.clf, "decision_function"): y_score = self.clf.decision_function(X_test) else: y_score = self.clf.predict_proba(X_test)[:, 1] m_in = METRIC_NAMES.index("average_precision_recall") metrics[m_in, curr_iter] = average_precision_score(test_labels, y_score) conf_matrix.append(confusion_matrix(test_labels, pred_labels)) elapsed_time = (time.time() - start_time) / n_iter conf_matrix = np.array(conf_matrix) info_dict: Dict[str, Any] = { "n_iter": n_iter, "classifier": self.clf_type, "category": self.category, "data_files": self.file_paths, "data_types": self.data_types, "hyper_parameters": 
self.clf.get_params(), "metric_names": METRIC_NAMES, "elapsed_time [s/iter]": elapsed_time, "n_test": test_data.shape[0], "n_train": train_data.shape[0], "mean_train_time": np.mean(train_times), "std_train_time": np.std(train_times), "mean_test_time": np.mean(test_times), "std_test_time": np.std(test_times), "perform_pca": perform_pca, "scale_pc": scale_pc, "metadata": {}, "supp_train_data": supp_train_data, } for im, metric_name in enumerate(METRIC_NAMES): info_dict[metric_name] = { "std": np.std(metrics[im]), "mean": np.mean(metrics[im]), } info_dict["confusion_matrix"] = { "std": np.std(conf_matrix, axis=0).tolist(), "mean": np.mean(conf_matrix, axis=0).tolist(), } if save_to_file: if not filename: filename = self.name + "_" if supp_train_data is not None: filename = filename + name_addon + "_" filename += "_".join(self.data_types) if perform_pca: filename += "_PCA" if scale_pc: filename += "_scaled" filename += ".json" path = os.path.join(nt.config["db_folder"], "classifier_metrics") if not os.path.exists(path): os.makedirs(path) path = os.path.join(path, filename) with open(path, "w") as f: json.dump(info_dict, f) return info_dict, metrics def display_metrics( self, info_dict: Dict[str, Dict[str, float]], all_of_it: Optional[bool] = False, ) -> None: """""" inf_t = PrettyTable(["parameter", "value"]) for key in info_dict.keys(): if key not in METRIC_NAMES and key != "metric_names": inf_t.add_row([key, info_dict[key]]) t = PrettyTable(["metric", "mean", "std"]) for mn in METRIC_NAMES: t.add_row([ mn, "{0:.3f}".format(info_dict[mn]["mean"]), "{0:.3f}".format(info_dict[mn]["std"]), ]) t.add_row([ mn, np.array(info_dict["confusion_matrix"]["mean"]), np.array(info_dict["confusion_matrix"]["std"]), ]) if all_of_it: print(inf_t) print(t) def determine_number_of_redraws( self, n_max_iter: int = 200, perform_pca: bool = False, scale_pc: bool = False, save_to_file: bool = True, data_folder: Optional[str] = None, figure_folder: Optional[str] = None, filename: Optional[str] = None, ) -> Dict[str, Dict[str, Any]]: """ Skipping confusion matrix """ # TODO: change method name to better reflect what it actually does. 
means = np.empty((len(METRIC_NAMES), n_max_iter)) stds = np.empty((len(METRIC_NAMES), n_max_iter)) info_dict, metrics = self.compute_metrics( n_iter=n_max_iter, n_test=1, save_to_file=False, perform_pca=perform_pca, scale_pc=scale_pc, ) for n_eval in range(n_max_iter): for m_id, metric in enumerate(METRIC_NAMES): means[m_id, n_eval] = np.mean(metrics[m_id, 0:n_eval + 1]) stds[m_id, n_eval] = np.std(metrics[m_id, 0:n_eval + 1]) info_dict["mean_metric_variations"] = means.tolist() info_dict["std_metric_variations"] = stds.tolist() info_dict["metadata"]["metric_names"] = "Skip confusion matrix" if save_to_file: if filename is None: filename = "metric_fluctuations_" + self.name filename = filename + "_" + "_".join(self.data_types) if perform_pca: filename += "_PCA" if scale_pc: filename += "_scaled" if data_folder is None: data_folder = os.path.join(nt.config["db_folder"], "classifier_stats") if not os.path.exists(data_folder): os.makedirs(data_folder) data_file = os.path.join(data_folder, filename + ".json") with open(data_file, "w") as f: json.dump(info_dict, f) return info_dict # def tune_hyper_parameters(self, # ) -> Dict: # """ # """ # return best_params def _list_paths(self, filenames: List[str]) -> Tuple[List[str], str]: """ add path to file names """ file_paths = [] name = "" for filename in filenames: p = os.path.join(nt.config["db_folder"], filename) file_paths.append(p) name = name + os.path.splitext(filename)[0] + "_" return file_paths, name
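# --- Hypothetical usage sketch for the Classifier class above ----------------
# The class pulls ALLOWED_CATEGORIES, DEFAULT_CLF_PARAMETERS and data paths
# from the surrounding package configuration (nt.config), so it cannot run
# standalone. Category, file name and hyperparameters below are placeholders
# chosen for illustration only.
clf = Classifier(
    data_filenames=["example_data.npy"],   # placeholder file in nt.config["db_folder"]
    category="pinchoff",                   # must be one of ALLOWED_CATEGORIES
    classifier="SVC",
    data_types=["signal"],
    hyper_parameters={"kernel": "rbf", "probability": True},
)
clf.train()                                # fit on an equal-population subsample
info, metrics = clf.compute_metrics(n_iter=5, save_to_file=False)
clf.display_metrics(info, all_of_it=True)
roc_result = clf.compute_ROC(n_splits=5, n_population_subselect=2)
clf.plot_ROC(roc_result)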
class classification:
    """
    Contains SGD and SVC classification classifiers.
    Returns prediction values and prediction scores.
    """

    def __init__(self, X=0, labels=0, name='sgd', rand=42):
        self.X = X
        self.labels = labels
        self.name = name
        self.model = []
        self.rand = rand

    def init(self, val=0, degree_val=2):
        # Pre-processing data
        if val == 1:
            from sklearn.preprocessing import StandardScaler
            return StandardScaler()
        elif val == 2:
            from sklearn.preprocessing import PolynomialFeatures
            return PolynomialFeatures(degree=degree_val, include_bias=False)

    def init_fit(self, c_val=1, depth=2):
        if self.name == 'sgd':
            from sklearn.linear_model import SGDClassifier
            self.model = SGDClassifier(random_state=self.rand)
        elif self.name == 'decis_tree':
            from sklearn.tree import DecisionTreeClassifier
            print('Depth == ', depth)
            self.model = DecisionTreeClassifier(max_depth=depth)
        elif self.name == 'rand_forest':
            from sklearn.ensemble import RandomForestClassifier
            print('Lots of parameters already set!')
            self.model = RandomForestClassifier(n_estimators=500,
                                                max_leaf_nodes=16, n_jobs=-1)
        elif self.name == 'svm':
            from sklearn.svm import SVC
            self.model = SVC()
        elif self.name == 'poly_kernel':
            from sklearn.svm import SVC
            self.model = SVC(kernel="poly", degree=3, coef0=1, C=5)
        elif self.name == 'gauss_rbf_kernel':
            from sklearn.svm import SVC
            self.model = SVC(kernel="rbf", gamma=5, C=10)
        elif self.name == 'linear_svm':
            from sklearn.svm import LinearSVC
            self.model = LinearSVC(C=c_val, loss="hinge")
        elif self.name == 'kneighbors':
            from sklearn.neighbors import KNeighborsClassifier
            self.model = KNeighborsClassifier()
        else:
            print('No correct model given.')
            print('Try again with sgd, decis_tree, rand_forest, svm, '
                  'poly_kernel, gauss_rbf_kernel, linear_svm or kneighbors')
            return ()
        return self.model

    def predictor(self, predict_val):
        return self.model.predict(predict_val)

    def predict_percent(self, predict_val):
        # previously computed but never returned
        return self.model.predict_proba(predict_val)

    def predict_scores(self, predict_val):
        return self.model.decision_function(predict_val)

    def cross_valid(self, num=5):
        from sklearn.model_selection import cross_val_score
        return cross_val_score(self.model, self.X, self.labels, cv=num,
                               scoring="accuracy")

    def confus_mat(self, num=5):
        from sklearn.model_selection import cross_val_predict
        from sklearn.metrics import confusion_matrix
        y_pred = cross_val_predict(self.model, self.X, self.labels, cv=num)
        return confusion_matrix(self.labels, y_pred)

    def prec_recall(self, num=5):
        from sklearn.model_selection import cross_val_predict
        from sklearn.metrics import precision_score, recall_score
        y_pred = cross_val_predict(self.model, self.X, self.labels, cv=num)
        prec = precision_score(self.labels, y_pred)
        recall = recall_score(self.labels, y_pred)
        return prec, recall

    def accuracy(self, test, x_test, y_test):
        from sklearn.metrics import accuracy_score
        self.model.fit(self.X, self.labels)
        y_pred = self.model.predict(x_test)
        return accuracy_score(y_test, y_pred)
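# --- Hypothetical usage of the wrapper (synthetic data, not from the source) --
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=400, n_features=8, random_state=42)
wrapper = classification(X=X, labels=y, name='sgd')
wrapper.init_fit()
print('cross-val accuracy:', wrapper.cross_valid(num=5))
print('confusion matrix:')
print(wrapper.confus_mat(num=5))
print('precision / recall:', wrapper.prec_recall(num=5))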
clf.fit(X, y)

# s is the size of points
plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.Paired)

# gca stands for 'get current axis'.
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
# Return coordinate matrices from coordinate vectors.
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = clf.decision_function(xy).reshape(XX.shape)

# paint the boundary
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
           linewidth=1, facecolors='none')
plt.show()
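# --- Hedged setup for the plotting fragment above ----------------------------
# The fragment assumes clf is an already constructed support-vector classifier
# and X, y a small 2-D dataset; neither is shown. A minimal synthetic setup
# (dataset and parameters are assumptions) could be:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.svm import SVC

X, y = make_blobs(n_samples=60, centers=2, random_state=6)
clf = SVC(kernel='linear', C=1000)   # large C gives an almost hard-margin fit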
def do_svm(loaded_data, split_n, runParams):
    # loaded_data = load_split(args.input_dir, args.train_file, args.test_file)
    print "Fitting SVM to data - train data %s, test data %s" \
        % (str(loaded_data.train_patches.shape), str(loaded_data.test_patches.shape))

    if runParams.pca_dims:
        PCA_dims = runParams.pca_dims
        print "Will perform PCA to reduce dimensions to %d" % PCA_dims
        start = time.time()
        pca = PCA(n_components=PCA_dims)
        pca.fit(loaded_data.train_patches)
        print "PCA computed, now transforming"
        train_data = pca.transform(loaded_data.train_patches)
        # bug fix: the test patches, not the train patches, must be transformed here
        test_data = pca.transform(loaded_data.test_patches)
        end = time.time()
        print "It took %f seconds to perform PCA" % (end - start)
        print "Fitting SVM to data - train data %s, test data %s" \
            % (str(train_data.shape), str(test_data.shape))
    else:
        train_data = loaded_data.train_patches
        test_data = loaded_data.test_patches

    print "Feature mean %f and std %f" % (train_data.mean(), train_data.std())
    start = time.time()

    if runParams.SelectKBest:
        anova_filter = SelectKBest(f_regression, k=runParams.SelectKBest)
        clf = svm.LinearSVC(dual=False, C=runParams.C)
        anova_svm = make_pipeline(VarianceThreshold(), anova_filter, clf)
        anova_svm.fit(train_data, loaded_data.train_labels)
        res = anova_svm.predict(test_data)
    else:
        dual = train_data.shape[0] < train_data.shape[1]
        if runParams.doKNN is False:
            print "Svm params: C: %f, dual: %s, penalty %s" % (
                runParams.C, str(dual), runParams.penalty)
            clf = svm.LinearSVC(
                dual=dual, C=runParams.C,
                penalty=runParams.penalty)  # C=0.00001 good for JHUIT
        else:
            print "Running KNN"
            clf = KNeighborsClassifier(n_neighbors=3)
        clf.fit(train_data, loaded_data.train_labels)
        res = clf.predict(test_data)

    if runParams.saveMargin:
        if runParams.SelectKBest:
            Margins = anova_svm.decision_function(test_data)
        else:
            Margins = clf.decision_function(test_data)
        filemargins = open(runParams.saveMargin + '_split' + str(split_n), 'w')
        # enable only if loaded_data.test_labels change
        ftest_labels = open('test_labels' + '_split' + str(split_n), 'w')
        pickle.dump(loaded_data.test_labels, ftest_labels)
        pickle.dump(Margins, filemargins)
        filemargins.close()

    # conf_path is expected to be defined at module level
    confusion = confusion_matrix(loaded_data.test_labels, res)
    if conf_path is not None:
        np.savetxt(conf_path + '_' + str(split_n) + '.csv', confusion)

    correct = (res == loaded_data.test_labels).sum()
    end = time.time()
    print "Split " + str(split_n) + " Got " + str((100.0 * correct) / loaded_data.test_labels.size) \
        + "% correct, took " + str(end - start) + " seconds "
    return ((100.0 * correct) / loaded_data.test_labels.size), clf
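# --- Assumed argument interface (illustration only) ---------------------------
# do_svm() only reads attributes from its arguments; the namedtuples below
# sketch the expected fields and are not definitions from the source.
from collections import namedtuple

RunParams = namedtuple('RunParams',
                       ['pca_dims', 'SelectKBest', 'C', 'penalty',
                        'doKNN', 'saveMargin'])
LoadedData = namedtuple('LoadedData',
                        ['train_patches', 'train_labels',
                         'test_patches', 'test_labels'])

params = RunParams(pca_dims=64, SelectKBest=0, C=0.001, penalty='l2',
                   doKNN=False, saveMargin=None)
# accuracy, fitted_clf = do_svm(loaded_data, split_n=0, runParams=params)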
# plt.arrow(x[0], x[1], X[neighbor_index, 0] - x[0], X[neighbor_index, 1] - x[1],
#           head_width=0, fc='k', ec='k')
#
# test_points = discrete_scatter(X_test[:, 0], X_test[:, 1], clf.predict(X_test), markers="*")
# training_points = discrete_scatter(X[:, 0], X[:, 1], y)
# plt.legend(training_points + test_points, ["training class 0", "training class 1",
#                                            "test pred 0", "test pred 1"])
# plt.show()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf.fit(X_train, y_train)
test_predictions = clf.predict(X_test)
print 'Test set predictions:', test_predictions
print 'Test accuracy:', clf.score(X_test, y_test)

fig, axes = plt.subplots(1, 3, figsize=(10, 3))
for n_neighbors, ax in zip([1, 3, 9], axes):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    # note: KNeighborsClassifier has no decision_function, so the stray
    # clf.decision_function() call was removed
    mglearn.plots.plot_2d_separator(clf, X, fill=True, eps=0.5, ax=ax, alpha=.4)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title('{} neighbor(s)'.format(n_neighbors))
    ax.set_xlabel('feature 0')
    ax.set_ylabel('feature 1')
axes[0].legend(loc=3)
plt.show()
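# --- Hedged setup for the k-NN fragment above ---------------------------------
# X, y and clf are undefined in the fragment; the mglearn "forge" toy dataset
# is a plausible fit (an assumption, following mglearn's own examples).
import mglearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = mglearn.datasets.make_forge()
clf = KNeighborsClassifier(n_neighbors=3)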