def train_model(model: ClassifierMixin, data_time_range: List[str], output_path: str):
    es_host = ESConnection(es_host='http://localhost:9200')
    dataset = ml_utils.get_data(start_time=data_time_range[0],
                                end_time=data_time_range[1],
                                es_host=es_host)
    dataset.to_pickle('data/dataset.pkl')
    dataset = pd.read_pickle('data/dataset.pkl')
    print(len(dataset.columns))

    y = dataset['target']
    X = dataset.drop(columns=['target'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=17)

    print('Training model')
    model = model.fit(X_train, y_train)
    print('Finished training')

    prediction = model.predict(X_test)
    print(confusion_matrix(y_test, prediction))

    dump(model, output_path + '/' + type(model).__name__ + '.joblib')
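# Hedged usage sketch for train_model above: the classifier, time range, and output
# directory are illustrative assumptions, and the call presumes a reachable Elasticsearch
# instance plus a local 'data/' directory, since ml_utils.get_data must return a DataFrame
# containing a 'target' column.
from sklearn.ensemble import RandomForestClassifier

train_model(
    model=RandomForestClassifier(n_estimators=100),
    data_time_range=["2022-01-01", "2022-02-01"],
    output_path="models",
)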
def get_preds_probas(est: ClassifierMixin, X_test: DataFrame, y_test: Series,
                     mapper_dict: Dict) -> DataFrame:
    """
    Get prediction probabilities (if available) or return true and predicted labels
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)
        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(0, len(np.unique(y_test)))
        ]
        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)
        # Compare .predict_proba() and manually extracted prediction
        # probability
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all()
    else:
        df_summ = df_preds.copy()
    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
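# Hedged usage sketch for get_preds_probas above: it assumes est is a fitted Pipeline whose
# final step is literally named "clf" (because of est.named_steps["clf"]) and that mapper_dict
# maps predicted labels to the column positions returned by predict_proba. The dataset,
# pipeline, and identity mapping below are illustrative assumptions only.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X_iris, y_iris = load_iris(return_X_y=True, as_frame=True)
X_tr, X_te, y_tr, y_te = train_test_split(X_iris, y_iris, random_state=0)

est = Pipeline([("scale", StandardScaler()),
                ("clf", LogisticRegression(max_iter=1000))])
est.fit(X_tr, y_tr)

# Iris labels are already 0..2, so the label -> probability-column mapping is the identity.
mapper_dict = {0: 0, 1: 1, 2: 2}
print(get_preds_probas(est, X_te, y_te, mapper_dict).head())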
def get_score(dataset: np.ndarray, answers: np.ndarray, parametrs: int,
              model: base.ClassifierMixin, score_func) \
        -> Tuple[float, float, float, float, float]:
    # Keep only the k best features according to score_func
    selecter = feature_selection.SelectKBest(score_func=score_func, k=parametrs)
    selecter.fit(dataset, answers)
    transformed_dataset = selecter.transform(dataset)

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        transformed_dataset, answers, random_state=0, stratify=answers)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    simple_score = metrics.f1_score(y_test, prediction, average='weighted')

    # Rescale features to [0, 1] before cross-validation
    buffer_test = preprocessing.minmax_scale(transformed_dataset, feature_range=(0, 1), axis=0)
    nptraining = np.array(buffer_test, 'float32')
    nptarget = np.array(answers, 'float32')
    print('sample_score is done')

    k5_score = kfold_cv(5, nptraining, nptarget, model)
    print('k5_score is done')
    k10_score = kfold_cv(10, nptraining, nptarget, model)
    print('k10_score is done')
    k20_score = kfold_cv(20, nptraining, nptarget, model)
    print('k20_score is done')
    random_score = random_sampling_cv(nptraining, nptarget, model)

    return simple_score, k5_score, k10_score, k20_score, random_score
def random_sampling_cv(dataset: np.ndarray, answers: np.ndarray,
                       model: base.ClassifierMixin) -> float:
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        dataset, answers, shuffle=True, stratify=answers)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    f1_score = metrics.f1_score(y_test, prediction, average='weighted')
    return f1_score
def evaluate_on_datasets(predictor: ClassifierMixin, datasets):
    y_preds = []
    mean_kappa = []
    for i, (x, y_true) in enumerate(datasets):
        y_pred = predictor.predict(x)
        y_preds.append(y_pred)
        kappa_hold = cohen_kappa_score(y_true, y_pred, weights='quadratic')
        mean_kappa.append(kappa_hold)
    print(np.mean(mean_kappa), mean_kappa)
    return y_preds
def get_score(dataset: np.ndarray, answers: np.ndarray, parametrs: int,
              model: base.ClassifierMixin, score_func) \
        -> Tuple[float, float]:
    selecter = feature_selection.SelectKBest(score_func=score_func, k=parametrs)
    selecter.fit(dataset, answers)
    transformed_dataset = selecter.transform(dataset)

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        transformed_dataset, answers, test_size=0.25, random_state=0)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    simple_score = metrics.f1_score(y_test, prediction, average='binary')

    buffer_test = preprocessing.minmax_scale(dataset, feature_range=(0, 1), axis=0)
    nptraining = np.array(buffer_test, 'float32')
    nptarget = np.array(answers, 'float32')
    k5_score = kfold_cv(5, nptraining, nptarget, model, True)

    return simple_score, k5_score
def bootstrap_accuracy(
    f: ClassifierMixin,
    X,  # numpy array
    y,  # numpy array
    num_samples: int = 100,
    random_state: int = random.randint(0, 2 ** 32 - 1),
) -> List[float]:
    """
    Take the classifier ``f`` and compute its bootstrapped accuracy
    over the dataset ``X``, ``y``. Generate ``num_samples`` samples
    and seed the resampler with ``random_state``.
    """
    return bootstrap_measure(
        f,
        X,
        y,
        num_samples=num_samples,
        random_state=random_state,
        predict=lambda f, X: f.predict(X),
        measure=accuracy_score,
    )
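# Hedged sketch of the bootstrap_measure helper that bootstrap_accuracy above delegates to;
# the original implementation is not shown here. This version mirrors the inlined variant
# below (predict once, then score resampled prediction/truth pairs), so treat the exact
# signature and behaviour as assumptions.
from typing import Callable, List

from sklearn.base import ClassifierMixin
from sklearn.utils import resample


def bootstrap_measure(
    f: ClassifierMixin,
    X,
    y,
    num_samples: int,
    random_state: int,
    predict: Callable,
    measure: Callable,
) -> List[float]:
    # Predict once, then resample prediction/truth pairs together so they stay aligned.
    y_pred = predict(f, X)
    dist: List[float] = []
    for trial in range(num_samples):
        sample_pred, sample_truth = resample(y_pred, y, random_state=trial + random_state)
        dist.append(measure(y_true=sample_truth, y_pred=sample_pred))
    return dist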
def bootstrap_accuracy(
    f: ClassifierMixin,
    X,  # numpy array
    y,  # numpy array
    num_samples: int = 100,
    random_state: int = random.randint(0, 2**32 - 1),
) -> List[float]:
    """
    Take the classifier ``f`` and compute its bootstrapped accuracy
    over the dataset ``X``, ``y``. Generate ``num_samples`` samples
    and seed the resampler with ``random_state``.
    """
    dist: List[float] = []
    y_pred = f.predict(X)  # type:ignore (predict not on ClassifierMixin)
    # do the bootstrap:
    for trial in range(num_samples):
        sample_pred, sample_truth = resample(
            y_pred, y, random_state=trial + random_state)  # type:ignore
        score = accuracy_score(y_true=sample_truth, y_pred=sample_pred)  # type:ignore
        dist.append(score)
    return dist
def cross_validation(dataset: np.ndarray, answers: np.ndarray, model: base.ClassifierMixin,
                     cross_validator: model_selection.BaseCrossValidator,
                     save_worst_data: bool) -> float:
    iteration_counter: int = 0
    f1_score_value = 0
    worst_f1_score_value = 1.0
    worst_predicted = None
    worst_actual = None
    for train_index, test_index in cross_validator.split(dataset, answers):
        train_x, test_x = dataset[train_index], dataset[test_index]
        train_y, test_y = answers[train_index], answers[test_index]
        iteration_counter += 1
        # Train
        model.fit(train_x, train_y)
        # Test
        predicted = model.predict(test_x)
        # Evaluate
        f1_iteration_score_value = metrics.f1_score(test_y, predicted, average='weighted')
        if f1_iteration_score_value <= worst_f1_score_value:
            worst_f1_score_value = f1_iteration_score_value
            worst_predicted = predicted
            worst_actual = test_y
        f1_score_value += f1_iteration_score_value
    if save_worst_data:
        np.savetxt(RESULT_FILENAME + 'predicted.txt', worst_predicted)
        np.savetxt(RESULT_FILENAME + 'actual.txt', worst_actual)
    return f1_score_value / iteration_counter
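# Hedged sketch of the kfold_cv helper called by get_score above; the original is not shown,
# so the choice of splitter, the shuffling, and the optional save_worst_data flag are
# assumptions. It simply wraps cross_validation with a stratified K-fold splitter.
import numpy as np
from sklearn import base, model_selection


def kfold_cv(folds: int, dataset: np.ndarray, answers: np.ndarray,
             model: base.ClassifierMixin, save_worst_data: bool = False) -> float:
    splitter = model_selection.StratifiedKFold(n_splits=folds, shuffle=True, random_state=0)
    return cross_validation(dataset, answers, model, splitter, save_worst_data)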
def train_and_save(classifier: ClassifierMixin, dataset: str, transforms: List[str],
                   bundled: bool, test_proportion: float = 0.1) -> None:
    """
    Trains on the given dataset and saves the model.

    :param classifier: The classifier to train.
    :param dataset: The dataset to train on.
    :param transforms: The transforms to apply to the data.
    :param bundled: Whether to bundle chart classes together.
    :param test_proportion: The proportion of the dataset to use for testing.
    :return: None.
    """
    if not make_data(dataset, transforms, bundled):
        raise FileNotFoundError
    images, labels = np.load(f"{dataset}/X.npy"), np.load(f"{dataset}/Y.npy")
    X_train, X_test, Y_train, Y_test = \
        train_test_split(images, labels, test_size=test_proportion)
    classifier.fit(X_train, Y_train)
    pred = classifier.predict(X_test)
    print(classification_report(Y_test, pred))
    print(pd.DataFrame(confusion_matrix(Y_test, pred)))
    joblib.dump(classifier, f"{dataset}/model.joblib")
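# Hedged usage sketch for train_and_save above: the classifier, dataset directory, and
# transform names are illustrative assumptions; make_data must have produced X.npy and Y.npy
# under the dataset directory for the call to succeed.
from sklearn.svm import SVC

train_and_save(SVC(), dataset="charts", transforms=["grayscale"], bundled=True)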
def plot_decision_boundary(
    X: pd.DataFrame,
    y: pd.Series,
    clf: ClassifierMixin = sklearn.linear_model.LogisticRegression(),
    title: str = "Decision Boundary Logistic Regression",
    legend_title: str = "Legend",
    h: float = 0.05,
    figsize: tuple = (11.7, 8.27),
):
    """Generate a simple plot of the decision boundary of a classifier.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Classifier vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples)
        Target relative to X for classification. Datatype should be integers.
    clf : scikit-learn algorithm
        An object that has the `predict` and `predict_proba` methods.
    h : float (default: 0.05)
        Step size in the mesh.
    title : string
        Title for the plot.
    legend_title : string
        Legend title for the plot.
    figsize : tuple (default: (11.7, 8.27))
        Width and height of the figure in inches.

    Returns
    -------
    boundaries : Figure
        Properties of the figure can be changed later, e.g. use
        `boundaries.axes[0].set_ylim(0,100)` to change ylim.
    ax : Axes
        The axes associated with the boundaries Figure.

    Examples
    --------
    >>> import seaborn as sns
    >>> from sklearn.svm import SVC
    >>> data = sns.load_dataset("iris")
    >>> # convert the target from string to category to numeric as sklearn cannot handle strings as target
    >>> y = data["species"]
    >>> X = data[["sepal_length", "sepal_width"]]
    >>> clf = SVC(kernel="rbf", gamma=2, C=1, probability=True)
    >>> _ = plot_decision_boundary(X=X, y=y, clf=clf, title='Decision Boundary', legend_title="Species")
    """
    if X.shape[1] != 2:
        raise ValueError("X must contain only two features.")
    if not (pd.api.types.is_integer_dtype(y) or pd.api.types.is_object_dtype(y)
            or pd.api.types.is_categorical_dtype(y)):
        raise TypeError(
            "The target variable y can only have the following dtype: [int, object, category]."
        )

    label_0 = X.columns.tolist()[0]
    label_1 = X.columns.tolist()[1]

    X = X.copy()
    y = y.copy()
    X = X.values
    y = y.astype("category").cat.codes.values

    # full_col_list = list(sns.color_palette("husl", len(np.unique(y))))
    full_col_list = list(sns.color_palette())
    if len(np.unique(y)) > len(full_col_list):
        raise ValueError(
            "More labels in the data than colors in the color list. "
            "Either reduce the number of labels or extend the color list."
        )
    sub_col_list = full_col_list[0:len(np.unique(y))]
    cmap_bold = ListedColormap(sub_col_list)

    # Try to include a mapping in a later release (+ show categorical labels in the legend)
    _ = clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    Z_proba = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z_max = Z_proba.max(axis=1)  # Take the class with the highest probability
    Z_max = Z_max.reshape(xx.shape)

    # Put the result into a color plot
    boundaries, ax = plt.subplots(figsize=figsize)
    _ = ax.contour(xx, yy, Z, cmap=cmap_bold)
    _ = ax.scatter(xx, yy, s=(Z_max**2 / h), c=Z, cmap=cmap_bold, alpha=1, edgecolors="none")

    # Plot also the training points
    training = ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolors="black")
    _ = plt.xlim(xx.min(), xx.max())
    _ = plt.ylim(yy.min(), yy.max())
    _ = plt.title(title)
    _ = plt.subplots_adjust(right=0.8)
    _ = plt.xlabel(label_0)
    _ = plt.ylabel(label_1)

    # Add legend colors
    leg1 = plt.legend(
        *training.legend_elements(),
        frameon=False,
        fontsize=12,
        borderaxespad=0,
        bbox_to_anchor=(1, 0.5),
        handlelength=2,
        handletextpad=1,
        title=legend_title,
    )

    # Add legend sizes
    l1 = plt.scatter([], [], c="black", s=0.4**2 / h, edgecolors="none")
    l2 = plt.scatter([], [], c="black", s=0.6**2 / h, edgecolors="none")
    l3 = plt.scatter([], [], c="black", s=0.8**2 / h, edgecolors="none")
    l4 = plt.scatter([], [], c="black", s=1**2 / h, edgecolors="none")
    labels = ["0.4", "0.6", "0.8", "1"]
    _ = plt.legend(
        [l1, l2, l3, l4],
        labels,
        frameon=False,
        fontsize=12,
        borderaxespad=0,
        bbox_to_anchor=(1, 1),
        handlelength=2,
        handletextpad=1,
        title="Probabilities",
        scatterpoints=1,
    )
    _ = plt.gca().add_artist(leg1)
    return boundaries, ax