def get_score(dataset: np.ndarray, answers: np.ndarray, parametrs: int,
              model: base.ClassifierMixin, score_func) \
        -> Tuple[float, float, float, float, float]:
    # Select the k best features, then evaluate the model with a single
    # stratified split and with several cross-validation schemes.
    selecter = feature_selection.SelectKBest(score_func=score_func, k=parametrs)
    selecter.fit(dataset, answers)
    transformed_dataset = selecter.transform(dataset)

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        transformed_dataset, answers, random_state=0, stratify=answers)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    simple_score = metrics.f1_score(y_test, prediction, average='weighted')

    # Rescale features to [0, 1] before the cross-validation runs.
    buffer_test = preprocessing.minmax_scale(transformed_dataset, feature_range=(0, 1), axis=0)
    nptraining = np.array(buffer_test, 'float32')
    nptarget = np.array(answers, 'float32')
    print('simple_score is done')

    k5_score = kfold_cv(5, nptraining, nptarget, model)
    print('k5_score is done')
    k10_score = kfold_cv(10, nptraining, nptarget, model)
    print('k10_score is done')
    k20_score = kfold_cv(20, nptraining, nptarget, model)
    print('k20_score is done')
    random_score = random_sampling_cv(nptraining, nptarget, model)

    return simple_score, k5_score, k10_score, k20_score, random_score
def _train(train_data: DataFrame, classifier: ClassifierMixin, clusterer: Clustering) -> dict:
    # Train one classifier per cluster; fall back to partial_fit for
    # incremental models that do not support a plain fit on a DataFrame.
    models = dict()
    train_data = clusterer.cluster_data(train_data)
    for cluster in range(clusterer.n_clusters):
        cluster_train_df = train_data[cluster]
        if not cluster_train_df.empty:
            cluster_targets_df = DataFrame(cluster_train_df['label'])
            try:
                classifier.fit(cluster_train_df.drop('label', axis=1),
                               cluster_targets_df.values.ravel())
            except (NotImplementedError, KeyError):
                classifier.partial_fit(cluster_train_df.drop('label', axis=1).values,
                                       cluster_targets_df.values.ravel())
            except Exception as exception:
                raise exception
            models[cluster] = classifier
            try:
                classifier = clone(classifier)
            except TypeError:
                classifier = clone(classifier, safe=False)
                classifier.reset()
    return {
        ModelType.CLUSTERER.value: clusterer,
        ModelType.CLASSIFIER.value: models
    }
def random_sampling_cv(dataset: np.ndarray, answers: np.ndarray,
                       model: base.ClassifierMixin) -> float:
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        dataset, answers, shuffle=True, stratify=answers)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    f1_score = metrics.f1_score(y_test, prediction, average='weighted')
    return f1_score
def get_score(
    model: ClassifierMixin,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
) -> float:
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score
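# Hypothetical usage sketch for get_score above (not part of the original
# source): assumes a standard scikit-learn classifier and the iris dataset,
# with pandas/sklearn available in this module.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

iris = load_iris(as_frame=True)
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, random_state=0)
print(get_score(LogisticRegression(max_iter=1000), X_tr, y_tr, X_te, y_te))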
def test_classifier_without_classes_attribute(
        estimator: ClassifierMixin) -> None:
    """
    Test that a prefitted classifier without a 'classes_' attribute raises an error.
    """
    estimator.fit(X_toy, y_toy)
    if isinstance(estimator, Pipeline):
        delattr(estimator[-1], "classes_")
    else:
        delattr(estimator, "classes_")
    mapie = MapieClassifier(estimator=estimator, cv="prefit")
    with pytest.raises(AttributeError, match=r".*does not contain 'classes_'.*"):
        mapie.fit(X_toy, y_toy)
def train_model(model: ClassifierMixin, data_time_range: List[str], output_path: str):
    es_host = ESConnection(es_host='http://localhost:9200')
    dataset = ml_utils.get_data(start_time=data_time_range[0],
                                end_time=data_time_range[1],
                                es_host=es_host)
    dataset.to_pickle('data/dataset.pkl')
    dataset = pd.read_pickle('data/dataset.pkl')
    print(len(dataset.columns))

    y = dataset['target']
    X = dataset.drop(columns=['target'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=17)

    print('Training model')
    model = model.fit(X_train, y_train)
    print('Finished training')

    prediction = model.predict(X_test)
    print(confusion_matrix(y_test, prediction))

    dump(model, output_path + '/' + type(model).__name__ + '.joblib')
def ml_cross_val_score(
        classifier: ClassifierMixin,
        X: pd.DataFrame,
        y: pd.Series,
        cv_gen: BaseCrossValidator,
        sample_weight_train: np.ndarray = None,
        sample_weight_score: np.ndarray = None,
        scoring: Callable[[np.array, np.array], float] = log_loss):
    # pylint: disable=invalid-name
    # pylint: disable=comparison-with-callable
    """
    Advances in Financial Machine Learning, Snippet 7.4, page 110.

    Using the PurgedKFold Class.

    Function to run a cross-validation evaluation of a classifier, using sample weights and a custom CV generator.

    Note: This function differs from the book in that it requires the user to pass a CV object. The book accepts
    a None value as a default and then resorts to using PurgedCV, which also meant that extra arguments had to be
    passed to the function. To correct this we have removed the default and require the user to pass a CV object
    to the function.

    Example:

    .. code-block:: python

        cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=samples_info_sets, pct_embargo=pct_embargo)
        scores_array = ml_cross_val_score(classifier, X, y, cv_gen, sample_weight_train=sample_train,
                                          sample_weight_score=sample_score, scoring=accuracy_score)

    :param classifier: (ClassifierMixin) A sk-learn Classifier object instance.
    :param X: (pd.DataFrame) The dataset of records to evaluate.
    :param y: (pd.Series) The labels corresponding to the X dataset.
    :param cv_gen: (BaseCrossValidator) Cross Validation generator object instance.
    :param sample_weight_train: (np.array) Sample weights used to train the model for each record in the dataset.
    :param sample_weight_score: (np.array) Sample weights used to evaluate the model quality.
    :param scoring: (Callable) A metric scoring, can be custom sklearn metric.
    :return: (np.array) The computed score.
    """

    # If no sample_weight then broadcast a value of 1 to all samples (full weight).
    if sample_weight_train is None:
        sample_weight_train = np.ones((X.shape[0],))

    if sample_weight_score is None:
        sample_weight_score = np.ones((X.shape[0],))

    # Score model on KFolds
    ret_scores = []
    for train, test in cv_gen.split(X=X, y=y):
        fit = classifier.fit(X=X.iloc[train, :], y=y.iloc[train], sample_weight=sample_weight_train[train])
        if scoring == log_loss:
            prob = fit.predict_proba(X.iloc[test, :])
            score = -1 * scoring(y.iloc[test], prob, sample_weight=sample_weight_score[test],
                                 labels=classifier.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score = scoring(y.iloc[test], pred, sample_weight=sample_weight_score[test])
        ret_scores.append(score)

    return np.array(ret_scores)
def ml_cross_val_score(
        classifier: ClassifierMixin,
        X: pd.DataFrame,
        y: pd.Series,
        cv_gen: BaseCrossValidator,
        sample_weight: np.ndarray = None,
        scoring: str = 'neg_log_loss'):
    # pylint: disable=invalid-name
    """
    Snippet 7.4, page 110, Using the PurgedKFold Class.

    Function to run a cross-validation evaluation of a classifier, using sample weights and a custom CV generator.

    Note: This function differs from the book in that it requires the user to pass a CV object. The book accepts
    a None value as a default and then resorts to using PurgedCV, which also meant that extra arguments had to be
    passed to the function. To correct this we have removed the default and require the user to pass a CV object
    to the function.

    Example:

    .. code-block:: python

        cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=samples_info_sets, pct_embargo=pct_embargo)
        scores_array = ml_cross_val_score(classifier, X, y, cv_gen, sample_weight=None, scoring='neg_log_loss')

    :param classifier: A sk-learn Classifier object instance.
    :param X: The dataset of records to evaluate.
    :param y: The labels corresponding to the X dataset.
    :param cv_gen: Cross Validation generator object instance.
    :param sample_weight: A numpy array of weights for each record in the dataset.
    :param scoring: A metric name to use for scoring; currently supports `neg_log_loss`, `accuracy`, `f1`,
        `precision`, `recall`, and `roc_auc`.
    :return: The computed score as a numpy array.
    """

    # Define scoring metrics
    scoring_func_dict = {'neg_log_loss': log_loss,
                         'accuracy': accuracy_score,
                         'f1': f1_score,
                         'precision': precision_score,
                         'recall': recall_score,
                         'roc_auc': roc_auc_score}

    try:
        scoring_func = scoring_func_dict[scoring]
    except KeyError:
        raise ValueError('Wrong scoring method. Select from: neg_log_loss, accuracy, f1, precision, recall, roc_auc')

    # If no sample_weight then broadcast a value of 1 to all samples (full weight).
    if sample_weight is None:
        sample_weight = np.ones((X.shape[0],))

    # Score model on KFolds
    ret_scores = []
    for train, test in cv_gen.split(X=X, y=y):
        fit = classifier.fit(X=X.iloc[train, :], y=y.iloc[train], sample_weight=sample_weight[train])
        if scoring == 'neg_log_loss':
            prob = fit.predict_proba(X.iloc[test, :])
            score = -1 * scoring_func(y.iloc[test], prob, sample_weight=sample_weight[test],
                                      labels=classifier.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score = scoring_func(y.iloc[test], pred, sample_weight=sample_weight[test])
        ret_scores.append(score)

    return np.array(ret_scores)
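# Hypothetical quick check for ml_cross_val_score above (not part of the
# original source): any sklearn BaseCrossValidator satisfies the cv_gen
# interface, so a plain KFold can stand in for PurgedKFold when no purging
# or embargo is needed. Synthetic data is used purely for illustration.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
X_demo, y_demo = pd.DataFrame(X_demo), pd.Series(y_demo)
demo_scores = ml_cross_val_score(RandomForestClassifier(n_estimators=50, random_state=0),
                                 X_demo, y_demo, cv_gen=KFold(n_splits=5), scoring='accuracy')
print(demo_scores.mean())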
def get_score(dataset: np.ndarray, answers: np.ndarray, parametrs: int,
              model: base.ClassifierMixin, score_func) \
        -> Tuple[float, float]:
    # Select the k best features, then score the model on a single split
    # and with 5-fold cross-validation.
    selecter = feature_selection.SelectKBest(score_func=score_func, k=parametrs)
    selecter.fit(dataset, answers)
    transformed_dataset = selecter.transform(dataset)

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        transformed_dataset, answers, test_size=0.25, random_state=0)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    simple_score = metrics.f1_score(y_test, prediction, average='binary')

    buffer_test = preprocessing.minmax_scale(dataset, feature_range=(0, 1), axis=0)
    nptraining = np.array(buffer_test, 'float32')
    nptarget = np.array(answers, 'float32')
    k5_score = kfold_cv(5, nptraining, nptarget, model, True)

    return simple_score, k5_score
def cross_validation(dataset: np.ndarray, answers: np.ndarray, model: base.ClassifierMixin,
                     cross_validator: model_selection.BaseCrossValidator,
                     save_worst_data: bool) -> float:
    iteration_counter: int = 0
    f1_score_value = 0
    worst_f1_score_value = 1.0
    worst_predicted = None
    worst_actual = None

    for train_index, test_index in cross_validator.split(dataset, answers):
        train_x, test_x = dataset[train_index], dataset[test_index]
        train_y, test_y = answers[train_index], answers[test_index]
        iteration_counter += 1

        # Train
        model.fit(train_x, train_y)

        # Test
        predicted = model.predict(test_x)

        # Evaluate
        f1_iteration_score_value = metrics.f1_score(test_y, predicted, average='weighted')
        if f1_iteration_score_value <= worst_f1_score_value:
            worst_f1_score_value = f1_iteration_score_value
            worst_predicted = predicted
            worst_actual = test_y
        f1_score_value += f1_iteration_score_value

    if save_worst_data:
        np.savetxt(RESULT_FILENAME + 'predicted.txt', worst_predicted)
        np.savetxt(RESULT_FILENAME + 'actual.txt', worst_actual)

    return f1_score_value / iteration_counter
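# The kfold_cv helper called by get_score is not defined in this section.
# A minimal sketch, assuming it wraps cross_validation above with a stratified
# K-fold splitter; the signature is inferred from the call sites and is an
# assumption, not the original implementation.
def kfold_cv(k: int, dataset: np.ndarray, answers: np.ndarray,
             model: base.ClassifierMixin, save_worst_data: bool = False) -> float:
    splitter = model_selection.StratifiedKFold(n_splits=k, shuffle=True, random_state=0)
    return cross_validation(dataset, answers, model, splitter, save_worst_data)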
def train_and_save(classifier: ClassifierMixin, dataset: str, transforms: List[str],
                   bundled: bool, test_proportion: float = 0.1) -> None:
    """
    Trains on the given dataset and saves the model.

    :param classifier: The classifier to train.
    :param dataset: The dataset to train on.
    :param transforms: The transforms to apply to the data.
    :param bundled: Whether to bundle chart classes together.
    :param test_proportion: What proportion of the dataset to use for testing.
    :return: None.
    """
    if not make_data(dataset, transforms, bundled):
        raise FileNotFoundError

    images, labels = np.load(f"{dataset}/X.npy"), np.load(f"{dataset}/Y.npy")
    X_train, X_test, Y_train, Y_test = \
        train_test_split(images, labels, test_size=test_proportion)

    classifier.fit(X_train, Y_train)
    pred = classifier.predict(X_test)

    print(classification_report(Y_test, pred))
    print(pd.DataFrame(confusion_matrix(Y_test, pred)))

    joblib.dump(classifier, f"{dataset}/model.joblib")
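# Hypothetical call shape for train_and_save above (not part of the original
# source): the dataset directory name is a placeholder, and the available
# transform names depend on the project's make_data(), so an empty list is
# passed here.
from sklearn.ensemble import RandomForestClassifier

train_and_save(RandomForestClassifier(n_estimators=100),
               dataset="charts", transforms=[], bundled=False)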
def plot_decision_boundary(
    X: pd.DataFrame,
    y: pd.Series,
    clf: ClassifierMixin = sklearn.linear_model.LogisticRegression(),
    title: str = "Decision Boundary Logistic Regression",
    legend_title: str = "Legend",
    h: float = 0.05,
    figsize: tuple = (11.7, 8.27),
):
    """Generate a simple plot of the decision boundary of a classifier.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples)
        Target relative to X for classification. Datatype should be integers.
    clf : scikit-learn algorithm
        An object that has the `predict` and `predict_proba` methods.
    title : string
        Title for the plot.
    legend_title : string
        Legend title for the plot.
    h : float (default: 0.05)
        Step size in the mesh.
    figsize : tuple (default: (11.7, 8.27))
        Width and height of the figure in inches.

    Returns
    -------
    boundaries : Figure
        Properties of the figure can be changed later, e.g. use
        `boundaries.axes[0].set_ylim(0,100)` to change ylim.
    ax : Axes
        The axes associated with the boundaries Figure.

    Examples
    --------
    >>> import seaborn as sns
    >>> from sklearn.svm import SVC
    >>> data = sns.load_dataset("iris")
    >>> # convert the target from string to category to numeric as sklearn cannot handle strings as target
    >>> y = data["species"]
    >>> X = data[["sepal_length", "sepal_width"]]
    >>> clf = SVC(kernel="rbf", gamma=2, C=1, probability=True)
    >>> _ = plot_decision_boundary(X=X, y=y, clf=clf, title='Decision Boundary', legend_title="Species")
    """
    if X.shape[1] != 2:
        raise ValueError("X must contain exactly two features.")
    if not (pd.api.types.is_integer_dtype(y) or pd.api.types.is_object_dtype(y)
            or pd.api.types.is_categorical_dtype(y)):
        raise TypeError(
            "The target variable y can only have the following dtype: [int, object, category]."
        )

    label_0 = X.columns.tolist()[0]
    label_1 = X.columns.tolist()[1]

    X = X.copy()
    y = y.copy()
    X = X.values
    y = y.astype("category").cat.codes.values

    # full_col_list = list(sns.color_palette("husl", len(np.unique(y))))
    full_col_list = list(sns.color_palette())

    if len(np.unique(y)) > len(full_col_list):
        raise ValueError(
            "More labels in the data than colors in the color list. "
            "Either reduce the number of labels or extend the color list."
        )

    sub_col_list = full_col_list[0:len(np.unique(y))]
    cmap_bold = ListedColormap(sub_col_list)

    # Try to include a mapping in a later release (+ show categorical labels in the legend)
    _ = clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    Z_proba = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z_max = Z_proba.max(axis=1)  # Take the class with the highest probability
    Z_max = Z_max.reshape(xx.shape)

    # Put the result into a color plot
    boundaries, ax = plt.subplots(figsize=figsize)
    _ = ax.contour(xx, yy, Z, cmap=cmap_bold)
    _ = ax.scatter(xx, yy, s=(Z_max ** 2 / h), c=Z, cmap=cmap_bold, alpha=1, edgecolors="none")

    # Plot also the training points
    training = ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolors="black")
    _ = plt.xlim(xx.min(), xx.max())
    _ = plt.ylim(yy.min(), yy.max())
    _ = plt.title(title)
    _ = plt.subplots_adjust(right=0.8)
    _ = plt.xlabel(label_0)
    _ = plt.ylabel(label_1)

    # Add legend colors
    leg1 = plt.legend(
        *training.legend_elements(),
        frameon=False,
        fontsize=12,
        borderaxespad=0,
        bbox_to_anchor=(1, 0.5),
        handlelength=2,
        handletextpad=1,
        title=legend_title,
    )

    # Add legend sizes
    l1 = plt.scatter([], [], c="black", s=0.4 ** 2 / h, edgecolors="none")
    l2 = plt.scatter([], [], c="black", s=0.6 ** 2 / h, edgecolors="none")
    l3 = plt.scatter([], [], c="black", s=0.8 ** 2 / h, edgecolors="none")
    l4 = plt.scatter([], [], c="black", s=1 ** 2 / h, edgecolors="none")

    labels = ["0.4", "0.6", "0.8", "1"]
    _ = plt.legend(
        [l1, l2, l3, l4],
        labels,
        frameon=False,
        fontsize=12,
        borderaxespad=0,
        bbox_to_anchor=(1, 1),
        handlelength=2,
        handletextpad=1,
        title="Probabilities",
        scatterpoints=1,
    )
    _ = plt.gca().add_artist(leg1)

    return boundaries, ax