def train_model(model: ClassifierMixin, data_time_range: List[str], output_path: str):
    """Fetch a dataset from Elasticsearch, train ``model`` on it and save the result.

    :param model: unfitted sklearn-style classifier; fit here and dumped to disk.
    :param data_time_range: two-element ``[start_time, end_time]`` window used to
        query the training data.
    :param output_path: directory into which the trained model is written as
        ``<ClassName>.joblib``.
    """
    import os

    es_host = ESConnection(es_host='http://localhost:9200')
    dataset = ml_utils.get_data(start_time=data_time_range[0],
                                end_time=data_time_range[1],
                                es_host=es_host)
    # Snapshot the raw dataset, then reload it so training uses exactly the
    # bytes that were cached on disk.
    dataset.to_pickle('data/dataset.pkl')
    dataset = pd.read_pickle('data/dataset.pkl')
    print(len(dataset.columns))

    y = dataset['target']
    X = dataset.drop(columns=['target'])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=17)

    print('Training model')
    model = model.fit(X_train, y_train)
    print('Finished training')

    prediction = model.predict(X_test)
    print(confusion_matrix(y_test, prediction))

    # os.path.join is robust to a trailing separator in output_path, unlike
    # the previous manual "output_path + '/'" concatenation.
    dump(model, os.path.join(output_path, type(model).__name__ + '.joblib'))
def get_score(dataset: np.array, answers: np.array, parametrs: int,
              model: base.ClassifierMixin, score_func) \
        -> (float, float, float, float, float):
    """Score ``model`` on the ``parametrs`` best features of ``dataset``.

    Pipeline: select the k=``parametrs`` best features with SelectKBest and
    ``score_func``, fit/score on one stratified split, then rescale the
    selected features to [0, 1] and run 5/10/20-fold and random-sampling
    cross-validation via the module's ``kfold_cv`` / ``random_sampling_cv``.

    Returns a 5-tuple:
    (single-split weighted F1, 5-fold score, 10-fold score, 20-fold score,
    random-sampling score).
    """
    selecter = feature_selection.SelectKBest(score_func=score_func, k=parametrs)
    selecter.fit(dataset, answers)
    transformed_dataset = selecter.transform(dataset)
    # Single stratified hold-out split for the "simple" score.
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        transformed_dataset, answers, random_state=0, stratify=answers)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    simple_score = metrics.f1_score(y_test, prediction, average='weighted')
    # The cross-validation runs use min-max-scaled copies of the selected
    # features, cast to float32.
    buffer_test = preprocessing.minmax_scale(transformed_dataset,
                                             feature_range=(0, 1), axis=0)
    nptraining = np.array(buffer_test, 'float32')
    nptarget = np.array(answers, 'float32')
    print('sample_score is done')
    k5_score = kfold_cv(5, nptraining, nptarget, model)
    print('k5_score is done')
    k10_score = kfold_cv(10, nptraining, nptarget, model)
    print('k10_score is done')
    k20_score = kfold_cv(20, nptraining, nptarget, model)
    print('k20_score is done')
    random_score = random_sampling_cv(nptraining, nptarget, model)
    return simple_score, k5_score, k10_score, k20_score, random_score
def __init__(self, estimator=None, max_depth=20, min_samples_split=2,
             min_samples_leaf=2, min_weight_fraction_leaf=0.0,
             fit_improve_algo='auto', p1p2=0.09, gamma=1., verbose=0):
    """Initialise the decision-tree-of-logistic-regressions classifier.

    :param estimator: node estimator; a plain ``LogisticRegression`` when None
    :param max_depth: maximum tree depth, must not be None and must be <= 1024
    :param min_samples_split: minimum samples required to split a node
    :param min_samples_leaf: minimum samples required in a leaf
    :param min_weight_fraction_leaf: minimum weighted fraction in a leaf
    :param fit_improve_algo: one of
        ``DecisionTreeLogisticRegression._fit_improve_algo_values``
    :param p1p2: threshold parameter used by the fit-improve step
    :param gamma: weighting coefficient
    :param verbose: verbosity level
    """
    ClassifierMixin.__init__(self)
    BaseEstimator.__init__(self)
    # logistic regression is the default node model
    if estimator is None:
        self.estimator = LogisticRegression()
    else:
        self.estimator = estimator
    # Validate depth before storing any hyper-parameter.
    if max_depth is None:
        raise ValueError("'max_depth' cannot be None.")
    if max_depth > 1024:
        raise ValueError("'max_depth' must be <= 1024.")
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.min_samples_leaf = min_samples_leaf
    self.min_weight_fraction_leaf = min_weight_fraction_leaf
    self.fit_improve_algo = fit_improve_algo
    self.p1p2 = p1p2
    self.gamma = gamma
    self.verbose = verbose
    # Reject unknown fit-improve strategies early, at construction time.
    if self.fit_improve_algo not in DecisionTreeLogisticRegression._fit_improve_algo_values:
        raise ValueError("fit_improve_algo='{}' not in {}".format(
            self.fit_improve_algo,
            DecisionTreeLogisticRegression._fit_improve_algo_values))
def __init__(self, estimator=None, clus=None, **kwargs):
    """
    @param      estimator   :epkg:`sklearn:linear_model:LogisticRegression`
                            by default
    @param      clus        clustering applied on each class,
                            by default k-means with two classes
    @param      kwargs      sent to :meth:`set_params
                            <mlinsights.mlmodel.classification_kmeans.
                            ClassifierAfterKMeans.set_params>`, see its
                            documentation to understand how to specify
                            parameters

    ``clus`` must expose a ``transform`` method (checked below); k-means does.
    """
    ClassifierMixin.__init__(self)
    BaseEstimator.__init__(self)
    # Fall back to the documented defaults when nothing is supplied.
    if estimator is None:
        estimator = LogisticRegression()
    if clus is None:
        clus = KMeans(n_clusters=2)
    self.estimator = estimator
    self.clus = clus
    if not hasattr(clus, "transform"):
        raise AttributeError(  # pragma: no cover
            "clus does not have a transform method.")
    # Remaining keyword arguments are treated as estimator/cluster parameters.
    if kwargs:
        self.set_params(**kwargs)
def __init__(self, estimator=None, threshold=0.75):
    """Store the underlying estimator and the decision threshold.

    @param estimator    base binary classifier; when None, a liblinear
                        ``LogisticRegression`` is used
    @param threshold    probability threshold kept for later use
    """
    ClassifierMixin.__init__(self)
    BaseEstimator.__init__(self)
    if estimator is None:
        chosen = LogisticRegression(solver='liblinear')
    else:
        chosen = estimator
    self.estimator = chosen
    self.threshold = threshold
def __init__(self, ixname='ix', source_suffix='source', target_suffix='target', **kwargs):
    """Initialise the cluster-based explorer.

    Args:
        ixname (str): name of the index column
        source_suffix (str): suffix identifying source-side columns
        target_suffix (str): suffix identifying target-side columns
        **kwargs: accepted for signature compatibility; not used here
    """
    ClassifierMixin.__init__(self)
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = concatixnames(
        ixname=self.ixname,
        source_suffix=self.source_suffix,
        target_suffix=self.target_suffix)
    # Cluster bookkeeping, all populated later (by fit):
    self.clusters = None      # cluster assignments
    self.n_clusters = None    # number of unique clusters
    self.nomatch = None       # clusters where no match has been found
    self.allmatch = None      # clusters where all elements are positive matches
    self.mixedmatch = None    # clusters mixing matches and non-matches
    self.notfound = None      # clusters not found (added to no-match)
    self.fitted = False
    # NOTE: removed a dead trailing `pass` statement.
def __init__(self, transformer, classifier, ixname='ix', source_suffix='source',
             target_suffix='target', **kwargs):
    """Pair classifier: ``transformer`` builds features, ``classifier`` scores them.

    Args:
        transformer (TransformerMixin): feature builder applied to the pairs
        classifier (ClassifierMixin): estimator trained on transformed pairs
        ixname (str): name of the index column
        source_suffix (str): suffix identifying source-side columns
        target_suffix (str): suffix identifying target-side columns
        **kwargs: accepted for signature compatibility; not used here.
            (The previous docstring documented ``n_jobs`` and ``pruning_ths``,
            which are not parameters of this method.)
    """
    ClassifierMixin.__init__(self)
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = concatixnames(
        ixname=self.ixname,
        source_suffix=self.source_suffix,
        target_suffix=self.target_suffix)
    self.fitted = False
    self.transformer = transformer
    self.classifier = classifier
    # NOTE: removed a dead trailing `pass` statement.
def __init__(self, binner=None, estimator=None, n_jobs=None,
             random_state=None, verbose=False):
    """
    @param      binner          transformer or predictor which creates the buckets
    @param      estimator       predictor trained on every bucket
    @param      n_jobs          number of parallel jobs (for training and predicting)
    @param      random_state    to pick up random examples when buckets do not
                                contain enough examples of each class
    @param      verbose         boolean or use ``'tqdm'`` to use :epkg:`tqdm`
                                to fit the estimators

    *binner* allows the following values:

    - ``tree``: the model is :epkg:`sklearn:tree:DecisionTreeClassifier`
    - ``'bins'``: the model :epkg:`sklearn:preprocessing:KBinsDiscretizer`
    - any instantiated model

    *estimator* allows the following values:

    - ``None``: the model is :epkg:`sklearn:linear_model:LogisticRegression`
    - any instantiated model
    """
    # Resolve the documented defaults before delegating to the base class.
    if estimator is None:
        estimator = LogisticRegression()
    # Both the 'tree' keyword and None map to a decision-tree binner.
    if binner in ('tree', None):
        binner = DecisionTreeClassifier(min_samples_leaf=5)
    ClassifierMixin.__init__(self)
    PiecewiseEstimator.__init__(
        self, binner=binner, estimator=estimator, n_jobs=n_jobs, verbose=verbose)
    self.random_state = random_state
def __init__(self, classifier, ixname='ix', source_suffix='source',
             target_suffix='target', **kwargs):
    """Wrapper that lets a classifier train on partial data.

    X and y may not share the same index (because of pruning steps, ...);
    the wrapped classifier is fit on the common index only.

    Args:
        classifier (ClassifierMixin): classifier to use; should be the
            output of the pipeline
        ixname (str): name of the index column
        source_suffix (str): suffix identifying source-side columns
        target_suffix (str): suffix identifying target-side columns
        **kwargs: accepted for signature compatibility; not used here
    """
    ClassifierMixin.__init__(self)
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = concatixnames(
        ixname=self.ixname,
        source_suffix=self.source_suffix,
        target_suffix=self.target_suffix)
    self.fitted = False
    self.classifier = classifier
    # NOTE: removed a dead trailing `pass` statement.
def __init__(self, connector, pruningclf, sbsmodel, classifier, ixname='ix',
             source_suffix='source', target_suffix='target', **kwargs):
    """End-to-end matching pipeline: connect, prune, score side-by-side, classify.

    Args:
        connector (ConnectorMixin): connector (scorer) used to do the calculation
        pruningclf (Explorer): classifier used to do the pruning
            (0 = no match, 1 = potential match, 2 = sure match)
        sbsmodel (TransformerMixin): side-by-side scorer; can be a
            FeatureUnion, Pipeline, ...
        classifier (ClassifierMixin): classifier used to do the prediction
        ixname (str): name of the index column, e.g. 'ix'
        source_suffix (str): e.g. 'left'
        target_suffix (str): e.g. 'right'
        **kwargs: accepted for signature compatibility; not used here
    """
    ClassifierMixin.__init__(self)
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = concatixnames(
        ixname=self.ixname,
        source_suffix=self.source_suffix,
        target_suffix=self.target_suffix)
    self.fitted = False
    self.connector = connector
    self.pruningclf = pruningclf
    self.sbsmodel = sbsmodel
    self.classifier = classifier
    # NOTE: removed a dead trailing `pass` statement.
def get_preds_probas(est: ClassifierMixin, X_test: DataFrame, y_test: Series,
                     mapper_dict: Dict) -> DataFrame:
    """
    Get prediction probabilities (if available) or return true and predicted
    labels.

    ``est`` is expected to be a fitted Pipeline whose final step is named
    "clf" (see the ``named_steps["clf"]`` access below). ``mapper_dict`` maps
    predicted labels to the column positions used by ``predict_proba``
    — presumably label -> class index; verify against the caller.
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)
        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        # One probability column per class observed in y_test.
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(0, len(np.unique(y_test)))
        ]
        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)
        # Sanity check: the argmax of predict_proba must agree with predict()
        # once labels are mapped to column numbers via mapper_dict.
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all()
    else:
        df_summ = df_preds.copy()
    # Get true label as the first column of the summary.
    df_summ.insert(0, "true_label", y_test)
    return df_summ
def random_sampling_cv(dataset: np.ndarray, answers: np.ndarray,
                       model: base.ClassifierMixin) -> float:
    """Score ``model`` on a single shuffled, stratified train/test split.

    Returns the weighted F1 score on the held-out portion.
    """
    train_x, test_x, train_y, test_y = model_selection.train_test_split(
        dataset, answers, shuffle=True, stratify=answers)
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)
    return metrics.f1_score(test_y, predicted, average='weighted')
def get_score(
        model: ClassifierMixin,
        X_train: pd.DataFrame,
        y_train: pd.Series,
        X_test: pd.DataFrame,
        y_test: pd.Series,
) -> float:
    """Fit ``model`` on the training split and return its score on the test split.

    The return annotation was ``int`` but ``score`` returns the estimator's
    default metric as a float (mean accuracy for sklearn classifiers).

    :param model: unfitted classifier; fit in place.
    :param X_train: training features.
    :param y_train: training labels.
    :param X_test: test features.
    :param y_test: test labels.
    :return: the test-set score as a float.
    """
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score
def test_classifier_without_classes_attribute(
        estimator: ClassifierMixin) -> None:
    """
    Test that prefitted classifier without 'classes_ 'attribute raises error.
    """
    estimator.fit(X_toy, y_toy)
    # Strip the attribute sklearn sets during fit; for a Pipeline it lives on
    # the final step, not on the pipeline object itself.
    if isinstance(estimator, Pipeline):
        delattr(estimator[-1], "classes_")
    else:
        delattr(estimator, "classes_")
    # cv="prefit" makes MapieClassifier use the (now broken) estimator as-is,
    # so its fit must detect the missing attribute and raise.
    mapie = MapieClassifier(estimator=estimator, cv="prefit")
    with pytest.raises(AttributeError,
                       match=r".*does not contain 'classes_'.*"):
        mapie.fit(X_toy, y_toy)
def _predict_oof_model(
    self,
    estimator: ClassifierMixin,
    X: ArrayLike,
) -> NDArray:
    """
    Predict probabilities of a test set from a fitted estimator.

    Parameters
    ----------
    estimator : ClassifierMixin
        Fitted estimator.
    X : ArrayLike
        Test set.

    Returns
    -------
    ArrayLike
        Predicted probabilities, with one column per label seen in y.
    """
    probabilities = estimator.predict_proba(X)
    # The fold estimator may have seen fewer classes than the full dataset;
    # in that case pad the probability matrix to cover every label.
    if len(estimator.classes_) != self.n_classes_:
        probabilities = self._fix_number_of_classes(
            estimator.classes_, probabilities
        )
    return probabilities
def test_probabilities(model: ClassifierMixin, X: np.array, y: pd.Series,
                       bins: int = 10, threshold: float = 0.5):
    """Print confusion matrix based on class probability.

    Buckets the positive-class probabilities into ``bins`` equal-width bins
    and prints, per bucket, the number of positive/negative labels and the
    bucket accuracy (below ``threshold`` a 0-label counts as correct,
    at/above it a 1-label does).
    """
    # Probability of the positive class (column 1 of predict_proba).
    probs = [p[1] for p in model.predict_proba(X)]
    print('\tProbabilities')
    df = pd.DataFrame({'prob': probs, 'label': y})
    step = 1 / bins
    # BUG FIX: labels were generated with range(10) regardless of ``bins``,
    # so pd.cut raised for any bins != 10. Generate one label per bucket,
    # rounded to 3 decimals so labels remain unique for finer binnings
    # (values identical to the old output for the default bins=10).
    cut_labels = [round(step * f, 3) for f in range(bins)]
    by_prob = (df.groupby(pd.cut(df['prob'], bins, labels=cut_labels))
               .agg(['sum', 'count'])['label'])
    print('\t\tprobs\t1\t0\tacc')
    for index, row in by_prob.iloc[::-1].iterrows():
        ones = row['sum']
        # Empty buckets aggregate to NaN; treat them as zero positives.
        if math.isnan(ones):
            ones = 0
        else:
            ones = int(ones)
        count = row['count']
        zeros = int(count) - ones
        if count > 0:
            acc = zeros / count if index < threshold else ones / count
        else:
            acc = 0.0
        print(f'\t\t{index}\t{ones}\t{zeros}\t{acc:.3f}')
def ml_cross_val_score(
        classifier: ClassifierMixin,
        X: pd.DataFrame,
        y: pd.Series,
        cv_gen: BaseCrossValidator,
        sample_weight_train: np.ndarray = None,
        sample_weight_score: np.ndarray = None,
        scoring: Callable[[np.array, np.array], float] = log_loss):
    # pylint: disable=invalid-name
    # pylint: disable=comparison-with-callable
    """
    Advances in Financial Machine Learning, Snippet 7.4, page 110.

    Using the PurgedKFold Class.

    Function to run a cross-validation evaluation of the using sample weights and a custom CV generator.

    Note: This function is different to the book in that it requires the user to pass through a CV object. The book
    will accept a None value as a default and then resort to using PurgedCV, this also meant that extra arguments had to
    be passed to the function. To correct this we have removed the default and require the user to pass a CV object to
    the function.

    Example:

    .. code-block:: python

        cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=samples_info_sets, pct_embargo=pct_embargo)
        scores_array = ml_cross_val_score(classifier, X, y, cv_gen, sample_weight_train=sample_train,
                                          sample_weight_score=sample_score, scoring=accuracy_score)

    :param classifier: (ClassifierMixin) A sk-learn Classifier object instance.
    :param X: (pd.DataFrame) The dataset of records to evaluate.
    :param y: (pd.Series) The labels corresponding to the X dataset.
    :param cv_gen: (BaseCrossValidator) Cross Validation generator object instance.
    :param sample_weight_train: (np.array) Sample weights used to train the model for each record in the dataset.
    :param sample_weight_score: (np.array) Sample weights used to evaluate the model quality.
    :param scoring: (Callable) A metric scoring, can be custom sklearn metric.
    :return: (np.array) The computed score, one entry per fold.
    """
    # If no sample_weight then broadcast a value of 1 to all samples (full weight).
    if sample_weight_train is None:
        sample_weight_train = np.ones((X.shape[0],))

    if sample_weight_score is None:
        sample_weight_score = np.ones((X.shape[0],))

    # Score model on KFolds
    ret_scores = []
    for train, test in cv_gen.split(X=X, y=y):
        # Refit on this fold's training slice with the training weights.
        fit = classifier.fit(X=X.iloc[train, :], y=y.iloc[train],
                             sample_weight=sample_weight_train[train])
        if scoring == log_loss:
            # log_loss needs probabilities; negate so that higher is better.
            prob = fit.predict_proba(X.iloc[test, :])
            score = -1 * scoring(y.iloc[test], prob,
                                 sample_weight=sample_weight_score[test],
                                 labels=classifier.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score = scoring(y.iloc[test], pred,
                            sample_weight=sample_weight_score[test])
        ret_scores.append(score)
    return np.array(ret_scores)
def sklearn_evaluator(
    X_test: np.ndarray,
    y_test: np.ndarray,
    model: ClassifierMixin,
) -> float:
    """Calculate accuracy score with classifier.

    The test inputs are flattened to 2-D (one row per sample) before being
    handed to ``model.score``.
    """
    n_samples = X_test.shape[0]
    flat_features = X_test.reshape((n_samples, -1))
    return model.score(flat_features, y_test)
def ml_cross_val_score(
        classifier: ClassifierMixin,
        X: pd.DataFrame,
        y: pd.Series,
        cv_gen: BaseCrossValidator,
        sample_weight: np.ndarray = None,
        scoring: str = 'neg_log_loss'):
    # pylint: disable=invalid-name
    """
    Snippet 7.4, page 110, Using the PurgedKFold Class.

    Function to run a cross-validation evaluation of the using sample weights and a custom CV generator.

    Note: This function is different to the book in that it requires the user to pass through a CV object. The book
    will accept a None value as a default and then resort to using PurgedCV, this also meant that extra arguments had to
    be passed to the function. To correct this we have removed the default and require the user to pass a CV object to
    the function.

    Example:

    .. code-block:: python

        cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=samples_info_sets, pct_embargo=pct_embargo)
        scores_array = ml_cross_val_score(classifier, X, y, cv_gen, sample_weight=None, scoring='neg_log_loss')

    :param classifier: A sk-learn Classifier object instance.
    :param X: The dataset of records to evaluate.
    :param y: The labels corresponding to the X dataset.
    :param cv_gen: Cross Validation generator object instance.
    :param sample_weight: A numpy array of weights for each record in the dataset.
    :param scoring: A metric name to use for scoring; currently supports `neg_log_loss`, `accuracy`, `f1`, `precision`,
        `recall`, and `roc_auc`.
    :return: The computed score as a numpy array, one entry per fold.
    """
    # Define scoring metrics: map the metric name to its sklearn implementation.
    scoring_func_dict = {'neg_log_loss': log_loss,
                         'accuracy': accuracy_score,
                         'f1': f1_score,
                         'precision': precision_score,
                         'recall': recall_score,
                         'roc_auc': roc_auc_score}
    try:
        scoring_func = scoring_func_dict[scoring]
    except KeyError:
        raise ValueError('Wrong scoring method. Select from: neg_log_loss, accuracy, f1, precision, recall, roc_auc')

    # If no sample_weight then broadcast a value of 1 to all samples (full weight).
    if sample_weight is None:
        sample_weight = np.ones((X.shape[0],))

    # Score model on KFolds
    ret_scores = []
    for train, test in cv_gen.split(X=X, y=y):
        # Refit on this fold's training slice, weighting each sample.
        fit = classifier.fit(X=X.iloc[train, :], y=y.iloc[train],
                             sample_weight=sample_weight[train])
        if scoring == 'neg_log_loss':
            # log_loss needs probabilities; negate so that higher is better.
            prob = fit.predict_proba(X.iloc[test, :])
            score = -1 * scoring_func(y.iloc[test], prob,
                                      sample_weight=sample_weight[test],
                                      labels=classifier.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score = scoring_func(y.iloc[test], pred,
                                 sample_weight=sample_weight[test])
        ret_scores.append(score)
    return np.array(ret_scores)
def get_score(dataset: np.array, answers: np.array, parametrs: int,
              model: base.ClassifierMixin, score_func) \
        -> (float, float):
    """Score ``model`` on the ``parametrs`` best features of ``dataset``.

    Selects k=``parametrs`` features with SelectKBest and ``score_func``,
    scores one 75/25 split with binary F1, then runs 5-fold cross-validation
    on a min-max-scaled float32 copy of the data via ``kfold_cv``.

    The return annotation previously said ``(int, int)``; both values are
    floats (an F1 score and a cross-validation score).

    :return: (single-split binary F1 score, 5-fold CV score)
    """
    selecter = feature_selection.SelectKBest(score_func=score_func, k=parametrs)
    selecter.fit(dataset, answers)
    transformed_dataset = selecter.transform(dataset)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        transformed_dataset, answers, test_size=0.25, random_state=0)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    simple_score = metrics.f1_score(y_test, prediction, average='binary')
    # NOTE(review): the 5-tuple variant of get_score scales the *selected*
    # features; here the full dataset is scaled for the k-fold run — confirm
    # whether that difference is intentional.
    buffer_test = preprocessing.minmax_scale(dataset, feature_range=(0, 1), axis=0)
    nptraining = np.array(buffer_test, 'float32')
    nptarget = np.array(answers, 'float32')
    k5_score = kfold_cv(5, nptraining, nptarget, model, True)
    return simple_score, k5_score
def evaluate_on_datasets(predictor: ClassifierMixin, datasets):
    """Predict on each ``(x, y_true)`` dataset and report quadratic-weighted kappa.

    Prints the mean kappa followed by the per-dataset values.

    :param predictor: fitted classifier exposing ``predict``.
    :param datasets: iterable of ``(x, y_true)`` pairs.
    :return: list of prediction arrays, one per dataset.
    """
    y_preds = []
    mean_kappa = []
    # The enumerate index was unused; iterate the pairs directly.
    for x, y_true in datasets:
        y_pred = predictor.predict(x)
        y_preds.append(y_pred)
        kappa_hold = cohen_kappa_score(y_true, y_pred, weights='quadratic')
        mean_kappa.append(kappa_hold)
    print(np.mean(mean_kappa), mean_kappa)
    return y_preds
def _train(train_data: DataFrame, classifier: ClassifierMixin, clusterer: Clustering) -> dict:
    """Fit one classifier per cluster of the training data.

    :param train_data: labelled training frame (must contain a 'label' column).
    :param classifier: template classifier, cloned between clusters.
    :param clusterer: clustering model used to split ``train_data``.
    :return: dict mapping the clusterer model-type key to ``clusterer`` and
        the classifier model-type key to the per-cluster classifier dict.
    """
    models = dict()
    train_data = clusterer.cluster_data(train_data)
    for cluster in range(clusterer.n_clusters):
        cluster_train_df = train_data[cluster]
        if not cluster_train_df.empty:
            cluster_targets_df = DataFrame(cluster_train_df['label'])
            try:
                # drop(columns=...) replaces the deprecated positional-axis
                # form drop('label', 1), which was removed in pandas 2.0.
                classifier.fit(
                    cluster_train_df.drop(columns='label'),
                    cluster_targets_df.values.ravel())
            except (NotImplementedError, KeyError):
                # Some incremental learners only expose partial_fit.
                classifier.partial_fit(
                    cluster_train_df.drop(columns='label').values,
                    cluster_targets_df.values.ravel())
            # (Removed a redundant `except Exception as e: raise e` clause —
            # other exceptions now propagate with their original traceback.)
            models[cluster] = classifier
            try:
                classifier = clone(classifier)
            except TypeError:
                # safe=False copies fitted state, so reset before reuse.
                classifier = clone(classifier, safe=False)
                classifier.reset()
    return {
        ModelType.CLUSTERER.value: clusterer,
        ModelType.CLASSIFIER.value: models
    }
def cross_validation(dataset: np.ndarray, answers: np.ndarray, model: base.ClassifierMixin,
                     cross_validator: model_selection.BaseCrossValidator,
                     save_worst_data: bool) -> float:
    """Average weighted-F1 score of ``model`` over the folds of ``cross_validator``.

    Tracks the weakest fold along the way; when ``save_worst_data`` is set,
    its predictions and true labels are written next to ``RESULT_FILENAME``.
    """
    fold_scores = []
    # (score, predicted, actual) of the weakest fold seen so far.
    worst_score = 1.0
    worst_pred = None
    worst_truth = None
    for fold_train, fold_test in cross_validator.split(dataset, answers):
        model.fit(dataset[fold_train], answers[fold_train])
        fold_pred = model.predict(dataset[fold_test])
        fold_truth = answers[fold_test]
        fold_score = metrics.f1_score(fold_truth, fold_pred, average='weighted')
        if fold_score <= worst_score:
            worst_score = fold_score
            worst_pred = fold_pred
            worst_truth = fold_truth
        fold_scores.append(fold_score)
    if save_worst_data:
        np.savetxt(RESULT_FILENAME + 'predicted.txt', worst_pred)
        np.savetxt(RESULT_FILENAME + 'actual.txt', worst_truth)
    return sum(fold_scores) / len(fold_scores)
def train_and_save(classifier: ClassifierMixin, dataset: str, transforms: List[str],
                   bundled: bool, test_proportion: float = 0.1) -> None:
    """
    Trains on the given dataset and saves model.

    :param classifier: The classifier to train.
    :param dataset: The dataset to train on.
    :param transforms: The transforms to apply to the data.
    :param bundled: Whether to bundle chart classes together.
    :param test_proportion: Proportion (0-1) of the dataset held out for
        testing. (Annotation fixed: the default 0.1 is a float, not an int.)
    :return: None.
    """
    if not make_data(dataset, transforms, bundled):
        # Carry the dataset name so the failure is diagnosable.
        raise FileNotFoundError(f"could not prepare data for dataset '{dataset}'")
    images, labels = np.load(f"{dataset}/X.npy"), np.load(f"{dataset}/Y.npy")
    X_train, X_test, Y_train, Y_test = \
        train_test_split(images, labels, test_size=test_proportion)
    classifier.fit(X_train, Y_train)
    pred = classifier.predict(X_test)
    print(classification_report(Y_test, pred))
    print(pd.DataFrame(confusion_matrix(Y_test, pred)))
    joblib.dump(classifier, f"{dataset}/model.joblib")
def bootstrap_accuracy(
    f: ClassifierMixin,
    X,  # numpy array
    y,  # numpy array
    num_samples: int = 100,
    random_state: "int | None" = None,
) -> List[float]:
    """
    Take the classifier ``f``, and compute it's bootstrapped accuracy
    over the dataset ``X``,``y``.

    Generate ``num_samples`` samples; and seed the resampler with
    ``random_state`` (a fresh random seed when None).
    """
    # BUG FIX: the old default ``random.randint(...)`` in the signature was
    # evaluated once at import time, so every call silently shared the same
    # "random" seed. Draw it here instead, per call.
    if random_state is None:
        random_state = random.randint(0, 2 ** 32 - 1)
    return bootstrap_measure(
        f,
        X,
        y,
        num_samples=num_samples,
        random_state=random_state,
        predict=lambda f, X: f.predict(X),
        measure=accuracy_score,
    )
def bootstrap_accuracy(
    f: ClassifierMixin,
    X,  # numpy array
    y,  # numpy array
    num_samples: int = 100,
    random_state: "int | None" = None,
) -> List[float]:
    """
    Take the classifier ``f``, and compute it's bootstrapped accuracy
    over the dataset ``X``,``y``.

    Generate ``num_samples`` samples; and seed the resampler with
    ``random_state`` (a fresh random seed when None).
    """
    # BUG FIX: the old default ``random.randint(...)`` in the signature was
    # evaluated once at import time, so every call silently shared the same
    # "random" seed. Draw it here instead, per call.
    if random_state is None:
        random_state = random.randint(0, 2**32 - 1)
    dist: List[float] = []
    # Predict once; only the resampling varies across trials.
    y_pred = f.predict(X)  # type:ignore (predict not on ClassifierMixin)
    # do the bootstrap:
    for trial in range(num_samples):
        sample_pred, sample_truth = resample(
            y_pred, y, random_state=trial + random_state)  # type:ignore
        score = accuracy_score(y_true=sample_truth, y_pred=sample_pred)  # type:ignore
        dist.append(score)
    return dist
def __init__(self):
    """Build the ensemble from the project's individual classifier wrappers."""
    ClassifierMixin.__init__(self)
    # Bagging is deliberately left out of the pool (kept commented);
    # confirm the reason before re-enabling it.
    self.clasificadores = [RandomForest(), Boosting(), Gradient(), SVM(), SVM2()]  # ,Bagging()]
def __init__(self):
    """Initialise the sklearn mixin and base classes; no parameters of its own."""
    ClassifierMixin.__init__(self)
    BaseEstimator.__init__(self)
def score(self, X: np.ndarray, y: np.ndarray,
          sample_weight: Optional[np.ndarray] = None) -> float:
    """Return the classifier score on ``(X, y)`` after validating the inputs.

    Delegates to ``ClassifierMixin.score`` (sklearn's default classifier
    metric, mean accuracy), called explicitly as an unbound method so the
    validated ``X``/``y`` are used.
    """
    X, y = self._validate_input(X, y)
    return ClassifierMixin.score(self, X, y, sample_weight)
def __init__(self, base_estimator):
    """Wrap ``base_estimator`` as the underlying classifier.

    :param base_estimator: estimator stored as-is; not cloned or fitted here.
    """
    ClassifierMixin.__init__(self)
    BaseEstimator.__init__(self)
    self.base_estimator = base_estimator
def __init__(self, penalty="l1"):
    """Initialise a liblinear logistic regression with the given penalty.

    :param penalty: regularisation penalty forwarded to LogisticRegression
        (default "l1"; liblinear supports "l1" and "l2").

    NOTE(review): the estimator is constructed eagerly here; sklearn
    convention is to defer object creation to ``fit`` so ``set_params``
    on ``penalty`` also updates the estimator — confirm whether callers
    rely on the current behaviour.
    """
    BaseEstimator.__init__(self)
    ClassifierMixin.__init__(self)
    self.penalty = penalty
    self.estimator = LogisticRegression(penalty=self.penalty, solver="liblinear")