Example #1
def train_model(model: ClassifierMixin, data_time_range: List[str],
                output_path: str):
    """Fetch data from Elasticsearch, train the classifier and save it."""
    es_conn = ESConnection(es_host='http://localhost:9200')

    dataset = ml_utils.get_data(start_time=data_time_range[0],
                                end_time=data_time_range[1],
                                es_host=es_conn)
    # Cache the fetched dataset on disk, then reload it for training.
    dataset.to_pickle('data/dataset.pkl')
    dataset = pd.read_pickle('data/dataset.pkl')
    print(f'{len(dataset.columns)} columns in dataset')

    y = dataset['target']
    X = dataset.drop(columns=['target'])

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=17)
    print('Training model')
    model = model.fit(X_train, y_train)
    print('Finished training')
    prediction = model.predict(X_test)
    print(confusion_matrix(y_test, prediction))

    dump(model, output_path + '/' + type(model).__name__ + '.joblib')
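A minimal invocation sketch, assuming the project-specific ESConnection and ml_utils helpers are importable and a data/ directory exists; the classifier choice, time range and output path are placeholders:

from sklearn.ensemble import RandomForestClassifier

# Hypothetical call; the exact time-range format depends on ml_utils.get_data.
train_model(model=RandomForestClassifier(n_estimators=100),
            data_time_range=['2021-01-01', '2021-02-01'],
            output_path='models')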
Example #2
def get_preds_probas(est: ClassifierMixin, X_test: DataFrame, y_test: Series,
                     mapper_dict: Dict) -> DataFrame:
    """
    Get prediction probabilities (if available) or return true and predicted
    labels
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)

        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(len(np.unique(y_test)))
        ]

        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)

        # Compare .predict_proba() and manually extracted prediction
        # probability
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).all()
    else:
        df_summ = df_preds.copy()
    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
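A usage sketch, assuming the snippet's own imports (DataFrame, Series and concat from pandas, numpy as np) and that est is a Pipeline whose classifier step is named "clf", as the named_steps lookup above requires; the iris data and LogisticRegression are illustrative:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X, y = load_iris(return_X_y=True, as_frame=True)
est = Pipeline([("clf", LogisticRegression(max_iter=1000))]).fit(X, y)
# mapper_dict must map each class label to its column index in predict_proba.
mapper = {label: i for i, label in enumerate(est.named_steps["clf"].classes_)}
summary = get_preds_probas(est, X, y, mapper)
print(summary.head())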
Example #3
def get_score(dataset: np.ndarray, answers: np.ndarray, n_features: int,
              model: base.ClassifierMixin, score_func) \
        -> Tuple[float, float, float, float, float]:
    selector = feature_selection.SelectKBest(score_func=score_func,
                                             k=n_features)
    selector.fit(dataset, answers)
    transformed_dataset = selector.transform(dataset)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        transformed_dataset, answers, random_state=0, stratify=answers)

    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    simple_score = metrics.f1_score(y_test, prediction, average='weighted')

    buffer_test = preprocessing.minmax_scale(transformed_dataset,
                                             feature_range=(0, 1),
                                             axis=0)
    nptraining = np.array(buffer_test, 'float32')
    nptarget = np.array(answers, 'float32')
    print('simple_score is done')
    k5_score = kfold_cv(5, nptraining, nptarget, model)
    print('k5_score is done')
    k10_score = kfold_cv(10, nptraining, nptarget, model)
    print('k10_score is done')
    k20_score = kfold_cv(20, nptraining, nptarget, model)
    print('k20_score is done')
    random_score = random_sampling_cv(nptraining, nptarget, model)
    return simple_score, k5_score, k10_score, k20_score, random_score
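A call sketch; kfold_cv is assumed to be a helper defined in the same module (random_sampling_cv is shown as Example #4), and the digits data with the f_classif scorer is illustrative:

from sklearn import feature_selection
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

X, y = load_digits(return_X_y=True)
scores = get_score(X, y, n_features=20,
                   model=RandomForestClassifier(random_state=0),
                   score_func=feature_selection.f_classif)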
Example #4
def random_sampling_cv(dataset: np.ndarray, answers: np.ndarray,
                       model: base.ClassifierMixin) -> float:
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        dataset, answers, shuffle=True, stratify=answers)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)

    f1_score = metrics.f1_score(y_test, prediction, average='weighted')
    return f1_score
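A usage sketch, assuming the snippet's sklearn imports; because train_test_split above is not seeded, every call draws a fresh stratified split, so repeated calls estimate the spread of the score (the synthetic data is illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_informative=5, random_state=0)
scores = [random_sampling_cv(X, y, RandomForestClassifier(random_state=0))
          for _ in range(10)]
print(np.mean(scores), np.std(scores))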
Example #5
def evaluate_on_datasets(predictor: ClassifierMixin, datasets):
    y_preds = []
    kappas = []
    for x, y_true in datasets:
        y_pred = predictor.predict(x)
        y_preds.append(y_pred)

        # Quadratic-weighted kappa is appropriate for ordinal targets.
        kappas.append(cohen_kappa_score(y_true, y_pred, weights='quadratic'))

    print(np.mean(kappas), kappas)
    return y_preds
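A sketch of feeding two hold-out sets to the function, assuming cohen_kappa_score and numpy are imported as in the snippet; the data, split and classifier are placeholders:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=600, n_classes=3, n_informative=6,
                           random_state=1)
X_tr, X_hold, y_tr, y_hold = train_test_split(X, y, test_size=0.5,
                                              random_state=1)
clf = RandomForestClassifier(random_state=1).fit(X_tr, y_tr)
preds = evaluate_on_datasets(clf, [(X_hold[:150], y_hold[:150]),
                                   (X_hold[150:], y_hold[150:])])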
Example #6
def get_score(dataset: np.ndarray, answers: np.ndarray, n_features: int,
              model: base.ClassifierMixin, score_func) \
        -> Tuple[float, float]:
    selector = feature_selection.SelectKBest(score_func=score_func,
                                             k=n_features)
    selector.fit(dataset, answers)
    transformed_dataset = selector.transform(dataset)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        transformed_dataset, answers, test_size=0.25, random_state=0)

    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    simple_score = metrics.f1_score(y_test, prediction, average='binary')

    # Note: unlike Example #3, the scaling below is applied to the full,
    # untransformed dataset.
    buffer_test = preprocessing.minmax_scale(dataset,
                                             feature_range=(0, 1),
                                             axis=0)
    nptraining = np.array(buffer_test, 'float32')
    nptarget = np.array(answers, 'float32')
    k5_score = kfold_cv(5, nptraining, nptarget, model, True)
    return simple_score, k5_score
Example #7
def bootstrap_accuracy(
    f: ClassifierMixin,
    X,  # numpy array
    y,  # numpy array
    num_samples: int = 100,
    random_state: int = random.randint(0, 2 ** 32 - 1),  # drawn once, at definition time
) -> List[float]:
    """
    Take the classifier ``f``, and compute it's bootstrapped accuracy over the dataset ``X``,``y``.
    Generate ``num_samples`` samples; and seed the resampler with ``random_state``.
    """
    return bootstrap_measure(
        f,
        X,
        y,
        num_samples=num_samples,
        random_state=random_state,
        predict=lambda f, X: f.predict(X),
        measure=accuracy_score,
    )
Example #8
def bootstrap_accuracy(
        f: ClassifierMixin,
        X,  # numpy array
        y,  # numpy array
        num_samples: int = 100,
        random_state: int = random.randint(0, 2**32 - 1),
) -> List[float]:
    """
    Take the classifier ``f``, and compute it's bootstrapped accuracy over the dataset ``X``,``y``.
    Generate ``num_samples`` samples; and seed the resampler with ``random_state``.
    """
    dist: List[float] = []
    y_pred = f.predict(X)  # type:ignore (predict not on ClassifierMixin)
    # do the bootstrap:
    for trial in range(num_samples):
        sample_pred, sample_truth = resample(y_pred,
                                             y,
                                             random_state=trial +
                                             random_state)  # type:ignore
        score = accuracy_score(y_true=sample_truth,
                               y_pred=sample_pred)  # type:ignore
        dist.append(score)
    return dist
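A usage sketch, assuming the snippet's own imports (random, List from typing, resample from sklearn.utils, accuracy_score from sklearn.metrics); the breast-cancer data is illustrative, and the interval below is an in-sample estimate:

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

X, y = load_breast_cancer(return_X_y=True)
clf = LogisticRegression(max_iter=5000).fit(X, y)
dist = bootstrap_accuracy(clf, X, y, num_samples=200, random_state=42)
# Rough 95% bootstrap interval for the accuracy
print(np.mean(dist), np.percentile(dist, [2.5, 97.5]))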
Example #9
def cross_validation(dataset: np.ndarray, answers: np.ndarray,
                     model: base.ClassifierMixin,
                     cross_validator: model_selection.BaseCrossValidator,
                     save_worst_data: bool) -> float:
    iteration_counter: int = 0
    f1_score_value = 0
    worst_f1_score_value = 1.0
    worst_predicted = None
    worst_actual = None

    for train_index, test_index in cross_validator.split(dataset, answers):
        train_x, test_x = dataset[train_index], dataset[test_index]
        train_y, test_y = answers[train_index], answers[test_index]
        iteration_counter += 1

        # Train
        model.fit(train_x, train_y)

        # Test
        predicted = model.predict(test_x)

        # Evaluate
        f1_iteration_score_value = metrics.f1_score(test_y,
                                                    predicted,
                                                    average='weighted')
        if f1_iteration_score_value <= worst_f1_score_value:
            worst_f1_score_value = f1_iteration_score_value
            worst_predicted = predicted
            worst_actual = test_y

        f1_score_value += f1_iteration_score_value

    if save_worst_data:
        np.savetxt(RESULT_FILENAME + 'predicted.txt', worst_predicted)
        np.savetxt(RESULT_FILENAME + 'actual.txt', worst_actual)

    return f1_score_value / iteration_counter
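A call sketch; StratifiedKFold is one concrete BaseCrossValidator, and passing save_worst_data=False sidesteps the module-level RESULT_FILENAME constant the function otherwise writes to:

from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

X, y = load_digits(return_X_y=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
mean_f1 = cross_validation(X, y, model=RandomForestClassifier(random_state=0),
                           cross_validator=skf, save_worst_data=False)
print(mean_f1)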
Example #10
def train_and_save(classifier: ClassifierMixin,
                   dataset: str,
                   transforms: List[str],
                   bundled: bool,
                   test_proportion: float = 0.1) -> None:
    """
    Trains on the given dataset and saves model.
    :param classifier: The classifier to train.
    :param dataset: The dataset to train on.
    :param transforms: The transforms to apply to the data.
    :param bundled: Whether to bundle chart classes together.
    :param test_proportion: The proportion of the dataset to use for testing.
    :return: None.
    """
    if not make_data(dataset, transforms, bundled):
        raise FileNotFoundError(f"could not build data for dataset '{dataset}'")
    images, labels = np.load(f"{dataset}/X.npy"), np.load(f"{dataset}/Y.npy")
    X_train, X_test, Y_train, Y_test = \
        train_test_split(images, labels, test_size=test_proportion)
    classifier.fit(X_train, Y_train)
    pred = classifier.predict(X_test)
    print(classification_report(Y_test, pred))
    print(pd.DataFrame(confusion_matrix(Y_test, pred)))
    joblib.dump(classifier, f"{dataset}/model.joblib")
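An invocation sketch, assuming the project-specific make_data helper and a dataset directory that will contain X.npy and Y.npy; the dataset name and transform list are hypothetical placeholders:

from sklearn.svm import SVC

# Hypothetical arguments; valid values depend on make_data.
train_and_save(SVC(kernel="rbf", gamma="scale"),
               dataset="charts",
               transforms=["grayscale"],
               bundled=False,
               test_proportion=0.2)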
Example #11
def plot_decision_boundary(
        X: pd.DataFrame,
        y: pd.Series,
        clf: ClassifierMixin = sklearn.linear_model.LogisticRegression(),
        title: str = "Decision Boundary Logistic Regression",
        legend_title: str = "Legend",
        h: float = 0.05,
        figsize: tuple = (11.7, 8.27),
):
    """Generate a simple plot of the decision boundary of a classifier.
    
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples)
        Target relative to X for classification. Datatype should be integers.
    clf : scikit-learn algorithm
        An object that has the `predict` and `predict_proba` methods
    h : float (default: 0.05)
        Step size in the mesh
    title : string
        Title for the plot.
    legend_title : string
        Legend title for the plot.
    figsize: tuple (default: (11.7, 8.27))
        Width and height of the figure in inches
    
    Returns
    -------
    boundaries: Figure
        Properties of the figure can be changed later, e.g. use `boundaries.axes[0].set_ylim(0,100)` to change ylim
    ax: Axes
        The axes associated with the boundaries Figure.
    
    Examples
    --------
    >>> import seaborn as sns
    >>> from sklearn.svm import SVC
    >>> data = sns.load_dataset("iris")
    >>> # string targets are converted to numeric category codes inside the function
    >>> y = data["species"]
    >>> X = data[["sepal_length", "sepal_width"]]
    >>> clf = SVC(kernel="rbf", gamma=2, C=1, probability=True)
    >>> _ = plot_decision_boundary(X=X, y=y, clf=clf, title = 'Decision Boundary', legend_title = "Species")

    """

    if X.shape[1] != 2:
        raise ValueError("X must contain exactly two features.")

    if not (pd.api.types.is_integer_dtype(y) or pd.api.types.is_object_dtype(y)
            or pd.api.types.is_categorical_dtype(y)):
        raise TypeError(
            "The target variable y can only have the following dtype: [int, object, category]."
        )

    label_0 = X.columns.tolist()[0]
    label_1 = X.columns.tolist()[1]

    X = X.copy()
    y = y.copy()

    X = X.values
    y = y.astype("category").cat.codes.values

    # Alternative: a husl palette sized to the number of classes:
    # full_col_list = list(sns.color_palette("husl", len(np.unique(y))))
    full_col_list = list(sns.color_palette())

    if len(np.unique(y)) > len(full_col_list):
        raise ValueError(
            "More labels in the data than colors in the color list. Either "
            "reduce the number of labels or extend the color list.")

    sub_col_list = full_col_list[0:len(np.unique(y))]
    cmap_bold = ListedColormap(sub_col_list)

    # Try to include a mapping in a later release (+ show categorical labels in the legend)

    _ = clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    Z_proba = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z_max = Z_proba.max(axis=1)  # Take the class with highest probability
    Z_max = Z_max.reshape(xx.shape)

    # Put the result into a color plot
    boundaries, ax = plt.subplots(figsize=figsize)
    _ = ax.contour(xx, yy, Z, cmap=cmap_bold)
    _ = ax.scatter(xx,
                   yy,
                   s=(Z_max**2 / h),
                   c=Z,
                   cmap=cmap_bold,
                   alpha=1,
                   edgecolors="none")

    # Plot also the training points
    training = ax.scatter(X[:, 0],
                          X[:, 1],
                          c=y,
                          cmap=cmap_bold,
                          edgecolors="black")
    _ = plt.xlim(xx.min(), xx.max())
    _ = plt.ylim(yy.min(), yy.max())
    _ = plt.title(title)
    _ = plt.subplots_adjust(right=0.8)
    _ = plt.xlabel(label_0)
    _ = plt.ylabel(label_1)

    # Add legend colors
    leg1 = plt.legend(
        *training.legend_elements(),
        frameon=False,
        fontsize=12,
        borderaxespad=0,
        bbox_to_anchor=(1, 0.5),
        handlelength=2,
        handletextpad=1,
        title=legend_title,
    )

    # Add legend sizes
    l1 = plt.scatter([], [], c="black", s=0.4**2 / h, edgecolors="none")
    l2 = plt.scatter([], [], c="black", s=0.6**2 / h, edgecolors="none")
    l3 = plt.scatter([], [], c="black", s=0.8**2 / h, edgecolors="none")
    l4 = plt.scatter([], [], c="black", s=1**2 / h, edgecolors="none")

    labels = ["0.4", "0.6", "0.8", "1"]
    _ = plt.legend(
        [l1, l2, l3, l4],
        labels,
        frameon=False,
        fontsize=12,
        borderaxespad=0,
        bbox_to_anchor=(1, 1),
        handlelength=2,
        handletextpad=1,
        title="Probabilities",
        scatterpoints=1,
    )
    _ = plt.gca().add_artist(leg1)

    return boundaries, ax