Пример #1
0
def find_best_treshold(y_true,
                       y_pred,
                       metrics=None,
                       classes=None,
                       scale=100,
                       verbose=False):
    """Grid-search, for each output column, the threshold maximizing a metric.

    For every column ``j`` the candidate thresholds ``t / scale`` for
    ``t = 0 .. scale - 1`` are tried. Predictions are binarized with a strict
    ``y_pred > threshold`` comparison (as ``uint8``) and scored with
    ``metric(y_true_column, binarized_column)``. The highest score wins; ties
    keep the lowest threshold. If no score exceeds 0 the result for that
    column stays at threshold 0 and score 0.

    Args:
        y_true: Ground-truth values exposing ``.shape``; 1-D input is treated
            as a single column.
        y_pred: Predicted scores with the same shape as ``y_true``.
        metrics: A callable ``metric(y_true, y_pred_binary) -> score`` or a
            list/tuple of such callables.
        classes: Optional names, one per column. Defaults to the column
            indices ``0 .. n_columns - 1``.
        scale: Number of thresholds tried, evenly spaced in ``[0, 1)``.
        verbose: When true, print the best score and threshold found for each
            class and metric.

    Returns:
        dict mapping each class name to
        ``{"best_score": ..., "best_threshold": ...}``.

    Raises:
        TypeError: If ``metrics`` is None.
        AssertionError: If the shapes of ``y_true``/``y_pred`` differ or the
            number of classes does not match the number of columns.

    Note:
        When several metrics are given, each class entry is overwritten by
        every metric in turn, so the returned dict only holds the result of
        the last metric.
    """
    if metrics is None:
        raise TypeError("metrics must be a callable or a list of callables")
    if not isinstance(metrics, (list, tuple)):
        metrics = [metrics]

    # Decide once, before conversion: the truth value of a multi-element
    # array is ambiguous and would raise ValueError.
    has_classes = classes is not None and len(classes) > 0
    if has_classes:
        # to_array / to_ndarray are helpers defined elsewhere in the project;
        # presumably they convert their input to numpy arrays.
        classes = to_array(classes)

    if len(y_true.shape) == 1:
        y_true = to_array(y_true).reshape(-1, 1)
    else:
        y_true = to_ndarray(y_true)
    if len(y_pred.shape) == 1:
        y_pred = to_array(y_pred).reshape(-1, 1)
    else:
        y_pred = to_ndarray(y_pred)

    assert y_true.shape == y_pred.shape, f"y_true and y_pred must have the same dimension, currently: mismatch {y_true.shape} {y_pred.shape}"
    if has_classes:
        assert classes.shape[0] == y_pred.shape[
            1], f"classes and y_true/y_pred columns must have same dimensionality: mismatch classes {classes.shape[0]}, y_pred {y_pred.shape[1]}"
    else:
        classes = np.arange(y_pred.shape[1])

    def search(true_column, pred_column, n_steps, metric_function):
        """Return (best_threshold, best_score) for one column of predictions."""
        best_score = 0
        best_threshold = 0
        for t in range(n_steps):
            threshold = float(t / n_steps)
            binarized = np.array(pred_column > threshold, dtype=np.uint8)
            score = metric_function(true_column, binarized)
            if score > best_score:
                best_threshold = threshold
                best_score = score
        return best_threshold, best_score

    threhsolds_dict = {}
    for j in range(y_true.shape[1]):
        for metric in metrics:
            best_threshold, best_score = search(y_true[:, j], y_pred[:, j],
                                                scale, metric)
            if verbose:
                print(
                    f"Class {classes[j]} : best score = {best_score} with threshold = {best_threshold}"
                )
            threhsolds_dict[classes[j]] = {
                "best_score": best_score,
                "best_threshold": best_threshold
            }
    return threhsolds_dict
Пример #2
0
def plot_grouped_bar(data,
                     labels,
                     series_names,
                     width=0.35,
                     y_label=None,
                     title=None):
    """Draw a grouped bar chart: one group per label, one bar per series.

    Args:
        data: 2-D values of shape ``(n_series, n_labels)``; a list, a numpy
            array or a pandas DataFrame.
        labels: One tick label per column of ``data``.
        series_names: One legend entry per row of ``data``.
        width: Total width of one group of bars, in x-axis units. It is split
            evenly between the series.
        y_label: Optional y-axis label.
        title: Optional figure title.

    Raises:
        AssertionError: If ``data`` is not 2-D or its dimensions do not match
            ``labels`` / ``series_names``.
    """
    if isinstance(data, list):
        data = np.array(data)
    if isinstance(data, pd.DataFrame):
        data = data.values
    # to_array is a helper defined elsewhere; presumably returns a numpy array.
    labels = to_array(labels)
    series_names = to_array(series_names)
    assert len(data.shape) == 2, "Input data must be a 2-dimensional array"
    assert data.shape[1] == labels.shape[
        0], "Labels and data columns must have same dimensionality"
    assert data.shape[0] == series_names.shape[
        0], "Series_names and data rows must have same dimensionality"

    n_series = data.shape[0]
    x = np.arange(labels.shape[0])  # the label locations
    # Each bar gets an equal share of the group width. Dividing by the number
    # of series (not the number of labels) keeps groups from overlapping.
    bar_width = width / n_series

    fig, ax = plt.subplots()
    for i in range(n_series):
        # Centre the whole group on its tick: offsets are symmetric around 0.
        offset = (i - (n_series - 1) / 2) * bar_width
        ax.bar(x + offset, data[i, :], bar_width, label=series_names[i])

    if y_label:
        ax.set_ylabel(y_label)
    if title:
        ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()

    fig.tight_layout()

    plt.show()
Пример #3
0
def to_classes(x,
               n_classes=2,
               min_value=None,
               max_value=None,
               agg_function=None,
               verbose=False):
    """Discretize values into equal-width bins, replacing each by a bin aggregate.

    The range ``[min_value, max_value]`` is split into ``n_classes`` intervals
    of equal width. Every element of ``x`` falling in an interval is replaced
    by ``agg_function`` applied to all elements of that interval. Intervals
    are closed on the left and open on the right, except the last one which
    also includes ``max_value``. Elements outside ``[min_value, max_value]``
    are left at 0 in the output.

    Args:
        x: Values to discretize (converted with the project helper
            ``to_array``; treated as 1-D).
        n_classes: Number of intervals.
        min_value: Lower bound of the range; defaults to ``np.min(x)``.
            An explicit 0 is honored.
        max_value: Upper bound of the range; defaults to ``np.max(x)``.
            An explicit 0 is honored.
        agg_function: Callable reducing the members of one interval to a
            single value; defaults to ``np.mean``.
        verbose: When true, print the range and every interval's bounds.

    Returns:
        Float numpy array of length ``x.shape[0]`` holding the per-interval
        aggregate for each element.
    """
    x = to_array(x)
    n_classes = int(n_classes)
    # Compare against None explicitly: 0 is a legitimate bound.
    if min_value is None:
        min_value = np.min(x)
    if max_value is None:
        max_value = np.max(x)
    if verbose:
        print(f"Value min: {min_value}")
        print(f"Value max: {max_value}")
    if agg_function is None:
        agg_function = np.mean

    step = (max_value - min_value) / n_classes
    classes_array = np.zeros((x.shape[0], ))
    for j in range(n_classes):
        if verbose:
            print(f"Interval lower bound: {min_value + j * step}")
            print(f"Interval upper bound: {min_value + (j + 1) * step}")
        # Distance of each element from the lower bound of interval j.
        offset = x - min_value - j * step
        if j < n_classes - 1:
            in_interval = (0 <= offset) & (offset < step)
        else:
            # Last interval: include max_value itself. n_classes * step may
            # differ from (max_value - min_value) by float rounding, so the
            # upper bound is tested against max_value directly.
            in_interval = (0 <= offset) & (x <= max_value)
        idx = np.where(in_interval)[0]
        # Skip empty intervals: aggregating nothing yields NaN and a warning,
        # and there is nothing to assign anyway.
        if idx.size:
            classes_array[idx] = agg_function(x[idx])
    return classes_array
Пример #4
0
def plot_histo(x, bins=100, title=None):
    """Show a histogram of ``x``.

    Args:
        x: Values to plot (converted with the project helper ``to_array``).
        bins: Number of bins (or any bin specification accepted by
            ``Axes.hist``).
        title: Optional figure title.
    """
    x = to_array(x)

    _, ax = plt.subplots()
    if title:
        ax.set_title(title)
    # Draw on the axes just created and honor the ``bins`` argument.
    ax.hist(x, bins=bins)
    plt.show()
Пример #5
0
def plot_multiscatter(df: pd.DataFrame,
                      features_x,
                      features_y,
                      within_x=False,
                      within_y=False):
    """Scatter-plot every column of ``features_x`` against every column of ``features_y``.

    Args:
        df: DataFrame holding the columns to plot.
        features_x: Column name(s) used on the x axis.
        features_y: Column name(s) used on the y axis.
        within_x: When true, additionally plot all pairs of ``features_x``
            against each other.
        within_y: When true, additionally plot all pairs of ``features_y``
            against each other.
    """
    # to_array and plot_scatter are helpers defined elsewhere in the project.
    features_x = to_array(features_x)
    features_y = to_array(features_y)

    column_pairs = itertools.product(features_x, features_y)
    for col_x, col_y in column_pairs:
        plot_scatter(x=df[col_x], y=df[col_y])

    # The nested calls leave within_x / within_y at their False defaults, so
    # the recursion stops after one level.
    for requested, features in ((within_x, features_x),
                                (within_y, features_y)):
        if requested:
            plot_multiscatter(df, features, features)
Пример #6
0
def plot_densities(data, labels=None):
    """Plot the distribution of every column of ``data`` on one figure.

    Args:
        data: 2-D values, one distribution per column (converted with the
            project helper ``to_ndarray``).
        labels: Optional legend entries, one per column. When omitted the
            curves are drawn without labels and no legend is shown.

    Raises:
        AssertionError: If the number of labels differs from the number of
            columns.
    """
    data = to_ndarray(data)
    # Decide once, before conversion: truth-testing a multi-element array
    # raises ValueError.
    has_labels = labels is not None and len(labels) > 0
    if has_labels:
        labels = to_array(labels)
        assert data.shape[1] == labels.shape[
            0], "Data columsn and labels must have same dimensionality"

    for j in range(data.shape[1]):
        # Without labels, indexing None would raise; pass no label instead.
        label = labels[j] if has_labels else None
        sns.distplot(data[:, j], label=label)
    if has_labels:
        plt.legend()
    plt.show()
Пример #7
0
def plot_multiline(df: pd.DataFrame, x_column, labels=None, scale=False):
    """Plot every column of ``df`` as a line over a shared x axis.

    Args:
        df: DataFrame whose columns are plotted; a Series is treated as a
            single-column frame.
        x_column: Either the name of the entry of ``df`` holding the x values,
            or the x values themselves.
        labels: Optional legend entries, one per column of ``df``. Defaults
            to the column names.
        scale: When true, min-max scale each column to ``[0, 1]`` before
            plotting.

    Raises:
        AssertionError: If the number of labels differs from the number of
            columns.
    """
    if isinstance(x_column, str):
        x_points = df[x_column]
    else:
        x_points = to_array(x_column)

    # Normalize a Series first so the shape check below always sees 2-D data.
    if isinstance(df, pd.Series):
        df = pd.DataFrame({'_': df.values})

    # Decide once, before conversion: truth-testing a multi-element array
    # raises ValueError.
    has_labels = labels is not None and len(labels) > 0
    if has_labels:
        labels = to_array(labels)
        assert df.shape[1] == labels.shape[
            0], "Data columns and labels must have same dimensionality"

    for i, col in enumerate(df.columns):
        y = df[col]
        if scale:
            y = (y - y.min()) / (y.max() - y.min())
        # Label each line so the legend below has something to show.
        label = labels[i] if has_labels else str(col)
        sns.lineplot(x=x_points, y=y, label=label)
    plt.legend()
    plt.show()
Пример #8
0
def plot_mirrorline(df, x_column, scale=False):
    """Plot two columns mirrored around the x axis.

    The first column of ``df`` is drawn above the axis and the second one
    below it (negated). Each column is first shifted up by its minimum if
    that minimum is negative, so both lines start from non-negative values.
    The caller's DataFrame is not modified.

    Args:
        df: DataFrame with exactly two columns.
        x_column: Either the name of the column of ``df`` holding the x
            values, or the x values themselves.
        scale: When true, min-max scale each column to ``[0, 1]`` first.

    Raises:
        AssertionError: If ``df`` does not have exactly two columns.
    """
    if isinstance(x_column, str):
        x_points = df[x_column]
    else:
        x_points = to_array(x_column)
    assert df.shape[1] == 2, "Dataframe must have two columns"
    upper_name, lower_name = df.columns[0], df.columns[1]

    # Work on local Series: writing the transformed values back into ``df``
    # would corrupt the caller's data (and flip the sign again on a re-plot).
    upper = df[upper_name]
    lower = df[lower_name]
    if scale:
        upper = (upper - upper.min()) / (upper.max() - upper.min())
        lower = (lower - lower.min()) / (lower.max() - lower.min())

    # Ensure both lines are non-negative, then mirror the second one.
    upper = upper - min(upper.min(), 0)
    lower = -(lower - min(lower.min(), 0))

    sns.lineplot(y=upper, x=x_points, label=str(upper_name))
    sns.lineplot(y=lower, x=x_points, label=str(lower_name))

    plt.legend()
    plt.show()