Пример #1
0
def graph_histogram_classification(dataset, round_id, y, part='eval'):
    """
    generate the histogram of predictions

    One density curve per class name is drawn on a single figure. The figure
    is produced twice (dark theme, then light theme) and each version is
    handed to ``__save_fig``.

    :param dataset: dataset object (``dataset_id`` and ``y_class_names`` are read)
    :param round_id: id of the round (model)
    :param y: prediction values, indexed as ``y[:, i]`` for the i-th class
    :param part: set (eval / train set)
    :return: None
    """
    try:
        for dark, theme in [(True, 'dark_background'),
                            (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                plt.figure(figsize=(6, 6))
                for i, name in enumerate(dataset.y_class_names):
                    sns.distplot(y[:, i], hist=False, label=name)
                plt.title('histogram of probabilities (%s set)' % part)
                plt.xlabel('values')
                plt.ylabel('frequencies')
                plt.legend()
                __save_fig(dataset.dataset_id, 'hist_%s_%s' % (part, round_id),
                           dark)
    except Exception:
        # Graph generation is best-effort: a failure is logged, not raised.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        log.error(
            'error in graph_histogram_classification with dataset_id %s' %
            dataset.dataset_id)
Пример #2
0
 def _update_plot(self, axis, view):
     """
     Draw the distribution of the view's first dimension on ``axis``.

     The keyword arguments come from the style entry selected by
     ``self.cyclic_index``. The view's label is added when the plot is
     overlaid, and ``vertical=True`` is added when ``self.invert_axes`` is set.
     """
     # Work on a copy: writing 'label' / 'vertical' into the style entry
     # itself would leak those keys into later calls that share the entry.
     kwargs = dict(self.style[self.cyclic_index])
     label = view.label if self.overlaid >= 1 else ''
     if label:
         kwargs['label'] = label
     if self.invert_axes:
         kwargs['vertical'] = True
     sns.distplot(view.dimension_values(0), ax=axis, **kwargs)
Пример #3
0
def plot_angle(data,
               N=50,
               title=None,
               ax1=None,
               ax2=None,
               color=None,
               wrap=True):
    """
    Plot an angle distribution as a polar bar chart and a histogram / KDE.

    Parameters
    ----------
    data: array-like of angles; every element is passed through
        ``wrapAngle`` (wrap=True) or ``constrainAngle`` (wrap=False)
    N: int, optional (default: 50)
        Number of histogram bins
    title: str, optional (default: None)
        Figure title, set with ``plt.suptitle``
    ax1: matplotlib axis, optional
        Polar axis for the bar chart
    ax2: matplotlib axis, optional
        Axis for the histogram / KDE plot
    color: optional (default: None)
        Color passed to both plots
    wrap: bool, optional (default: True)
        True: x axis limited to -180..180, False: x axis limited to 0..360

    Both axes are created here unless BOTH ``ax1`` and ``ax2`` are given.

    Returns
    -------
    f: the current matplotlib figure
    (ax1, ax2): tuple of the polar axis and the density axis
    """
    if ax1 is None or ax2 is None:
        gs = gridspec.GridSpec(2, 6)
        ax1 = plt.subplot(gs[:1, :2], polar=True)
        ax2 = plt.subplot(gs[:1, 2:])

    vf = np.vectorize(wrapAngle if wrap else constrainAngle)
    x = vf(data)

    sns.distplot(x, bins=N, ax=ax2, color=color, kde=True)
    # ``normed`` was removed from np.histogram; ``density`` is the supported
    # keyword and gives the same result for equal-width bins.
    radii, edges = np.histogram(x, bins=N, density=True)
    ax1.set_yticklabels([])

    if wrap:
        ax1ticks = [0, 45, 90, 135, 180, -135, -90, -45]
        ax2ticks = list(range(-180, 180 + 45, 45))
        ax1.set_xticklabels(['{}°'.format(t) for t in ax1ticks])
        ax2.set_xlim(-180, 180)
    else:
        ax2ticks = list(range(0, 360 + 45, 45))
        ax2.set_xlim(0, 360)
    ax2.set_xticks(ax2ticks)
    ax2.set_xticklabels(['{}°'.format(t) for t in ax2ticks])

    ax2.set_yticks([])
    ax2.set(xlabel='Angle', ylabel='Density')

    sns.despine(ax=ax2)

    # Draw each bar over its own bin: start at the left edge and use the real
    # bin width. Centring bars on the right edges with a fixed 2*pi/N width
    # shifted them by half a bin and was only right for data spanning 360°.
    ax1.bar(np.deg2rad(edges[:-1]),
            radii,
            width=np.deg2rad(np.diff(edges)),
            align='edge',
            color=color,
            alpha=.5)

    if title is not None:
        plt.suptitle(title)

    plt.tight_layout()

    f = plt.gcf()
    return f, (ax1, ax2)
Пример #4
0
def graph_histogram(dataset_id, col, is_categorical, values, part='train'):
    """
    generate the histogram of column col of the dataset

    The figure is produced twice (dark theme, then light theme) and each
    version is handed to ``__save_fig``. Categorical values are label-encoded
    and the x ticks show the original category names.

    :param dataset_id: dataset id
    :param col: column name
    :param is_categorical: is the column categorical
    :param values: values of the column
    :param part: set (train, test)
    :return: None
    """
    try:
        if is_categorical:
            # Encode once, before the theme loop. Re-encoding inside the loop
            # fed the already-encoded integers to a fresh encoder on the
            # second pass, so the light-theme figure showed integer codes
            # instead of the category names.
            df = pd.DataFrame(values)
            df.columns = ['y']
            encoder = LabelEncoder()
            codes = encoder.fit_transform(df['y'])
            ticks = list(range(max(codes) + 1))
            x_labels = encoder.inverse_transform(ticks)
        for dark, theme in [(True, 'dark_background'),
                            (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                plt.figure(figsize=(7, 7))
                if is_categorical:
                    sns.distplot(codes, kde=False)
                    plt.xticks(ticks, x_labels, rotation=90)
                else:
                    sns.distplot(values)
                plt.title('distribution of %s (%s set)' % (col, part))
                plt.xlabel('values')
                plt.ylabel('frequencies')
                __save_fig(dataset_id, '_hist_%s_%s' % (part, col), dark)
    except Exception:
        # Best-effort plotting: log the failure, but do not swallow
        # KeyboardInterrupt / SystemExit as a bare ``except`` would.
        log.error('error in graph_histogram with dataset_id %s' % dataset_id)
Пример #5
0
    def distplot(self, x=None, data=None, *args, **kwargs):
        """
        Flexibly plot a univariate distribution of observations

        One subplot is drawn per entry of ``x``, stacked vertically. The
        current figure is closed first and the new figure is shown with
        ``plt.show()``.

        Parameters
        ----------
        x : str or list of str, input variables; these should be column
            names in data

        data : pandas dataframe

        *args : extra positional arguments forwarded to seaborn.distplot
            after the data

        **kwargs : other arguments in seaborn.distplot

            bins : argument for matplotlib hist(), or None, optional

            hist : bool, optional whether to plot a (normed) histogram

            kde : bool, optional, whether to plot a gaussian kernel \
                density estimate

            rug : bool, optional whether to draw a rugplot on the support axis

            fit : random variable object, optional

            color : matplotlib color, optional

            vertical : bool, optional

            norm_hist : bool, optional

            axlabel : string, False, or None, optional

            label : string, optional

        Returns
        -------
        axes : sequence of matplotlib axes, one per entry of ``x``

        Raises
        ------
        ValueError
            If ``data`` is not a pandas dataframe or a requested column is
            not in ``data``.

        References
        ----------
        Seaborn distplot further documentation
        https://seaborn.pydata.org/generated/seaborn.distplot.html
        """
        # check data
        if not isinstance(data, pd.DataFrame):
            raise ValueError('data must be pandas dataframe')

        # handle single string
        if not isinstance(x, (list, tuple, np.ndarray, pd.Index)):
            x = [x]

        # create fig and axes
        nrows = len(x)
        plt.close()
        fig, axes = plt.subplots(nrows=nrows,
                                 ncols=1,
                                 sharex=self.sharex,
                                 figsize=(self.size[0], nrows * self.size[1]))
        # HACK: handle Axes indexing when only one ax in fig
        if nrows == 1:
            axes = [axes]
        # iterate thru x
        for ax, col in zip(axes, x):
            # check if col in data
            if col not in data.columns.values:
                raise ValueError('{} is NOT in data'.format(col))
            a = data[col]
            if np.logical_not(np.isfinite(a)).any():
                logger.warning(
                    'RUNTIME WARNING: {} column has inf or nan '.format(col))
                a = a.replace([-np.inf, np.inf], np.nan).dropna()
            # The data goes first positionally so that extra ``*args`` bind
            # to the following seaborn parameters. Passing ``a=a`` together
            # with ``*args`` raised "multiple values for argument 'a'".
            sns.distplot(a, *args, ax=ax, **kwargs)
            ax.set_title(label='Univariate Distribution of {}'.format(col),
                         fontsize=self.title_fontsize)
            ax.set_xlabel(xlabel=col, fontsize=self.label_fontsize)
            ax.set_ylabel(ylabel='percentage (%)',
                          fontsize=self.label_fontsize)
            # 'major' is the valid spelling; 'maj' is not an accepted value.
            ax.tick_params(axis='both',
                           which='major',
                           labelsize=self.tick_fontsize)
        # Lay the figure out once, after every subplot has been drawn.
        fig.subplots_adjust(wspace=0.5,
                            hspace=0.3,
                            left=0.125,
                            right=0.9,
                            top=0.9,
                            bottom=0.1)
        fig.tight_layout()
        plt.show()
        return axes
Пример #6
0
 def _update_plot(self, axis, view):
     """
     Draw ``view.data`` as a distribution plot on ``axis``.

     The view's label is used only when ``self.overlaid`` equals 1;
     otherwise an empty label is passed.
     """
     if self.overlaid == 1:
         legend_label = view.label
     else:
         legend_label = ''
     sns.distplot(view.data, ax=axis, label=legend_label, **self.style)
Пример #7
0
 def _update_plot(self, axis, view):
     """Draw ``view.data`` on ``axis`` with a single-space label and the plot's style."""
     samples = view.data
     sns.distplot(samples, ax=axis, label=' ', **self.style)
Пример #8
0
def plot_angle(data,
               N=50,
               title=None,
               ax1=None,
               ax2=None,
               color=None,
               wrap=True):
    """
    Plot the distribution of an angle in polar coordinates and a standard histogram / KDE plot.

    Parameters
    ----------
    data: array-like (nsamples,)
    N: int, optional (default: 50)
        Number of bins to use for histogramming the data
    title: str, optional (default: None)
        The title of the plot
    ax1: matplotlib axis, optional
        The left hand side polar plot
    ax2: matplotlib axis, optional
        The right hand side density plot
    color: str, optional (default: None)
        A color string to use
    wrap: bool, optional (default: True)
        True: Wrap the angle between -180 and 180
        False: Constrain the angle between 0 and 360

    Both axes are created here unless BOTH ``ax1`` and ``ax2`` are given.

    Returns
    -------
    f: matplotlib.figure
        The figure with both axis
    (ax1, ax2): tuple of matplotlib axis
        The left hand side polar plot and the right hand side density plot
    """

    if ax1 is None or ax2 is None:
        gs = gridspec.GridSpec(2, 6)
        ax1 = pp.subplot(gs[:1, :2], polar=True)
        ax2 = pp.subplot(gs[:1, 2:])

    vf = np.vectorize(wrap_angle if wrap else constrain_angle)
    x = vf(data)

    sns.distplot(x, bins=N, ax=ax2, color=color, kde=True)
    # ``normed`` was removed from np.histogram; ``density`` is the supported
    # keyword and gives the same result for equal-width bins.
    radii, edges = np.histogram(x, bins=N, density=True)
    ax1.set_yticklabels([])

    if wrap:
        ax1ticks = [0, 45, 90, 135, 180, -135, -90, -45]
        ax2ticks = list(range(-180, 180 + 45, 45))
        ax1.set_xticklabels(['{}°'.format(t) for t in ax1ticks])
        ax2.set_xlim(-180, 180)
    else:
        ax2ticks = list(range(0, 360 + 45, 45))
        ax2.set_xlim(0, 360)
    ax2.set_xticks(ax2ticks)
    ax2.set_xticklabels(['{}°'.format(t) for t in ax2ticks])

    ax2.set_yticks([])
    ax2.set(xlabel='Angle', ylabel='Density')

    sns.despine(ax=ax2)

    # Draw each bar over its own bin: start at the left edge and use the real
    # bin width. Centring bars on the right edges with a fixed 2*pi/N width
    # shifted them by half a bin and was only right for data spanning 360°.
    ax1.bar(np.deg2rad(edges[:-1]),
            radii,
            width=np.deg2rad(np.diff(edges)),
            align='edge',
            color=color,
            alpha=.5)

    if title is not None:
        pp.suptitle(title)

    pp.tight_layout()

    f = pp.gcf()
    return f, (ax1, ax2)
Пример #9
0
 def init_artists(self, ax, plot_data, plot_kwargs):
     """Draw the distribution plot on ``ax`` and return it under the 'axis' key."""
     drawn_axis = sns.distplot(*plot_data, ax=ax, **plot_kwargs)
     return {'axis': drawn_axis}
Пример #10
0
 def init_artists(self, ax, plot_data, plot_kwargs):
     """
     Create the plot artists.

     ``plot_data`` is unpacked as the positional arguments of
     ``sns.distplot`` and ``plot_kwargs`` as its keyword arguments. The
     value returned by seaborn is stored under the 'axis' key.
     """
     artists = {}
     artists['axis'] = sns.distplot(*plot_data, ax=ax, **plot_kwargs)
     return artists
Пример #11
0
import seaborn.apionly as sns
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt

# Draw a histogram + KDE of 100 standard-normal samples, then read the KDE
# curve back from the axes to evaluate it at a chosen x coordinate.
x = np.random.randn(100)
print(type(x))
ax = sns.distplot(x, hist_kws={"ec": "k"})
# The first line on the axes is taken to be the KDE curve.
data_x, data_y = ax.lines[0].get_data()
print(data_x)
print(data_y)
xi = 0  # coordinate where to find the value of kde curve
yi = np.interp(xi, data_x, data_y)
print("x={},y={}".format(xi, yi))  # prints x=0,y=0.3698
# Mark the interpolated point at xi itself (previously hard-coded to 0, which
# would misplace the marker as soon as xi is changed).
ax.plot([xi], [yi], marker="o")

# Re-plot the extracted curve alone on a fresh figure.
fig, subplots = plt.subplots(1, 1)
subplots.plot(data_x, data_y)
plt.show()
Пример #12
0
# In[350]:

categ = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Alone', 'Survived']
conti = ['Fare', 'Age']

# Distribution: one panel per column on a shared 3x3 grid. Count plots for
# the categorical columns come first, then histogram/KDE panels for the
# continuous columns in the remaining slots.
fig = plt.figure(figsize=(16, 12))
for slot, column in enumerate(categ, start=1):
    fig.add_subplot(3, 3, slot)
    sns.countplot(x=column, data=df, alpha=.7)

kde_style = {"lw": 2, "color": colors[8]}
hist_style = {"alpha": .5}
for slot, column in enumerate(conti, start=len(categ) + 1):
    fig.add_subplot(3, 3, slot)
    sns.distplot(df[column].dropna(), kde_kws=kde_style, hist_kws=hist_style)

plt.show()

# In[373]:

# Count plot of every categorical column except 'Survived', with bars split
# by the 'Survived' value.
fig = plt.figure(figsize=(16, 10))
i = 1  # subplot slot on the 3x3 grid
for col in categ:
    if col != 'Survived':
        fig.add_subplot(3, 3, i)
        g = sns.countplot(x=col, data=df, hue='Survived', alpha=.7)
        plt.legend(loc=1)
        # NOTE(review): `i` is not advanced in the lines visible here, so
        # every panel would target slot 1 - confirm the rest of the loop
        # body increments it.
Пример #13
0
def posterior_predictive_check():
    """
    Run posterior predictive checks and save the resulting figures.

    The observed ``days_to_first_price_update`` values are read from the
    module-level ``df`` (``dfa`` when ``modeltype`` is not ``'hurdle'``) and
    the replicated draws from ``ypred_m8.rdat``. Each row of the prediction
    array is treated as one replicated data set.

    For every test statistic T, a histogram of T(y_rep) with T(y_obs) marked
    is saved as ``fig-<direction>-<name>-hurdle``. A final two-panel figure
    compares the observed data with a few posterior predictive draws.

    ``modeltype``, the prediction file and ``direction`` are switched by
    editing the assignments below.

    :return: None
    """

    modeltype = 'hurdle'
    # modeltype = 'negbin'

    if modeltype == 'hurdle':
        y = df['days_to_first_price_update'].values
    else:
        y = dfa['days_to_first_price_update'].values

    y_full = y

    #### Import y-pred from RStan
    # Only the active file is read. The m7 read used to run and was then
    # immediately overwritten by the m8 read.
    # ypred_full = np.asarray(pd.read_fwf('ypred_m7.rdat')) # brms hurdle model
    ypred_full = np.asarray(pd.read_fwf('ypred_m8.rdat')) # brms hurdle model, main:spec
    # ypred_full = np.asarray(pd.read_fwf('ypred_m6.rdat')) # brms negbin trunc zero
    # y_pred = np.asarray(pd.read_fwf('ypred_m7.rdat')) # rstanarm negbin

    # direction = 'Up'
    # direction = 'Down'
    # direction = 'None'
    direction = 'Full'

    if direction in ["Up", "Down", "None"]:
        # Restrict observations (and the matching prediction columns) to the
        # rows whose ``amends`` value equals the chosen direction.
        amends_direction = np.where(df.amends == direction)[0]
        y = y_full[amends_direction]
        y_pred = ypred_full[:, amends_direction]
    else:
        y = y_full
        y_pred = ypred_full

    # pp_check: compare distibutions of test statistics T(y) vs. T(yrep)
    def count_zeros(x):
        return Counter(x)[0]

    test_stats = [np.mean, np.min, np.max, np.median, np.std, count_zeros]
    tnames = ['Mean', 'Min', 'Max', 'Median', 'St-Dev', 'Number of Zeros']

    for test_stat, tname in zip(test_stats, tnames):
        fig = plt.figure(figsize=(10,3))
        fig.add_subplot(111)
        # T(y_obs) is computed once instead of once per replicate.
        t_obs = test_stat(y)
        test_stats_rep = [test_stat(yy) for yy in y_pred]
        n_exceed = sum(1 for t_rep in test_stats_rep if t_rep > t_obs)
        bayes_pval = round(n_exceed / len(test_stats_rep), 3)
        pval = r"$Pr(T(y_{{rep}}) > T(y_{{obs}}) | y_{{obs}}) = {}$".format(bayes_pval)
        ax0 = sb.distplot(test_stats_rep, kde=False, label=r"$T(y_{rep})$")
        ax0.axvline(t_obs, color=blue, linewidth=4, label=r"$T(y_{obs})$")
        ab = AnnotationBbox(TextArea(pval,
                textprops={'fontsize':14}), (570, 100),
                xycoords='figure points',
                bboxprops={'boxstyle': 'square', 'fc':'#efefef', 'ec': '#9f9f9f'})
        ax0.add_artist(ab)
        plt.title("Test statistic: {}".format(tname))
        plt.legend(fontsize=14)
        plt.savefig("fig-{}-{}-hurdle".format(direction, tname))
        plt.close()

    # if modeltype == 'negbin':
    #     fig = plt.figure(figsize=(10,3))
    #     fig.add_subplot(111)
    #     test_stats_rep = [test_stat(yy) for yy in y_pred]
    #     bayes_pval = round(len([1 for yrep in test_stats_rep if yrep > test_stat(y)]) / len(test_stats_rep), 3)
    #     pval = r"$Pr(T(y_{{rep}}) > T(y_{{obs}}) | y_{{obs}}) = {}$".format(bayes_pval)
    #     ax0 = sb.distplot(test_stats_rep, kde=False, label=r"$T(y_{rep})$")
    #     ax0.axvline(test_stat(y), color=blue, linewidth=4, label=r"$T(y_{obs})$")
    #     ab = AnnotationBbox(TextArea(pval,
    #             textprops={'fontsize':14}), (570, 100),
    #             xycoords='figure points',
    #             bboxprops={'boxstyle': 'square', 'fc':'#efefef', 'ec': '#9f9f9f'})
    #     ax0.add_artist(ab)
    #     plt.title("Test statistic: {}".format(tname))
    #     plt.legend(fontsize=14)
    #     plt.savefig("fig-teststat-{}".format(test_stat.__name__))

    # Posterior Predictive Distributions
    x_lim=120
    y_lim=50

    if direction == 'Up':
        color=blue
    elif direction == 'Down':
        color=purple
    elif direction == 'None':
        color='black'
    else:
        color=red

    # observed y
    fig = plt.figure(figsize=(10,6))
    ax1 = fig.add_subplot(211)
    _ = plt.hist(y, range=[0, x_lim], bins=x_lim, histtype='stepfilled', alpha=0.7, color=color)
    _ = plt.title('Panel A: Distribution of Observed Data', fontsize='large')
    ax1.axvline(np.mean(y), linestyle='-', color='black', label='Mean')
    ax1.axvline(np.percentile(y, 50), linestyle='--', color='black', label='Median')
    ax1.axvline(np.percentile(y, 5), linestyle='-.', color='black', label=r'$5^{th}$ and $95^{th}$ Percentile')
    ax1.axvline(np.percentile(y, 95), linestyle='-.', color='black')
    _ = plt.ylabel('Frequency')
    # _ = plt.xlabel('Days to Price Amendment')
    plt.ylim(0, y_lim)

    # A few draws from near the end of the prediction array (every 20th row
    # between -200 and -100), overlaid as translucent histograms.
    ax2 = fig.add_subplot(212)
    ypred2 = y_pred[-200:-100:20]
    ax2.axvline(np.mean(ypred2), linestyle='-', color='black', label='Mean')
    ax2.axvline(np.percentile(ypred2, 50), linestyle='--', color='black', label='Median')
    ax2.axvline(np.percentile(ypred2, 5), linestyle='-.', color='black', label=r'$5^{th}$ and $95^{th}$ Percentile')
    ax2.axvline(np.percentile(ypred2, 95), linestyle='-.', color='black')
    for draw in ypred2:
        plt.hist(draw, range=[0, x_lim], bins=x_lim, histtype='stepfilled', alpha=0.2, color=color)
    _ = plt.xlim(0, x_lim)
    _ = plt.title('Panel B: Posterior Predictive Distribution', fontsize='large')
    _ = plt.ylabel('Frequency')
    _ = plt.xlabel('Days to Price Amendment')
    plt.legend(fontsize='small')
    plt.ylim(0, y_lim)

    # ax3 = fig.add_subplot(313)
    # ypred3 = y_pred[-300:-200:20]
    # ax3.axvline(np.mean(ypred3), linestyle='-', color='black', label='Mean')
    # ax3.axvline(np.percentile(ypred3, 50), linestyle='--', color='black', label='Median')
    # ax3.axvline(np.percentile(ypred3, 5), linestyle='-.', color='black', label=r'$5^{th}$ and $95^{th}$ Percentile')
    # ax3.axvline(np.percentile(ypred3, 95), linestyle='-.', color='black')
    # _ = [plt.hist(y, range=[0, x_lim], bins=x_lim, histtype='stepfilled', alpha=0.2, color=blue) for y in ypred3]
    # _ = plt.xlim(0, x_lim)
    # _ = plt.xlabel('Days to Price Amendment')
    # _ = plt.title('Panel C: Posterior Predictive Distribution (#2)', fontsize='large')
    # plt.legend(fontsize='small')

    if modeltype == 'hurdle':
        plt.savefig('figA_posterior-pred-check-hurdle-{}'.format(direction))
    elif modeltype == 'negbin':
        plt.savefig('figA_posterior-pred-check2')
    else:
        plt.savefig('figA_posterior_pred-check?')
Пример #14
0
                           size=4000)  # Pick 4000 people, and give them groups
# Draw a Poisson visit count for every person, using the rate of the group
# that person was assigned to.
full_counts = np.random.poisson(lams[choices])
# Keep only the people with at least one visit, together with their groups.
truncated_counts = full_counts[full_counts > 0]
truncated_choices = choices[full_counts > 0]
trunc_size = truncated_counts.size
colors = sns.color_palette(n_colors=2)

#%% Setup/Plot Dummy Data

# Poisson counts with rate 1, plus the zero-truncated subset of them.
lam = 1
full_counts = np.random.poisson(lam, size=2000)
truncated_counts = full_counts[full_counts > 0]

# Overlay both normalised histograms on the same integer bin edges 0..9.
for counts in (full_counts, truncated_counts):
    sns.distplot(counts, bins=np.arange(10), kde=False, norm_hist=True)

#%% Full Counts

# Fit a Poisson rate to the untruncated counts.
with pm.Model():
    lam = pm.HalfNormal('lam', 10)
    pm.Poisson('obs', mu=lam, observed=full_counts)

    trace = pm.sample(2500, cores=1)
    pm.traceplot(trace)

# Sampled values of lam, with a vertical reference line at 1.
plt.figure()
sns.distplot(trace.lam)
plt.axvline(1)
Пример #15
0
def pdf_sns(y,nBins=50):
    """
    Return the x/y data of the first line seaborn draws for ``y``.

    ``sns.distplot`` is called on the current axes and the data of the first
    line on those axes is returned.

    :param y: samples to plot
    :param nBins: number of histogram bins (previously accepted but ignored)
    :return: tuple ``(xh, yh)`` with the line's x and y data
    """
    try:
        import seaborn.apionly as sns
    except ImportError:
        # seaborn.apionly was removed in seaborn 0.9; fall back to the
        # regular package so the function keeps working on newer versions.
        import seaborn as sns
    ax = sns.distplot(y, bins=nBins, hist=True, norm_hist=False)
    # NOTE(review): if the current axes already holds lines from earlier
    # plots, index 0 is one of those rather than the curve drawn here.
    xh, yh = ax.get_lines()[0].get_data()
    return xh, yh
Пример #16
0
Файл: pbo.py Проект: jijoy/pypbo
def plot_pbo(pbo_result, hist=False):
    """
    Plot a three-panel summary of a PBO result and show it.

    Panels, top to bottom:
      1. regression of ``R_bar_n_star`` (SR_OOS) on ``R_n_star`` (SR_IS),
         with slope, p-value, R^2 and ``prob_oos_loss`` in the legend;
      2. distribution of ``logits`` (rug + KDE, optional histogram);
      3. ``pbo_result.stochastic`` with 'SD2' on a secondary y axis.

    :param pbo_result: result object; ``linear_model``, ``prob_oos_loss``,
        ``R_n_star``, ``R_bar_n_star``, ``logits`` and ``stochastic`` are read
    :param hist: whether the logits panel also draws a histogram
    :return: None
    """
    lm = pbo_result.linear_model

    # 'figure.figsize' is the real rcParams key. The former 'fig.figsize'
    # never matched, so the (10, 5) fallback was always used and the
    # configured figure size was ignored.
    wid, h = plt.rcParams.get('figure.figsize', (10, 5))
    nplots = 3
    fig, axarr = plt.subplots(nplots, 1, sharex=False)
    fig.set_size_inches((wid, h * nplots))

    r2 = lm.rvalue**2
    # adj_r2 = r2 - (1 - r2) / (len(pbo_result.R_n_star) - 2.0)
    line_label = 'slope: {:.4f}\n'.format(lm.slope) + \
                 'p: {:.4E}\n'.format(lm.pvalue) + \
                 '$R^2$: {:.4f}\n'.format(r2) + \
                 'Prob. OOS Loss: {:.1%}'.format(pbo_result.prob_oos_loss)

    sns.regplot(
        x='SR_IS',
        y='SR_OOS',
        # sns.lmplot(x='SR_IS', y='SR_OOS',
        data=pd.DataFrame(
            dict(SR_IS=pbo_result.R_n_star, SR_OOS=pbo_result.R_bar_n_star)),
        scatter_kws={
            'alpha': .3,
            'color': 'g'
        },
        line_kws={
            'alpha': .8,
            'label': line_label,
            'linewidth': 1.,
            'color': 'r'
        },
        ax=axarr[0])
    axarr[0].set_title('Performance Degradation, IS vs. OOS')
    axarr[0].legend(loc='best')

    # TODO hist is turned off at the moment. Error occurs when S is set to
    # a relatively large number, such as 16.
    sns.distplot(pbo_result.logits,
                 rug=True,
                 bins=10,
                 ax=axarr[1],
                 rug_kws={
                     'color': 'r',
                     'alpha': .5
                 },
                 kde_kws={
                     'color': 'k',
                     'lw': 2.,
                     'label': 'KDE'
                 },
                 hist=hist,
                 hist_kws={
                     'histtype': 'step',
                     'linewidth': 2.,
                     'alpha': .7,
                     'color': 'g'
                 })
    axarr[1].axvline(0, c='r', ls='--')
    axarr[1].set_title('Hist. of Rank Logits')
    axarr[1].set_xlabel('Logits')
    axarr[1].set_ylabel('Frequency')

    pbo_result.stochastic.plot(secondary_y='SD2', ax=axarr[2])
    axarr[2].right_ax.axhline(0, c='r')
    axarr[2].set_title('Stochastic Dominance')
    axarr[2].set_ylabel('Frequency')
    axarr[2].set_xlabel('SR Optimized vs. Non-Optimized')
    axarr[2].right_ax.set_ylabel('2nd Order Stoch. Dominance')
    plt.show()
Пример #17
0
 def _update_plot(self, axis, view):
     """
     Plot the distribution of ``view.data`` on ``axis`` using ``self.style``.

     The curve is labelled with ``view.label`` when ``self.overlaid`` is
     exactly 1, and with an empty string otherwise.
     """
     label = ''
     if self.overlaid == 1:
         label = view.label
     sns.distplot(view.data, ax=axis, label=label, **self.style)
# Tally the 'M' and 'B' entries of y. mal_freq is expected to have been
# initialised before this point.
ben_freq = 0
for diagnosis in y:
    if diagnosis == 'M':
        mal_freq += 1
    elif diagnosis == 'B':
        ben_freq += 1
print("Malignant: {}".format(mal_freq))
print("Benign: {}".format(ben_freq))


# In[5]:


# Overlaid histograms of the 'mean radius' column: one for the rows of type
# 'M' (malignant) and one for the rows of type 'B' (benign).
for tumor_type, legend_name in (('M', 'Malignant'), ('B', 'Benign')):
    datas = data[data.type == tumor_type]  # rows of this tumor type only
    sns.distplot(datas['mean radius'], kde=False, label=legend_name)
plt.legend()
plt.title("Mean Radii")
plt.xlabel("mean radius")
plt.ylabel("frequency")


# In[6]:


# This plots the mean texture of the tumors and color codes if they are benign or malignant.
datas = data[data.type == 'M'] # this takes the ones that are type M
sns.distplot(datas['mean texture'],  kde=False, label='Malignant') # this plots the mean texture of the malignant ones
datas = data[data.type == 'B'] # this takes the ones that are type B