예제 #1
0
파일: graphs.py 프로젝트: syaffa/automlk
def graph_classification_numerical(dataset_id, df, col, target):
    """
    display a horizontal boxplot graph of col in x axis and target in y axis

    One figure is drawn and saved per theme (dark and light).

    :param dataset_id: id of the dataset
    :param df: dataframe, with col and target values
    :param col: name of column
    :param target: name of target column
    :return: None; any failure is logged instead of being raised
    """
    try:
        # The class labels do not depend on the theme: encode them once.
        encoder = LabelEncoder()
        y = encoder.fit_transform(df[target].values)
        y_labels = encoder.inverse_transform(list(range(max(y) + 1)))
        tick_positions = list(range(max(y) + 1))

        for dark, theme in [(True, 'dark_background'),
                            (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                plt.figure(figsize=(8, 7))
                # Pin the box order to the encoder's label order so that the
                # tick labels set below are attached to the matching boxes
                # (without `order`, seaborn picks the category order itself).
                sns.boxplot(x=col, y=target, data=df, orient='h',
                            order=list(y_labels))
                plt.xlim(__standard_range(df[col].values, 1, 99))
                plt.yticks(tick_positions, y_labels)
                __save_fig(dataset_id, '_col_' + col, dark)
    except Exception as e:
        # Best-effort graph generation: report the cause, but do not swallow
        # KeyboardInterrupt / SystemExit the way a bare `except:` would.
        log.error(
            'error in graph_classification_numerical with dataset_id %s: %s' %
            (dataset_id, e))
예제 #2
0
파일: graphs.py 프로젝트: syaffa/automlk
def graph_regression_categorical(dataset_id, df, col, target):
    """
    display a boxplot graph of col in x axis and target in y axis

    One figure is drawn and saved per theme (dark and light).

    :param dataset_id: id of the dataset
    :param df: dataframe, with col and target values
    :param col: name of column
    :param target: name of target column
    :return: None; any failure is logged instead of being raised
    """
    try:
        # The category labels do not depend on the theme: encode them once.
        encoder = LabelEncoder()
        x = encoder.fit_transform(df[col].values)
        x_labels = encoder.inverse_transform(list(range(max(x) + 1)))
        tick_positions = list(range(max(x) + 1))

        for dark, theme in [(True, 'dark_background'),
                            (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                fig, ax = plt.subplots(figsize=(8, 7))
                # Pin the box order to the encoder's label order so that the
                # tick labels set below are attached to the matching boxes
                # (without `order`, seaborn picks the category order itself).
                sns.boxplot(x=col, y=target, data=df, ax=ax,
                            order=list(x_labels))
                plt.xticks(tick_positions, x_labels, rotation=90)
                plt.ylim(__standard_range(df[target].values, 1, 99))
                __save_fig(dataset_id, '_col_' + col, dark)
    except Exception as e:
        # Best-effort graph generation: report the cause, but do not swallow
        # KeyboardInterrupt / SystemExit the way a bare `except:` would.
        log.error(
            'error in graph_regression_categorical with dataset_id %s: %s' %
            (dataset_id, e))
예제 #3
0
def plotDeltaNextDiscVs(df, vs, width=1, height=None):
    """Box-plot the log10 of ``deltaNext`` (in days) grouped by column ``vs``.

    Rows whose ``deltaNext`` is null are dropped first. The y axis gets one
    tick per whole power of ten, labelled with the corresponding day count.

    :param df: dataframe with a ``deltaNext`` column and the ``vs`` column
    :param vs: name of the grouping column shown on the x axis
    :param width: forwarded to ``newfig``
    :param height: forwarded to ``newfig``
    """
    fig, ax = newfig(width, height)

    valid = df[df.deltaNext.notnull()]
    days = pd.TimedeltaIndex(valid.deltaNext.values).days
    sns.boxplot(x=valid[vs].values, y=np.log10(days), ax=ax)

    # One tick per decade, from 10**0 up to the decade of the largest delay.
    top_decade = int(np.floor(np.log10(days.max())))
    decades = np.array(range(0, top_decade + 1))
    ax.set_yticks(decades)
    ax.set_yticklabels(10**decades)
    fig.show()
예제 #4
0
 def _update_plot(self, axis, view):
     """
     Draw ``view`` with the seaborn function selected by ``self.plot_type``.

     Plot types that are not handled here are delegated to the parent
     class implementation.

     :param axis: matplotlib axis passed to the axis-level seaborn functions
     :param view: object providing ``data``, ``x``, ``y`` and (for the
         'interact' type) ``x2``
     """
     if self.plot_type == 'regplot':
         sns.regplot(x=view.x,
                     y=view.y,
                     data=view.data,
                     ax=axis,
                     **self.style)
     elif self.plot_type == 'boxplot':
         # These two options are removed from self.style before the call
         # (note: this mutates the instance's style dict).
         self.style.pop('return_type', None)
         self.style.pop('figsize', None)
         sns.boxplot(view.data[view.y],
                     view.data[view.x],
                     ax=axis,
                     **self.style)
     elif self.plot_type == 'violinplot':
         sns.violinplot(view.data[view.y],
                        view.data[view.x],
                        ax=axis,
                        **self.style)
     elif self.plot_type == 'interact':
         sns.interactplot(view.x,
                          view.x2,
                          view.y,
                          data=view.data,
                          ax=axis,
                          **self.style)
     elif self.plot_type == 'corrplot':
         sns.corrplot(view.data, ax=axis, **self.style)
     elif self.plot_type == 'lmplot':
         sns.lmplot(x=view.x,
                    y=view.y,
                    data=view.data,
                    ax=axis,
                    **self.style)
     elif self.plot_type in ['pairplot', 'pairgrid', 'facetgrid']:
         # Split the '*map*' options off the style dict. The keys are
         # snapshotted with list() first: popping while iterating the live
         # keys view raises RuntimeError on Python 3.
         map_opts = [(k, self.style.pop(k)) for k in list(self.style.keys())
                     if 'map' in k]
         if self.plot_type == 'pairplot':
             g = sns.pairplot(view.data, **self.style)
         elif self.plot_type == 'pairgrid':
             g = sns.PairGrid(view.data, **self.style)
         elif self.plot_type == 'facetgrid':
             g = sns.FacetGrid(view.data, **self.style)
         for opt, args in map_opts:
             # args[0] names the plotting function: looked up on seaborn
             # first, then on pyplot; the rest are passed to the map call.
             plot_fn = getattr(sns, args[0]) if hasattr(
                 sns, args[0]) else getattr(plt, args[0])
             getattr(g, opt)(plot_fn, *args[1:])
         # The grid functions draw on their own figure: drop the old handle
         # and adopt the current figure instead.
         plt.close(self.handles['fig'])
         self.handles['fig'] = plt.gcf()
     else:
         super(SNSFramePlot, self)._update_plot(axis, view)
예제 #5
0
def plotInteractionVsDevice(df, width=1, height=None):
    """Box-plot the log10 of (interaction count + 1) per device.

    The interaction count of a row is the sum of its ``changeThumbnail``,
    ``imageZoom``, ``watchVideo`` and ``view360`` columns.

    :param df: dataframe with a ``device`` column and the four interaction
        columns listed above
    :param width: forwarded to ``newfig``
    :param height: forwarded to ``newfig``
    """
    fig, ax = newfig(width, height)

    interactions = sum(
        df[col]
        for col in ['changeThumbnail', 'imageZoom', 'watchVideo', 'view360'])
    # Draw explicitly on the axes created above (as plotDeltaNextDiscVs does)
    # instead of relying on whatever the current pyplot axes happens to be.
    sns.boxplot(x=df.device.values, y=np.log10(interactions.values + 1),
                ax=ax)

    yTicks = np.array(range(0,
                            int(np.floor(np.log10(interactions.max()))) + 1))
    # NOTE(review): this replaces the tick at 0 with a second tick at 1, so
    # the first label becomes 10 rather than 1 - confirm this is intended.
    yTicks[0] = 1
    ax.set_yticks(yTicks)
    ax.set_yticklabels(10**yTicks)
    ax.set_ylabel('number of interactions')

    fig.show()
예제 #6
0
 def _update_plot(self, axis, view):
     """
     Render ``view`` using the seaborn function named by ``self.plot_type``.

     The style for the current cycle index is run through
     ``self._process_style`` and forwarded as keyword options. Plot types
     not handled here fall through to the parent class.

     :param axis: matplotlib axis handed to the axis-level seaborn calls
     :param view: object exposing ``data``, ``x``, ``y`` and ``x2``
     """
     style = self._process_style(self.style[self.cyclic_index])
     kind = self.plot_type
     if kind == 'factorplot':
         opts = dict(style)
         if view.x2:
             opts['hue'] = view.x2
         sns.factorplot(x=view.x, y=view.y, data=view.data, **opts)
     elif kind == 'regplot':
         sns.regplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
     elif kind == 'boxplot':
         # these two options are stripped before calling seaborn
         for dropped in ('return_type', 'figsize'):
             style.pop(dropped, None)
         sns.boxplot(view.data[view.y], view.data[view.x], ax=axis, **style)
     elif kind == 'violinplot':
         if not view.x:
             sns.violinplot(view.data, ax=axis, **style)
         else:
             sns.violinplot(view.data[view.y],
                            view.data[view.x],
                            ax=axis,
                            **style)
     elif kind == 'interact':
         sns.interactplot(view.x,
                          view.x2,
                          view.y,
                          data=view.data,
                          ax=axis,
                          **style)
     elif kind == 'corrplot':
         sns.corrplot(view.data, ax=axis, **style)
     elif kind == 'lmplot':
         sns.lmplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
     elif kind in ('pairplot', 'pairgrid', 'facetgrid'):
         # Move every option whose name contains 'map' out of the style
         # dict; iterate over a snapshot because the dict shrinks.
         map_opts = []
         for key in list(style.keys()):
             if 'map' in key:
                 map_opts.append((key, style.pop(key)))

         if kind == 'pairplot':
             g = sns.pairplot(view.data, **style)
         elif kind == 'pairgrid':
             g = sns.PairGrid(view.data, **style)
         elif kind == 'facetgrid':
             g = sns.FacetGrid(view.data, **style)

         for opt, args in map_opts:
             # first entry names the plotting function (seaborn wins over
             # pyplot); remaining entries are forwarded to the map call
             fn_name = args[0]
             if hasattr(sns, fn_name):
                 plot_fn = getattr(sns, fn_name)
             else:
                 plot_fn = getattr(plt, fn_name)
             getattr(g, opt)(plot_fn, *args[1:])

         # replace the stored figure with the one that is now current
         plt.close(self.handles['fig'])
         self.handles['fig'] = plt.gcf()
     else:
         super(SNSFramePlot, self)._update_plot(axis, view)
예제 #7
0
 def _update_plot(self, axis, view):
     """
     Dispatch on ``self.plot_type`` and draw ``view`` with seaborn.

     The processed style of the current cycle index supplies the keyword
     options. For the grid-style plots the previously stored figure is
     closed only when ``self._close_figures`` is set; the current figure is
     then stored as ``self.handles['fig']``. Unknown plot types are handed
     to the parent class.

     :param axis: matplotlib axis for the axis-level seaborn calls
     :param view: object exposing ``data``, ``x``, ``y`` and ``x2``
     """
     style = self._process_style(self.style[self.cyclic_index])
     plot_type = self.plot_type

     if plot_type == 'factorplot':
         hue_opt = {'hue': view.x2} if view.x2 else {}
         opts = dict(style, **hue_opt)
         sns.factorplot(x=view.x, y=view.y, data=view.data, **opts)
         return

     if plot_type == 'regplot':
         sns.regplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
         return

     if plot_type == 'boxplot':
         # options stripped before the seaborn call
         style.pop('return_type', None)
         style.pop('figsize', None)
         values, groups = view.data[view.y], view.data[view.x]
         sns.boxplot(values, groups, ax=axis, **style)
         return

     if plot_type == 'violinplot':
         if view.x:
             values, groups = view.data[view.y], view.data[view.x]
             sns.violinplot(values, groups, ax=axis, **style)
         else:
             sns.violinplot(view.data, ax=axis, **style)
         return

     if plot_type == 'interact':
         sns.interactplot(view.x, view.x2, view.y,
                          data=view.data, ax=axis, **style)
         return

     if plot_type == 'corrplot':
         sns.corrplot(view.data, ax=axis, **style)
         return

     if plot_type == 'lmplot':
         sns.lmplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
         return

     if plot_type not in ('pairplot', 'pairgrid', 'facetgrid'):
         super(SNSFramePlot, self)._update_plot(axis, view)
         return

     # Grid-style plots: pull the '*map*' options out of the style dict
     # (over a snapshot of the keys, since entries are removed on the way).
     map_opts = [(key, style.pop(key))
                 for key in tuple(style.keys()) if 'map' in key]
     if plot_type == 'pairplot':
         g = sns.pairplot(view.data, **style)
     elif plot_type == 'pairgrid':
         g = sns.PairGrid(view.data, **style)
     else:
         g = sns.FacetGrid(view.data, **style)

     for opt, args in map_opts:
         # args[0] names the plotting function, resolved on seaborn first
         # and on pyplot otherwise; the rest goes to the map call
         provider = sns if hasattr(sns, args[0]) else plt
         getattr(g, opt)(getattr(provider, args[0]), *args[1:])

     if self._close_figures:
         plt.close(self.handles['fig'])
     self.handles['fig'] = plt.gcf()
예제 #8
0
    def boxplot(self, x=None, y=None, hue=None, data=None, *args, **kwargs):
        """
        Draw a box plot to show distributions with respect to categories

        One subplot row is drawn for every entry of ``y``. Rows of ``data``
        where the plotted column is inf or nan are left out of that subplot
        (a warning is logged when this happens).

        Parameters
        ----------
        x : the name of a variable in data that provides labels for categories

        y : a list of names of variables in data that need to visualize \
            distribution (a single name is also accepted)

        hue : the name of a variable in data that provides labels for \
            sub-categories in each big category

        data : pandas dataframe

        **kwargs : other arguments in seaborn.boxplot

            order, hue_order : lists of strings, optional

            orient : 'v' | 'h', optional

            color : matplotlib color, optional

            palette : palette name, list, or dict, optional

            saturation : float, optional

            width : float, optional

            dodge : bool, optional

            fliersize : float, optional

            linewidth : float, optional

            whis : float, optional

            notch : boolean, optional

        Returns
        -------
        axes : sequence of matplotlib axes, one per entry of y

        Raises
        ------
        ValueError : if data is not a pandas dataframe, or if x, hue or an
            entry of y is not a column of data

        References
        ----------
        Seaborn boxplot further documentation
        https://seaborn.pydata.org/generated/seaborn.boxplot.html
        """
        # check data
        if not isinstance(data, pd.DataFrame):
            raise ValueError('data must be pandas dataframe')

        # check x and hue
        for name in (x, hue):
            if name is not None and name not in data.columns.values:
                raise ValueError('{} is NOT in data'.format(name))

        # handle single string
        if not isinstance(y, (list, tuple, np.ndarray, pd.Index)):
            y = [y]

        # check every requested column up front, so that a bad name fails
        # before the current figure is closed and a new one is created
        for col in y:
            if col not in data.columns.values:
                raise ValueError('{} is NOT in data'.format(col))

        # create fig and axes
        nrows = len(y)
        plt.close()
        fig, axes = plt.subplots(nrows=nrows,
                                 ncols=1,
                                 sharex=self.sharex,
                                 figsize=(self.size[0], nrows * self.size[1]))
        # HACK: handle Axes indexing when only one ax in fig
        if nrows == 1:
            axes = [axes]
        # iterate thru y
        for i, col in enumerate(y):
            a = data[col]
            # builtin bool: the np.bool alias no longer exists in NumPy
            not_nan = np.ones(a.shape[0], dtype=bool)
            if np.logical_not(np.isfinite(a)).any():
                logger.warning('RUNTIME WARNING: {} column has inf or nan '
                               ''.format(col))
                a = a.replace([-np.inf, np.inf], np.nan)
                # filter
                not_nan = np.logical_not(a.isnull())
            # plot
            # NOTE(review): extra positional *args appear to collide with
            # the keyword arguments below - confirm whether any caller
            # actually passes them.
            sns.boxplot(x=x,
                        y=col,
                        hue=hue,
                        data=data[not_nan],
                        ax=axes[i],
                        *args,
                        **kwargs)
            if x is not None:
                axes[i].set_title(
                    label='Box Distribution of {} With Respect To {} '
                    ''.format(col, x),
                    fontsize=self.title_fontsize)
                axes[i].set_xlabel(xlabel=x, fontsize=self.label_fontsize)
                axes[i].set_ylabel(ylabel=col, fontsize=self.label_fontsize)
            else:  # x is None
                axes[i].set_title(label='Box Distribution of {}'.format(col),
                                  fontsize=self.title_fontsize)
                axes[i].set_xlabel(xlabel=col, fontsize=self.label_fontsize)
                axes[i].set_ylabel(ylabel='value',
                                   fontsize=self.label_fontsize)
            # 'major' is the valid spelling; 'maj' is not an accepted value
            axes[i].tick_params(axis='both',
                                which='major',
                                labelsize=self.tick_fontsize)
            # without hue there are no labelled artists to put in a legend
            if hue is not None:
                axes[i].legend(loc='lower right')
        # lay the figure out once, after every subplot has been drawn
        fig.subplots_adjust(wspace=0.5,
                            hspace=0.3,
                            left=0.125,
                            right=0.9,
                            top=0.9,
                            bottom=0.1)
        fig.tight_layout()
        plt.show()
        return axes
    	final_error[l, :] = data[-1];
    	print data[-1]
    reproj_errors.append(final_error)

    for l in range(0, len(datas_extr)):
        data = datas_extr[l]
    	final_extr_error[l, :] = numpy.sqrt(data[8] ** 2 + data[9] ** 2 + data[10] ** 2) * 100
    extr_errors.append(final_extr_error);
    
    for l in range(0, len(datas_joints)):
        data = datas_joints[l]
        final_joint_error[l * d : l * d + d, :] = numpy.median(numpy.abs(data[:, 0:d] - data[:, d:(2 * d)]))* (180 / 3.14159);
    joint_errors.append(final_joint_error.flatten())

# Figure 1: box plot of the collected reprojection errors.
f1 = plt.figure()
sns.boxplot(data=reproj_errors, color=(0.5, 0.6, 1.0, 0), linewidth=0.5,
            saturation=0.1, fliersize=1)
plt.ylim([0, 8])
plt.xticks(xes, dofs)
# Raw string: '\#' is not a Python escape sequence, so a plain literal
# triggers an invalid-escape warning. The resulting text is unchanged.
plt.xlabel(r'\# Degrees of Freedom')
plt.ylabel('Median Reprojection Error (pixels)')
# Hide the top/right frame lines and the tick marks.
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().xaxis.set_ticks_position('none')
plt.gca().yaxis.set_ticks_position('none')

# Figure 2: box plot of the collected extrinsic errors.
f2 = plt.figure()
sns.boxplot(data=extr_errors, color=(0.5, 0.6, 1.0, 0), linewidth=0.5,
            saturation=0.1, fliersize=1)
plt.xticks(xes, dofs)
plt.xlabel(r'\# Degrees of Freedom')
plt.ylabel('Extrinsic Error (cm)')
plt.gca().spines['top'].set_visible(False)
        yt = treated['ach_score']
        tr_q = get_quantile(xt, yt, grd, n_epochs=n_epc)

        reg = untreated.sample(n=untreated.shape[0], replace=False)
        xt = untreated['exp']
        yt = untreated['ach_score']
        utr_q = get_quantile(xt, yt, grd, n_epochs=n_epc)

        for cnt, tau in enumerate(grd):
            qte_grd[i, cnt] = np.mean(tr_q[cnt]) - np.mean(utr_q[cnt])

    mean, std = np.mean(qte_grd, axis=0), np.std(qte_grd, axis=0)

    print('QTE per requested quantile: mean(std)')
    for tau, m, s in zip(grd, mean, std):
        print('$\\tau = ' + str(tau) + ': ' + str(round(m, 2)) + '(' +
              str(round(s, 2)) + ')' + '$')

    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 18
    plt.figure(figsize=(12, 7))

    ax = sns.boxplot(data=qte_grd, palette="Blues")
    ax.set_xticklabels(np.around(np.linspace(0.1, 0.9, 10), decimals=1))
    plt.xlabel('$\\tau$')
    plt.ylabel('Quantile Treatment Effect ')

    plt.show()
    ax.figure.savefig('../figures/heterogeneous_qte_' + str(n_rep) + '_' +
                      str(n_epc) + '.pdf')
예제 #11
0
# Flag messages whose day_of_week is 5 or 6 (presumably Saturday/Sunday -
# confirm the day numbering used upstream) as 1, everything else as 0.
messages['is_weekend'] = messages['day_of_week'].isin([5, 6]).astype(int)

# Limit to messages sent by me and exclude all messages between me and Alison
messages = messages[
    (messages['sender'] == 'Mark Regan')
    & (messages['participants_str'] != 'Alison Darcy, Mark Regan')
]

# Remove messages not responded within 60 seconds
# This introduces an issue by right censoring the data (might return to address)
messages = messages[messages['time_delay_seconds'] < 60]

messages.head(1)

fig = plt.figure(figsize=(10, 7))

# Top panel: response-time distribution per month, months in sorted order.
ax = fig.add_subplot(211)
order = np.sort(messages['year_month'].unique())
sns.boxplot(x=messages['year_month'], y=messages['time_delay_seconds'],
            order=order, orient="v", color=colors[5], linewidth=1, ax=ax)
_ = ax.set_title('Response time distribution by month')
_ = ax.set_xlabel('Month-Year')
_ = ax.set_ylabel('Response time')
_ = plt.xticks(rotation=30)

# Bottom panel: overall histogram, one bin per second. Drawn explicitly on
# this subplot rather than on the implicit current axes.
ax = fig.add_subplot(212)
ax.hist(messages['time_delay_seconds'].values, range=[0, 60], bins=60,
        histtype='stepfilled', color=colors[0])
_ = ax.set_title('Response time distribution')
_ = ax.set_xlabel('Response time (seconds)')
_ = ax.set_ylabel('Number of messages')

plt.tight_layout()

# exclude some columns from the csv output
messages.drop(['participants', 'message', 'participants_str'], axis=1, inplace=True)
예제 #12
0
def plot_results(transformation):
    """Summarise and plot the test results stored for one transformation.

    Every sub-directory of ``../results`` whose name starts with
    ``transformation`` is expected to hold a ``test.txt`` log; the text after
    the prefix (plus one separator character) is parsed as the sigma value.
    The logs are read in blocks of ten lines, from which the PM, NMU and
    total times and the misclassification error are extracted.

    Prints per-sigma error and timing statistics and saves a box/strip plot
    of the misclassification error to ``../results/<transformation>_result.pdf``.

    :param transformation: directory-name prefix, e.g. 'homography' or
        'fundamental' (these two also select the plot title)
    """
    res_dir = '../results'

    # Matching result directories, ordered by their sigma value.
    subdirs = next(os.walk(res_dir))[1]
    matching = [name for name in subdirs if name.startswith(transformation)]
    prefix_len = len(transformation) + 1
    unsorted_sigmas = [float(name[prefix_len:]) for name in matching]
    ranking = np.argsort(unsorted_sigmas)
    sigmas = [unsorted_sigmas[k] for k in ranking]
    dir_sigmas = [matching[k] for k in ranking]

    sigma_miss_err = {}
    sigma_times = {'PM': {}, 'NMU': {}, 'TOTAL': {}}
    example_miss_err = {}
    res_files = ['{}/{}/test.txt'.format(res_dir, name)
                 for name in dir_sigmas]

    # Very crude parser, do not change console printing output
    # or this will break
    for s, path in zip(sigmas, res_files):
        with open(path, 'r') as handle:
            errors = sigma_miss_err[s] = []
            pm_times = sigma_times['PM'][s] = []
            nmu_times = sigma_times['NMU'][s] = []
            total_times = sigma_times['TOTAL'][s] = []
            for i, line in enumerate(handle):
                if line.startswith('Statistics'):
                    break
                # position of this line inside its ten-line block
                slot = i % 10
                if slot == 0:
                    example = line[:-5]
                elif slot == 3:
                    pm_times.append(float(line.split()[4]))
                elif slot == 4:
                    nmu_times.append(float(line.split()[2]))
                elif slot == 7:
                    total_times.append(float(line.split()[2]))
                elif slot == 8:
                    # the token carries one trailing character that is
                    # dropped before conversion
                    pr = 100 * float(line.split()[3][:-1])
                    example_miss_err.setdefault(example, []).append(pr)
                    errors.append(pr)

    # Re-key every collection in sorted key order.
    example_miss_err = collections.OrderedDict(
        sorted(example_miss_err.items()))
    sigma_miss_err = collections.OrderedDict(sorted(sigma_miss_err.items()))
    for phase in ('PM', 'NMU', 'TOTAL'):
        sigma_times[phase] = collections.OrderedDict(
            sorted(sigma_times[phase].items()))

    def rounded(vals, decimals=2):
        return np.round(vals, decimals=decimals)

    print('Misclassification error')
    err_fmt = 'sigma: {}\tmean: {}\tmedian: {}\tstd: {}'
    for key in sigma_miss_err:
        sample = np.array(sigma_miss_err[key])
        print(err_fmt.format(key,
                             rounded(np.mean(sample)),
                             rounded(np.median(sample)),
                             rounded(np.std(sample, ddof=1))))

    with sns.axes_style("whitegrid"):
        # one column per sigma
        err_matrix = np.array(list(sigma_miss_err.values())).T
        max_val = err_matrix.max()

        plt.figure()
        sns.boxplot(data=err_matrix, color='.95', whis=100)
        sns.stripplot(data=err_matrix, jitter=True)
        sigmas_text = ['{:.2f}'.format(s) for s in sigmas]
        plt.xticks(range(len(sigmas)), sigmas_text, size='x-large')
        # keep only the non-negative automatic ticks
        yticks = [tick for tick in plt.yticks()[0] if tick >= 0]
        plt.yticks(yticks, size='x-large')
        plt.xlabel(r'$\sigma$', size='x-large')
        plt.ylabel('Misclassification error (%)', size='x-large')
        plt.ylim((-2, 10 * np.ceil(max_val / 10)))
        titles = {'homography': 'Homographies',
                  'fundamental': 'Fundamental matrices'}
        if transformation in titles:
            plt.title(titles[transformation], size='x-large')
        plt.tight_layout()
        plt.savefig('{}/{}_result.pdf'.format(res_dir, transformation),
                    bbox_inches='tight')

    print('Time')
    time_fmt = 'sigma: {}\tTOTAL: {}\tRATIO PM: {}\tRATIO NMU: {}'
    for key in sigma_miss_err:
        mean_PM = rounded(np.mean(np.array(sigma_times['PM'][key])))
        mean_NMU = rounded(np.mean(np.array(sigma_times['NMU'][key])))
        mean_total = rounded(np.mean(np.array(sigma_times['TOTAL'][key])))
        print(time_fmt.format(key,
                              mean_total,
                              rounded(mean_PM / mean_total),
                              rounded(mean_NMU / mean_total)))
예제 #13
0
show()

# Share of each age category among the rows of peace_df.
peace_age = peace_df['Age Category'].value_counts()
print(peace_age)
plt.pie(peace_age, labels=peace_age.index, autopct='%1.1f%%')
plt.show()

# Age against year, with a regression fit.
sns.jointplot(x="Year",
              y="Age",
              kind='reg',
              data=data)
plt.show()

# Age distribution per category.
sns.boxplot(data=data,
            x='Category',
            y='Age')
plt.show()

# Lowess-smoothed age trend over the years. x and y are passed by keyword,
# like the calls above; recent seaborn releases no longer accept them
# positionally.
sns.lmplot(x='Year', y='Age', data=data, lowess=True, aspect=2,
           line_kws={'color': 'black'})
plt.show()


# Question 2: What words are most frequently written in the prize motivation?
top_N = 10
stopwords = nltk.corpus.stopwords.words('english')
re_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords))
words = (data['Motivation']
         .str.lower()
         .replace([r'\|', re_stopwords], [' ', ' '], regex=True)
         .str.cat(sep=' ')