def graph_classification_numerical(dataset_id, df, col, target):
    """
    display a horizontal boxplot graph of col in x axis and target in y axis

    The graph is rendered twice, once per theme (dark and light), and each
    rendering is handed to ``__save_fig``. Failures are logged and not
    re-raised, so a broken column does not stop the caller.

    :param dataset_id: id of the dataset
    :param df: dataframe, with col and target values
    :param col: name of column
    :param target: name of target column
    :return: None
    """
    try:
        for dark, theme in [(True, 'dark_background'), (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                plt.figure(figsize=(8, 7))
                encoder = LabelEncoder()
                encoder.fit(df[target].values)
                # classes_ holds the distinct target values in sorted order
                y_labels = encoder.classes_
                positions = list(range(len(y_labels)))
                # seaborn orders non-numeric categories by first appearance,
                # whereas the encoder sorts them: pin the order so that every
                # tick label sits on the box it actually describes
                sns.boxplot(x=col, y=target, data=df, orient='h',
                            order=list(y_labels))
                plt.xlim(__standard_range(df[col].values, 1, 99))
                plt.yticks(positions, y_labels)
                __save_fig(dataset_id, '_col_' + col, dark)
    except Exception as e:
        # best effort: report the cause but never propagate to the caller
        # (KeyboardInterrupt / SystemExit are deliberately not caught)
        log.error('error in graph_classification_numerical with dataset_id %s: %s'
                  % (dataset_id, e))
def graph_regression_categorical(dataset_id, df, col, target):
    """
    display a boxplot graph of col in x axis and target in y axis

    The graph is rendered twice, once per theme (dark and light), and each
    rendering is handed to ``__save_fig``. Failures are logged and not
    re-raised, so a broken column does not stop the caller.

    :param dataset_id: id of the dataset
    :param df: dataframe, with col and target values
    :param col: name of column
    :param target: name of target column
    :return: None
    """
    try:
        for dark, theme in [(True, 'dark_background'), (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                encoder = LabelEncoder()
                encoder.fit(df[col].values)
                # classes_ holds the distinct categories in sorted order
                x_labels = encoder.classes_
                positions = list(range(len(x_labels)))
                fig, ax = plt.subplots(figsize=(8, 7))
                # seaborn orders non-numeric categories by first appearance,
                # whereas the encoder sorts them: pin the order so that every
                # tick label sits under the box it actually describes
                sns.boxplot(x=col, y=target, data=df, ax=ax,
                            order=list(x_labels))
                plt.xticks(positions, x_labels, rotation=90)
                plt.ylim(__standard_range(df[target].values, 1, 99))
                __save_fig(dataset_id, '_col_' + col, dark)
    except Exception as e:
        # best effort: report the cause but never propagate to the caller
        # (KeyboardInterrupt / SystemExit are deliberately not caught)
        log.error('error in graph_regression_categorical with dataset_id %s: %s'
                  % (dataset_id, e))
def plotDeltaNextDiscVs(df, vs, width=1, height=None):
    """
    Box plot of log10(days in ``deltaNext``) grouped by the column ``vs``.

    Rows whose ``deltaNext`` is null are dropped before plotting. The y axis
    is drawn in log10 space and its ticks are relabelled with the
    corresponding powers of ten.

    :param df: dataframe with a ``deltaNext`` column and a ``vs`` column
    :param vs: name of the grouping column
    :param width: forwarded to ``newfig``
    :param height: forwarded to ``newfig``
    """
    fig, ax = newfig(width, height)

    hasDelta = ~df.deltaNext.isnull()
    observed = df[hasDelta]
    dayCounts = pd.TimedeltaIndex(observed.deltaNext.values).days

    # sns.regplot(x=vs, y='deltaNext', data=dfNoNan)
    groups = observed[vs].values
    logDays = np.log10(dayCounts)
    sns.boxplot(x=groups, y=logDays, ax=ax)

    # one tick per whole decade, up to the largest observed value
    topDecade = int(np.floor(np.log10(dayCounts.max())))
    decades = np.array(range(0, topDecade + 1))
    ax.set_yticks(decades)
    ax.set_yticklabels(10**decades)
    # ax.set_ylabel('Return time')
    fig.show()
def _update_plot(self, axis, view):
    """
    Draw ``view`` with the seaborn function selected by ``self.plot_type``.

    Single-axes plot types are drawn onto ``axis``. For the grid types
    ('pairplot', 'pairgrid', 'facetgrid') the figure stored in
    ``self.handles['fig']`` is closed and replaced by ``plt.gcf()``.
    Any other plot type is delegated to the parent class.

    Note that the 'boxplot' and grid branches pop keys from ``self.style``
    in place.

    :param axis: matplotlib axes handed to seaborn as ``ax``
    :param view: object providing ``data`` and the column names ``x``,
        ``y`` (and ``x2`` for 'interact')
    """
    if self.plot_type == 'regplot':
        sns.regplot(x=view.x, y=view.y, data=view.data,
                    ax=axis, **self.style)
    elif self.plot_type == 'boxplot':
        # these two options are not accepted by sns.boxplot
        self.style.pop('return_type', None)
        self.style.pop('figsize', None)
        sns.boxplot(view.data[view.y], view.data[view.x], ax=axis,
                    **self.style)
    elif self.plot_type == 'violinplot':
        sns.violinplot(view.data[view.y], view.data[view.x], ax=axis,
                       **self.style)
    elif self.plot_type == 'interact':
        sns.interactplot(view.x, view.x2, view.y,
                         data=view.data, ax=axis, **self.style)
    elif self.plot_type == 'corrplot':
        sns.corrplot(view.data, ax=axis, **self.style)
    elif self.plot_type == 'lmplot':
        sns.lmplot(x=view.x, y=view.y, data=view.data,
                   ax=axis, **self.style)
    elif self.plot_type in ['pairplot', 'pairgrid', 'facetgrid']:
        # Snapshot the keys first: popping while iterating the live key
        # view raises "dictionary changed size during iteration" on
        # Python 3.
        style_keys = list(self.style.keys())
        map_opts = [(k, self.style.pop(k))
                    for k in style_keys if 'map' in k]
        if self.plot_type == 'pairplot':
            g = sns.pairplot(view.data, **self.style)
        elif self.plot_type == 'pairgrid':
            g = sns.PairGrid(view.data, **self.style)
        elif self.plot_type == 'facetgrid':
            g = sns.FacetGrid(view.data, **self.style)
        for opt, args in map_opts:
            # args[0] names the plotting function: look it up in seaborn
            # first and fall back to pyplot
            plot_fn = getattr(sns, args[0]) if hasattr(
                sns, args[0]) else getattr(plt, args[0])
            getattr(g, opt)(plot_fn, *args[1:])
        plt.close(self.handles['fig'])
        self.handles['fig'] = plt.gcf()
    else:
        super(SNSFramePlot, self)._update_plot(axis, view)
def plotInteractionVsDevice(df, width=1, height=None):
    """
    Box plot of log10(number of interactions + 1) per device.

    The number of interactions of a row is the sum of its
    ``changeThumbnail``, ``imageZoom``, ``watchVideo`` and ``view360``
    columns. The y axis is drawn in log10 space and its ticks are
    relabelled with the corresponding powers of ten.

    :param df: dataframe with a ``device`` column and the four interaction
        columns listed above
    :param width: forwarded to ``newfig``
    :param height: forwarded to ``newfig``
    """
    fig, ax = newfig(width, height)
    interactions = sum(
        df[col]
        for col in ['changeThumbnail', 'imageZoom', 'watchVideo', 'view360'])
    # Draw on the axes we own instead of pyplot's implicit current axes, so
    # that the plot and the tick/label calls below always hit the same axes.
    sns.boxplot(x=df.device.values, y=np.log10(interactions.values + 1),
                ax=ax)
    yTicks = np.array(range(0, int(np.floor(np.log10(interactions.max()))) + 1))
    # NOTE(review): this overwrites the first tick position (0) with 1, which
    # duplicates the tick at 1 whenever there are two or more ticks; kept
    # as-is because the intent is not clear from this function alone.
    yTicks[0] = 1
    ax.set_yticks(yTicks)
    ax.set_yticklabels(10**yTicks)
    ax.set_ylabel('number of interactions')
    fig.show()
def _update_plot(self, axis, view):
    """
    Render ``view`` using the seaborn call that matches ``self.plot_type``.

    The style for the current cycle index is processed once and passed to
    the selected seaborn function. For the grid types ('pairplot',
    'pairgrid', 'facetgrid') the figure in ``self.handles['fig']`` is closed
    and replaced by ``plt.gcf()``. Unknown plot types go to the parent class.

    :param axis: matplotlib axes handed to seaborn as ``ax``
    :param view: object providing ``data`` and the column names ``x``,
        ``y`` and ``x2``
    """
    style = self._process_style(self.style[self.cyclic_index])
    kind = self.plot_type

    if kind == 'factorplot':
        opts = dict(style)
        if view.x2:
            opts['hue'] = view.x2
        sns.factorplot(x=view.x, y=view.y, data=view.data, **opts)
        return

    if kind == 'regplot':
        sns.regplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
        return

    if kind == 'boxplot':
        # drop the options that sns.boxplot does not take
        for unwanted in ('return_type', 'figsize'):
            style.pop(unwanted, None)
        sns.boxplot(view.data[view.y], view.data[view.x], ax=axis, **style)
        return

    if kind == 'violinplot':
        if view.x:
            sns.violinplot(view.data[view.y], view.data[view.x],
                           ax=axis, **style)
        else:
            sns.violinplot(view.data, ax=axis, **style)
        return

    if kind == 'interact':
        sns.interactplot(view.x, view.x2, view.y,
                         data=view.data, ax=axis, **style)
        return

    if kind == 'corrplot':
        sns.corrplot(view.data, ax=axis, **style)
        return

    if kind == 'lmplot':
        sns.lmplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
        return

    grid_builders = {'pairplot': 'pairplot',
                     'pairgrid': 'PairGrid',
                     'facetgrid': 'FacetGrid'}
    if kind not in grid_builders:
        super(SNSFramePlot, self)._update_plot(axis, view)
        return

    # Split off the "*map*" options: they are applied to the grid after it
    # has been built, not passed to its constructor.
    map_opts = []
    for key in list(style.keys()):
        if 'map' in key:
            map_opts.append((key, style.pop(key)))

    g = getattr(sns, grid_builders[kind])(view.data, **style)

    for opt, args in map_opts:
        fn_name = args[0]
        # prefer the seaborn function of that name, else the pyplot one
        provider = sns if hasattr(sns, fn_name) else plt
        plot_fn = getattr(provider, fn_name)
        getattr(g, opt)(plot_fn, *args[1:])

    plt.close(self.handles['fig'])
    self.handles['fig'] = plt.gcf()
def _update_plot(self, axis, view):
    """
    Render ``view`` using the seaborn call that matches ``self.plot_type``.

    The style for the current cycle index is processed once and passed to
    the selected seaborn function. For the grid types ('pairplot',
    'pairgrid', 'facetgrid') the figure in ``self.handles['fig']`` is closed
    when ``self._close_figures`` is set, and the handle is then replaced by
    ``plt.gcf()``. Unknown plot types go to the parent class.

    :param axis: matplotlib axes handed to seaborn as ``ax``
    :param view: object providing ``data`` and the column names ``x``,
        ``y`` and ``x2``
    """
    style = self._process_style(self.style[self.cyclic_index])
    plot_type = self.plot_type
    frame_plots = ['pairplot', 'pairgrid', 'facetgrid']

    if plot_type == 'factorplot':
        hue_opt = {'hue': view.x2} if view.x2 else {}
        opts = dict(style, **hue_opt)
        sns.factorplot(x=view.x, y=view.y, data=view.data, **opts)
    elif plot_type == 'regplot':
        sns.regplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
    elif plot_type == 'boxplot':
        # drop the options that sns.boxplot does not take
        style.pop('return_type', None)
        style.pop('figsize', None)
        values = view.data[view.y]
        groups = view.data[view.x]
        sns.boxplot(values, groups, ax=axis, **style)
    elif plot_type == 'violinplot':
        if not view.x:
            sns.violinplot(view.data, ax=axis, **style)
        else:
            values = view.data[view.y]
            groups = view.data[view.x]
            sns.violinplot(values, groups, ax=axis, **style)
    elif plot_type == 'interact':
        sns.interactplot(view.x, view.x2, view.y,
                         data=view.data, ax=axis, **style)
    elif plot_type == 'corrplot':
        sns.corrplot(view.data, ax=axis, **style)
    elif plot_type == 'lmplot':
        sns.lmplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
    elif plot_type in frame_plots:
        # "*map*" options are applied to the finished grid, so take them
        # out of the constructor keywords first
        map_keys = [k for k in list(style.keys()) if 'map' in k]
        map_opts = [(k, style.pop(k)) for k in map_keys]

        if plot_type == 'pairplot':
            g = sns.pairplot(view.data, **style)
        elif plot_type == 'pairgrid':
            g = sns.PairGrid(view.data, **style)
        else:
            g = sns.FacetGrid(view.data, **style)

        for opt, args in map_opts:
            fn_name, fn_args = args[0], args[1:]
            # prefer the seaborn function of that name, else the pyplot one
            if hasattr(sns, fn_name):
                plot_fn = getattr(sns, fn_name)
            else:
                plot_fn = getattr(plt, fn_name)
            getattr(g, opt)(plot_fn, *fn_args)

        if self._close_figures:
            plt.close(self.handles['fig'])
        self.handles['fig'] = plt.gcf()
    else:
        super(SNSFramePlot, self)._update_plot(axis, view)
def boxplot(self, x=None, y=None, hue=None, data=None, *args, **kwargs):
    """
    Draw a box plot to show distributions with respect to categories

    One axes is drawn per entry of ``y``, stacked vertically. Rows where the
    plotted column is NaN or +/-inf are left out of that column's plot (a
    warning is logged when this happens).

    Parameters
    ----------
    x : the name of a variable in data that provides labels for categories
    y : a name, or a list of names, of variables in data that need to
        visualize distribution
    hue : the name of a variable in data that provides labels for
        sub-categories in each big category
    data : pandas dataframe
    **kwargs : other arguments in seaborn.boxplot
        order, hue_order : lists of strings, optional
        orient : 'v' | 'h', optional
        color : matplotlib color, optional
        palette : palette name, list, or dict, optional
        saturation : float, optional
        width : float, optional
        dodge : bool, optional
        fliersize : float, optional
        linewidth : float, optional
        whis : float, optional
        notch : boolean, optional

    Returns
    -------
    axes : the matplotlib axes that were drawn, one per entry of ``y``
        (a one-element list when a single column is plotted)

    Raises
    ------
    ValueError
        if ``data`` is not a dataframe, if ``x``, ``hue`` or any entry of
        ``y`` is not a column of ``data``, or if ``y`` is empty

    References
    ----------
    Seaborn boxplot further documentation
    https://seaborn.pydata.org/generated/seaborn.boxplot.html
    """
    # check data
    if not isinstance(data, pd.DataFrame):
        raise ValueError('data must be pandas dataframe')

    # check x and hue
    for name in (x, hue):
        if name is not None and name not in data.columns.values:
            raise ValueError('{} is NOT in data'.format(name))

    # handle single string
    if not isinstance(y, (list, tuple, np.ndarray, pd.Index)):
        y = [y]

    # Validate every requested column up front, so that a bad name does not
    # leave a half-built figure behind.
    if len(y) == 0:
        raise ValueError('y must name at least one column of data')
    for col in y:
        if col not in data.columns.values:
            raise ValueError('{} is NOT in data'.format(col))

    # create fig and axes
    nrows = len(y)
    plt.close()
    fig, axes = plt.subplots(nrows=nrows, ncols=1, sharex=self.sharex,
                             figsize=(self.size[0], nrows * self.size[1]))
    # HACK: handle Axes indexing when only one ax in fig
    if nrows == 1:
        axes = [axes]

    # iterate thru y
    for i, col in enumerate(y):
        a = data[col]
        if np.logical_not(np.isfinite(a)).any():
            logger.warning('RUNTIME WARNING: {} column has inf or nan '
                           .format(col))
        # treat +/-inf like NaN, then keep only the rows with a real value
        # (the mask is all True when the column is clean)
        a = a.replace([-np.inf, np.inf], np.nan)
        not_nan = a.notnull()

        # plot
        sns.boxplot(x=x, y=col, hue=hue, data=data[not_nan], ax=axes[i],
                    *args, **kwargs)
        if x is not None:
            axes[i].set_title(
                label='Box Distribution of {} With Respect To {} '
                      .format(col, x),
                fontsize=self.title_fontsize)
            axes[i].set_xlabel(xlabel=x, fontsize=self.label_fontsize)
            axes[i].set_ylabel(ylabel=col, fontsize=self.label_fontsize)
        else:
            # x is None
            axes[i].set_title(label='Box Distribution of {}'.format(col),
                              fontsize=self.title_fontsize)
            axes[i].set_xlabel(xlabel=col, fontsize=self.label_fontsize)
            axes[i].set_ylabel(ylabel='value', fontsize=self.label_fontsize)
        # 'major' is the valid spelling; 'maj' is not one of the values
        # tick_params accepts
        axes[i].tick_params(axis='both', which='major',
                            labelsize=self.tick_fontsize)
        # a legend only has entries when a hue variable splits the boxes
        if hue is not None:
            axes[i].legend(loc='lower right')

    fig.subplots_adjust(wspace=0.5, hspace=0.3, left=0.125, right=0.9,
                        top=0.9, bottom=0.1)
    fig.tight_layout()
    plt.show()
    return axes
final_error[l, :] = data[-1]; print data[-1] reproj_errors.append(final_error) for l in range(0, len(datas_extr)): data = datas_extr[l] final_extr_error[l, :] = numpy.sqrt(data[8] ** 2 + data[9] ** 2 + data[10] ** 2) * 100 extr_errors.append(final_extr_error); for l in range(0, len(datas_joints)): data = datas_joints[l] final_joint_error[l * d : l * d + d, :] = numpy.median(numpy.abs(data[:, 0:d] - data[:, d:(2 * d)]))* (180 / 3.14159); joint_errors.append(final_joint_error.flatten()) f1 = plt.figure(); sns.boxplot(data=reproj_errors, color=(0.5, 0.6, 1.0, 0), linewidth=0.5, saturation=0.1, fliersize=1); plt.ylim([0, 8]) plt.xticks(xes, dofs) plt.xlabel('\# Degrees of Freedom') plt.ylabel('Median Reprojection Error (pixels)') plt.gca().spines['top'].set_visible(False) plt.gca().spines['right'].set_visible(False) plt.gca().xaxis.set_ticks_position('none') plt.gca().yaxis.set_ticks_position('none') f2 = plt.figure(); sns.boxplot(data=extr_errors, color=(0.5, 0.6, 1.0, 0), linewidth=0.5, saturation=0.1, fliersize=1); plt.xticks(xes, dofs) plt.xlabel('\# Degrees of Freedom') plt.ylabel('Extrinsic Error (cm)') plt.gca().spines['top'].set_visible(False)
yt = treated['ach_score'] tr_q = get_quantile(xt, yt, grd, n_epochs=n_epc) reg = untreated.sample(n=untreated.shape[0], replace=False) xt = untreated['exp'] yt = untreated['ach_score'] utr_q = get_quantile(xt, yt, grd, n_epochs=n_epc) for cnt, tau in enumerate(grd): qte_grd[i, cnt] = np.mean(tr_q[cnt]) - np.mean(utr_q[cnt]) mean, std = np.mean(qte_grd, axis=0), np.std(qte_grd, axis=0) print('QTE per requested quantile: mean(std)') for tau, m, s in zip(grd, mean, std): print('$\\tau = ' + str(tau) + ': ' + str(round(m, 2)) + '(' + str(round(s, 2)) + ')' + '$') plt.rcParams["font.family"] = "Times New Roman" plt.rcParams["font.size"] = 18 plt.figure(figsize=(12, 7)) ax = sns.boxplot(data=qte_grd, palette="Blues") ax.set_xticklabels(np.around(np.linspace(0.1, 0.9, 10), decimals=1)) plt.xlabel('$\\tau$') plt.ylabel('Quantile Treatment Effect ') plt.show() ax.figure.savefig('../figures/heterogeneous_qte_' + str(n_rep) + '_' + str(n_epc) + '.pdf')
# Flag messages whose day_of_week is 5 or 6 with 1, all others with 0
# (presumably Saturday/Sunday - confirm how day_of_week was derived)
messages['is_weekend'] = messages['day_of_week'].isin([5, 6]).astype(int)

# Limit to messages sent by me and exclude all messages between me and Alison
sent_by_me = messages['sender'] == 'Mark Regan'
not_with_alison = messages['participants_str'] != 'Alison Darcy, Mark Regan'
messages = messages[sent_by_me & not_with_alison]

# Remove messages not responded within 60 seconds
# This introduces an issue by right censoring the data (might return to address)
# .copy() makes the result an independent frame, so the in-place drop at the
# end does not operate on a slice of the original data.
messages = messages[messages['time_delay_seconds'] < 60].copy()
messages.head(1)

fig = plt.figure(figsize=(10, 7))

# Top panel: response time distribution per month, months in sorted order
ax = fig.add_subplot(211)
order = np.sort(messages['year_month'].unique())
sns.boxplot(x=messages['year_month'], y=messages['time_delay_seconds'],
            order=order, orient="v", color=colors[5], linewidth=1, ax=ax)
_ = ax.set_title('Response time distribution by month')
_ = ax.set_xlabel('Month-Year')
_ = ax.set_ylabel('Response time')
_ = plt.xticks(rotation=30)

# Bottom panel: overall histogram with one bin per second, drawn on the
# explicit axes rather than on pyplot's implicit current axes
ax = fig.add_subplot(212)
ax.hist(messages['time_delay_seconds'].values, range=[0, 60], bins=60,
        histtype='stepfilled', color=colors[0])
_ = ax.set_title('Response time distribution')
_ = ax.set_xlabel('Response time (seconds)')
_ = ax.set_ylabel('Number of messages')

plt.tight_layout()

# exclude some columns from the csv output
messages.drop(['participants', 'message', 'participants_str'], axis=1,
              inplace=True)
def plot_results(transformation):
    """
    Summarise and plot the test results stored for ``transformation``.

    Every sub-directory of ``../results`` whose name starts with
    ``transformation`` is taken as one run; the text after the prefix (and
    one separator character) is parsed as that run's sigma. Each run's
    ``test.txt`` is read in groups of ten lines to collect the timings and
    the misclassification error. The function prints error statistics per
    sigma, saves a box/strip plot to ``../results/<transformation>_result.pdf``
    and finally prints the mean timings per sigma.

    :param transformation: directory-name prefix; 'homography' and
        'fundamental' additionally get a plot title
    """
    res_dir = '../results'
    _, subdirs, _ = next(os.walk(res_dir))
    matching = [name for name in subdirs if name.startswith(transformation)]
    parsed = [float(name[len(transformation) + 1:]) for name in matching]
    ranking = np.argsort(parsed)
    sigmas = [parsed[k] for k in ranking]
    dir_sigmas = [matching[k] for k in ranking]

    sigma_miss_err = {}
    sigma_times = {'PM': {}, 'NMU': {}, 'TOTAL': {}}
    example_miss_err = {}

    res_files = ['{}/{}/test.txt'.format(res_dir, ds) for ds in dir_sigmas]

    # line offset within a ten-line record -> (timing label, token index)
    time_fields = {3: ('PM', 4), 4: ('NMU', 2), 7: ('TOTAL', 2)}

    # Very crude parser, do not change console printing output
    # or this will break
    for s, rf in zip(sigmas, res_files):
        with open(rf, 'r') as file_contents:
            sigma_miss_err[s] = []
            for label in ('PM', 'NMU', 'TOTAL'):
                sigma_times[label][s] = []

            for i, line in enumerate(file_contents):
                if line.startswith('Statistics'):
                    break
                offset = i % 10
                if offset == 0:
                    example = line[:-5]
                elif offset in time_fields:
                    label, token = time_fields[offset]
                    sigma_times[label][s].append(float(line.split()[token]))
                elif offset == 8:
                    # the token carries one trailing character that is
                    # stripped before conversion
                    pr = 100 * float(line.split()[3][:-1])
                    example_miss_err.setdefault(example, []).append(pr)
                    sigma_miss_err[s].append(pr)

    def by_key(mapping):
        return collections.OrderedDict(sorted(mapping.items()))

    example_miss_err = by_key(example_miss_err)
    sigma_miss_err = by_key(sigma_miss_err)
    for label in ('PM', 'NMU', 'TOTAL'):
        sigma_times[label] = by_key(sigma_times[label])

    def round2(vals, decimals=2):
        return np.round(vals, decimals=decimals)

    print('Misclassification error')
    for key, errors in sigma_miss_err.items():
        values = np.array(errors)
        mean = round2(np.mean(values))
        median = round2(np.median(values))
        std = round2(np.std(values, ddof=1))
        fmt_str = 'sigma: {}\tmean: {}\tmedian: {}\tstd: {}'
        print(fmt_str.format(key, mean, median, std))
        # print('\t', values)

    with sns.axes_style("whitegrid"):
        # one column per sigma
        values = np.array(list(sigma_miss_err.values())).T
        max_val = values.max()

        plt.figure()
        sns.boxplot(data=values, color='.95', whis=100)
        sns.stripplot(data=values, jitter=True)

        sigmas_text = ['{:.2f}'.format(s) for s in sigmas]
        plt.xticks(range(len(sigmas)), sigmas_text, size='x-large')
        # keep only the non-negative ticks
        yticks = [yt for yt in plt.yticks()[0] if yt >= 0]
        plt.yticks(yticks, size='x-large')

        plt.xlabel(r'$\sigma$', size='x-large')
        plt.ylabel('Misclassification error (%)', size='x-large')
        # upper limit: max error rounded up to the next multiple of ten
        plt.ylim((-2, 10 * np.ceil(max_val / 10)))

        titles = {'homography': 'Homographies',
                  'fundamental': 'Fundamental matrices'}
        if transformation in titles:
            plt.title(titles[transformation], size='x-large')

        plt.tight_layout()
        plt.savefig('{}/{}_result.pdf'.format(res_dir, transformation),
                    bbox_inches='tight')

    print('Time')
    for key in sigma_miss_err:
        means = {}
        for label in ('PM', 'NMU', 'TOTAL'):
            means[label] = round2(np.mean(np.array(sigma_times[label][key])))
        mean_total = means['TOTAL']
        stats = (key,
                 mean_total,
                 round2(means['PM'] / mean_total),
                 round2(means['NMU'] / mean_total))
        fmt_str = 'sigma: {}\tTOTAL: {}\tRATIO PM: {}\tRATIO NMU: {}'
        print(fmt_str.format(*stats))
show() peace_age = peace_df['Age Category'].value_counts() print(peace_age) plt.pie(peace_age, labels=peace_age.index, autopct='%1.1f%%') plt.show() sns.jointplot(x="Year", y="Age", kind='reg', data=data) plt.show() sns.boxplot(data=data, x='Category', y='Age') plt.show() sns.lmplot('Year','Age',data=data,lowess=True, aspect=2, line_kws={'color' : 'black'}) plt.show() # Question 2: What words are most frequently written in the prize motivation? top_N = 10 stopwords = nltk.corpus.stopwords.words('english') re_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords)) words = (data['Motivation'] .str.lower() .replace([r'\|', re_stopwords], [' ', ' '], regex=True) .str.cat(sep=' ')