def plot_feats(df, cols, target, hue):
    """Plot the target against each feature, optionally coloured by a hue column.

    When ``hue`` is given, a box plot of ``target`` per ``hue`` value is shown
    first, followed by one scatter plot per feature coloured by ``hue``.
    Without a hue only the scatter plots are drawn.

    Parameters
    ----------
    df : DataFrame holding the feature, target and hue columns.
    cols : list of feature names to plot. ``hue`` and ``target`` are skipped
        when present; the caller's list is not modified.
    target : name of the column plotted on the y axis.
    hue : name of the column used for colouring, or None for no colouring.
    """
    # Filter into a new list so the caller's list is left untouched.
    features = [c for c in cols if c not in (hue, target)]

    sns.reset_defaults()
    # One call only: a second sns.set() re-applies the default "darkgrid"
    # style and would silently discard style="ticks".
    sns.set(style="ticks", color_codes=True, font_scale=1.0)

    if hue is not None:
        # Box plot hue/target
        box_fig = plt.figure(figsize=(15, 10))
        box_ax = box_fig.add_subplot(221)
        sns.boxplot(x=hue, y=target, data=df[[target, hue]], ax=box_ax)
        plt.show()

    hue_values = None if hue is None else df[hue]
    # Two columns and at least four rows (the historical 4x2 layout); rows are
    # added when there are more than eight features.
    n_rows = max(4, (len(features) + 1) // 2)
    scatter_fig = plt.figure(figsize=(15, 25))
    for position, feature in enumerate(features, start=1):
        ax = scatter_fig.add_subplot(n_rows, 2, position)
        sns.scatterplot(x=df[feature], y=df[target], hue=hue_values,
                        palette='Spectral', ax=ax)
    plt.show()
def ramachandran_plot(atomgroup, selection, outputfile1, outputfile2, image_format='png'):
    """Write a Ramachandran scatter plot and a seaborn KDE joint plot to disk.

    The two angle columns are additionally stored in the HDF5 file named by the
    module-level ``args.o_data1`` (opened in append mode) under
    ``/<selection>/ramachandran/phi`` and ``/<selection>/ramachandran/psi``.

    Args:
        atomgroup: passed to ``Ramachandran`` to compute the angles.
        selection: label used as the HDF5 group name.
        outputfile1: target of the standard ``Ramachandran.plot`` figure.
        outputfile2: target of the seaborn KDE joint plot.
        image_format: format handed to ``plt.savefig`` for both images.
    """
    # plot standard mdanalysis and seaborn 2D with kde
    rama = Ramachandran(atomgroup).run()
    fig, ax = plt.subplots(figsize=plt.figaspect(1))
    rama.plot(ax=ax, color='k', marker='.', ref=True)

    # collapse the first two axes so that every row is one (phi, psi) pair
    n_pairs = np.prod(rama.angles.shape[:2])
    a = rama.angles.reshape(n_pairs, 2)
    phi, psi = a[:, 0], a[:, 1]

    # open hdf file
    with h5py.File(args.o_data1, 'a') as f:
        setname = "%s" % (selection)
        group = "/" + setname + "/ramachandran/"
        f[group + "phi"] = phi
        f[group + "psi"] = psi

    plt.tight_layout()
    # svg is better but sticking with png for now
    plt.savefig(outputfile1, format=image_format)

    # start the seaborn figure from freshly reloaded plotting modules
    sns.reset_defaults()
    for module in (plt, sns):
        importlib.reload(module)

    with sns.axes_style("white"):
        h = sns.jointplot(x=phi, y=psi, kind="kde", space=0)
        h.set_axis_labels(r'$\phi$ (deg)', r'$\psi$ (deg)')
        joint_ax = h.ax_joint
        joint_ax.set_xlim(-180, 180)
        joint_ax.set_ylim(-180, 180)
        for axis in (joint_ax.xaxis, joint_ax.yaxis):
            axis.set_major_locator(ticker.MultipleLocator(60))
    plt.savefig(outputfile2, format=image_format, bbox_inches='tight')
def plot_forecasts(self, series, forecasts, test):
    """Plot the true series and overlay every forecast with error bars.

    Args:
        series: the full time series; ``series.values`` is plotted as a line.
        forecasts: sequence of forecast sequences; forecast ``i`` starts at
            x position ``len(series) - (test.shape[0] + 2) + i``.
        test: test split; only ``test.shape[0]`` is used, for the offset.

    ``self._stds[i]`` supplies the error-bar size of forecast ``i``.
    """
    n_test = test.shape[0] + 2
    sns.set()
    try:
        # Silence warnings only while plotting; a bare filterwarnings("ignore")
        # would stay active for the rest of the process.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            plt.figure(0, figsize=[12, 6])
            plt.plot(series.values, label='True time-series')
            # plot the forecasts
            for i, forecast in enumerate(forecasts):
                off_s = len(series) - n_test + i
                xaxis = list(range(off_s, off_s + len(forecast)))
                # label only the first forecast so the legend has one entry
                lbs = 'Forecast + uncertainty score (std)' if i == 0 else None
                plt.errorbar(x=xaxis, y=forecast, yerr=self._stds[i],
                             linestyle='None', marker='^', color='r',
                             label=lbs)
            plt.legend()
            # show the plot
            plt.title('Forecasting in testing set of time-series')
            plt.xlabel('timestep')
            plt.ylabel('Value')
            plt.show()
    finally:
        # always hand back default styling, even when plotting fails
        sns.reset_defaults()
def setsea(self):
    """Apply the seaborn theme or a plain matplotlib style, then redraw.

    With the ``ch`` checkbox ticked the seaborn defaults are applied; otherwise
    seaborn's changes are reset and the style named by the ``cb`` combo box is
    used. ``self.setGraph()`` is called afterwards in both cases.
    """
    if not self.ch.isChecked():
        sns.reset_defaults()
        plt.style.use(self.cb.currentText())
    else:
        sns.set()
    self.setGraph()
def make_fuzziness_histo(self, distance_list_series, plot_name):
    """Save a density-normalised histogram of the non-null distances.

    distance_list_series: series of distances (must provide ``dropna()``).
    plot_name: path handed to ``plt.savefig``.
    """
    distances = distance_list_series.dropna()

    sns.set(style="darkgrid")
    sns.set_color_codes()
    sns.distplot(distances, color="r", norm_hist=True)
    plt.savefig(plot_name)

    # undo the styling and clear the current figure
    for restore in (sns.reset_defaults, sns.reset_orig):
        restore()
    plt.clf()
def plotMu(mus, cantonPop):
    """Show a histogram of ``mus`` and a scatter plot of population against mu.

    Args:
        mus: mu values, one per canton.
        cantonPop: canton populations, aligned with ``mus``.

    Each plot gets its own figure; both are displayed by the final
    ``plt.show()``.
    """
    sns.reset_defaults()
    sns.set(rc={"figure.figsize": (7, 5)}, style="white")  # nicer layout

    # Separate figures: drawing both plots on the implicit current axes would
    # overlay the scatter on the histogram and overwrite its labels and title.
    _, hist_ax = plt.subplots()
    sns.histplot(mus, kde=False, ax=hist_ax)
    hist_ax.set(xlabel="mu", ylabel="count", title="Mus for all Cantons")
    sns.despine(ax=hist_ax)

    _, scatter_ax = plt.subplots()
    sns.scatterplot(x=mus, y=cantonPop, ax=scatter_ax)
    scatter_ax.set(xlabel="mu", ylabel="canton population",
                   title="Canton Population vs Mu")
    sns.despine(ax=scatter_ax)

    plt.show()
def hist(data, bins=10, spacing=True, axis="on"):
    """Draw a histogram of ``data`` on a new single-axes figure.

    data: values passed to ``Axes.hist``.
    bins: forwarded to ``Axes.hist`` as ``bins``.
    spacing: when true, the seaborn theme (figure size 11.7 x 8.27) is applied
        for the plot and reset afterwards.
    axis: forwarded to ``plt.axis`` (e.g. "on" / "off").
    """
    import matplotlib.pyplot as plt

    if spacing:
        import seaborn
        seaborn.set(rc={'figure.figsize': (11.7, 8.27)})

    _, axes = plt.subplots(1, 1, sharey=True)
    plt.axis(axis)
    # We can set the number of bins with the `bins` kwarg
    axes.hist(data, bins=bins)

    if spacing:
        seaborn.reset_defaults()
def _report_pipeline_set_confusion_matrix(self, pipeline_name: str, perf: ClassificationPerformance, set_: Set):
    """Draw the confusion matrix of ``perf`` and register it as ``<pipeline_name>_<set_>_cm.png``.

    Seaborn styling is reset before the matrix is drawn and switched back on
    with ``sns.set()`` afterwards.
    """
    sns.reset_defaults()

    class_labels = perf.unique_labels
    matrix = confusion_matrix(perf.labels, perf.predictions, labels=class_labels)
    display = ConfusionMatrixDisplay(matrix, display_labels=class_labels)
    display.plot(cmap=plt.cm.Blues, values_format=".4g")

    figure_name = f"{pipeline_name}_{set_}_cm.png"
    self._mf.figure(display.figure_, figure_name)

    sns.set()
def plot_resid(resid, resid_test, folder_path):
    '''
    creates plots of mean and standard deviations of training and testing
    residuals versus period

    Parameters
    ----------
    resid: array of observations - gmpe predictions for training data
    resid_test: array of observations - gmpe predictions for testing data
    folder_path: prefix for saving png files; the file name is appended
        directly, so it must end with a path separator

    Returns
    -------
    None. Creates pngs of the standard deviation of the residuals
    (resid_T.png) and the average of the residuals (mean_T.png). Statistics
    are taken along axis 0 and plotted against the ten hard-coded periods.
    '''
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set(style="ticks", color_codes=True)
    sns.reset_defaults()
    sns.set_style('whitegrid')
    sns.set_context(context='talk', font_scale=0.7)

    period = [10, 7.5, 5, 4, 3, 2, 1, 0.5, 0.2, 0.1]

    def _plot_curves(figure_label, train, test, train_label, ylabel,
                     filename, ylim=None):
        # Draw training/testing curves versus period on their own figure.
        plt.figure(figure_label)
        plt.semilogx(period, train, label=train_label)
        plt.semilogx(period, test, label='Testing')
        plt.xlabel('Period')
        plt.ylabel(ylabel)
        plt.legend()
        if ylim is not None:
            plt.ylim(*ylim)
        plt.savefig(folder_path + filename)
        plt.show()

    _plot_curves('Difference Std of residuals vs Period',
                 np.std(resid, axis=0), np.std(resid_test, axis=0),
                 'Training ', 'Total Standard Deviation', 'resid_T.png',
                 ylim=(.25, .85))
    # A distinct figure label is required here: plt.figure() returns the
    # existing figure when a label is reused, which would draw the means on
    # top of the standard deviations (with their y-limits) whenever show()
    # does not destroy the figure.
    _plot_curves('Mean of residuals vs Period',
                 np.mean(resid, axis=0), np.mean(resid_test, axis=0),
                 'Training', 'Mean residual', 'mean_T.png')

    plt.close('all')
def target_correlation_plot(dframe):
    """
    It plots a bar graph between target column and correlation values of all other dimensions with the target column.
    This visualization is chosen because even the processed dataframe for the problem contains 518 feature columns.
    So many plots like pairplot, correlation matrix plot, etc would become very huge and impossible to render.
    :param dframe: dataframe to visualize
    :return: the matplotlib figure that holds the seaborn bar plot
    """
    # tall figure and small font so that every feature label fits
    sns.set(rc={'figure.figsize': (7, 100)}, font_scale=0.6)
    try:
        # x/y are passed by keyword: newer seaborn releases no longer accept
        # them positionally.
        axes = sns.barplot(x=dframe.corr()[constants.RESULT_COLUMN_NAME],
                           y=preprocess.get_headers(dframe))
        return axes.get_figure()
    finally:
        # restore default styling even when plotting fails
        sns.reset_defaults()
def plot_plant_metrics(metrics: pd.DataFrame, sens_vars: Collection[str], act_vars: Collection[str], out_path: str, fname_prefix: str = ''):
    """Save one figure of sensor readings and one of actuator outputs.

    Args:
        metrics: recorded metrics; must contain a ``timestamp`` column. A copy
            is used, with timestamps made relative to the earliest one.
        sens_vars: variable names plotted with the ``sens_`` prefix, one
            subplot row each.
        act_vars: variable names plotted with the ``act_`` prefix, one subplot
            row each.
        out_path: existing directory that receives the PNG files.
        fname_prefix: prefix of the ``<prefix>_sensors.png`` and
            ``<prefix>_actuators.png`` file names.
    """
    out_path = Path(out_path)
    assert out_path.is_dir()

    metrics = metrics.copy()
    # use relative timestamp
    metrics['timestamp'] = metrics['timestamp'] - metrics['timestamp'].min()

    # (variables, column prefix, processed-value label, title, file name)
    panels = (
        (sens_vars, 'sens_', 'Sensor Reading',
         'Monitored values & sensor readings',
         f'{fname_prefix}_sensors.png'),
        (act_vars, 'act_', 'Actuator Output',
         'Actuated values & actuator outputs',
         f'{fname_prefix}_actuators.png'),
    )

    sns.set_theme(context='paper', palette='Dark2')
    with sns.color_palette('Dark2', len(metrics.columns)) as colors:
        # one shared iterator, so colours keep advancing across both figures
        colors = iter(colors)
        for variables, prefix, proc_label, title, file_name in panels:
            fig, rows = plt.subplots(nrows=len(variables), sharex='all',
                                     squeeze=False)
            for row, var in zip(rows, variables):
                __plot_raw_proc_values(df=metrics, var_name=var,
                                       prefix=prefix, colors=colors,
                                       proc_label=proc_label, ax=row.item())
            fig.suptitle(title)
            fig.tight_layout()
            fig.savefig(out_path / file_name)
        plt.close('all')
    sns.reset_defaults()
def plot_rawinputs(x_raw, mean_x_allT, y, feature_names, period, folder_path):
    '''
    plots model predictions vs. raw (untransformed) input features

    Parameters
    ----------
    x_raw: numpy array of untransformed data
    mean_x_allT: 2d array of model predictions for data
    y: 2d array numpy array of targets
    feature_names: array or list of feature names
    period: list of periods
    folder_path: prefix for saving png files; the per-period folder name is
        appended directly, so it must end with a path separator

    Returns
    -------
    None. Creates png scatterplots of predicted ground motions vs. each input
    feature (before transformation), one sub-folder per period.
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    import os
    import seaborn as sns

    sns.set(style="ticks", color_codes=True)
    sns.reset_defaults()
    sns.set_style('whitegrid')
    sns.set_context(context='talk', font_scale=0.7)

    folderlist = ['T10s', 'T7_5s', 'T5s', 'T4s', 'T3s', 'T2s', 'T1s', 'T_5s',
                  'T_2s', 'T_1s']

    for j in range(len(period)):
        mean_x_test = mean_x_allT[:, j:j+1].flatten()
        out_dir = folder_path + folderlist[j]
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(out_dir, exist_ok=True)
        # symmetric y-limit; depends on the period only, so compute it once
        ylim = max(np.abs(y[:, j]))

        for i in range(len(x_raw[0])):
            fig, axes = plt.subplots(2, 1, figsize=(10, 8))
            plt.title('T = ' + str(period[j]) + ' s')
            axes[0].set_ylim(-1*ylim, ylim)
            axes[1].set_ylim(-1*ylim, ylim)
            axes[0].scatter(x_raw[:, i], mean_x_test, s=1,
                            label='predictions', color='blue')
            axes[1].scatter(x_raw[:, i], y[:, j], s=1, label='targets',
                            color='green')
            axes[1].set_xlabel(feature_names[i])
            axes[0].set_ylabel('prediction')
            axes[1].set_ylabel('target')
            axes[0].legend(loc='upper left')
            axes[1].legend(loc='upper left')
            plt.savefig(out_dir + '/predictions_vs_' + feature_names[i] + '.png')
            plt.show()
            # one figure per (period, feature): close it so they do not pile
            # up in memory
            plt.close(fig)
def plot_controller_network_metrics(metrics: pd.DataFrame, out_path: str, fname_prefix: str = ''):
    """Save a two-panel figure: processing-time distribution and packet rates.

    Args:
        metrics: recorded metrics with ``process_time``, ``recv_timestamp`` and
            ``send_timestamp`` columns. A copy is used; ``process_time`` is
            multiplied by 1000 (the axis is labelled in ms) and a
            ``timestamp`` column relative to the first receive time is added.
        out_path: existing directory that receives the PNG file.
        fname_prefix: prefix of the ``<prefix>controller_metrics.png`` file.
    """
    out_path = Path(out_path)
    assert out_path.is_dir()

    # plot processing time distributions and rates
    metrics = metrics.copy()
    metrics['process_time'] *= 1000.0
    recv = metrics['recv_timestamp']
    metrics['timestamp'] = recv - recv.min()

    sns.set_theme(context='paper', palette='Dark2')
    with sns.color_palette('Dark2') as colors:
        colors = iter(colors)
        fig, (hist_ax, rate_ax) = plt.subplots(nrows=2)

        sns.histplot(data=metrics, x='process_time', stat='density', kde=True,
                     color=next(colors), ax=hist_ax)
        hist_ax.set_title('Distribution of sample processing times.')
        hist_ax.set_xlabel('Processing time (bins) [ms]')

        rate_series = (('recv_timestamp', 'Receive rate'),
                       ('send_timestamp', 'Send rate'))
        for column, label in rate_series:
            __plot_rate_per_time_unit(df=metrics, x='timestamp',
                                      timestamp=column, ax=rate_ax,
                                      label=label, color=next(colors))
        rate_ax.set_xlabel('Time [s]')
        rate_ax.set_ylabel('Packets / second')
        rate_ax.set_title('Packet rates over time.')

        fig.tight_layout()
        fig.savefig(out_path / f'{fname_prefix}controller_metrics.png')
        plt.close('all')
    sns.reset_defaults()
def obs_pre(y_train, y_test, pre, pre_test, period, folder_path):
    '''
    Scatter observed ground motion residuals against model predictions, for
    training and testing data, one figure per period.

    Parameters
    ----------
    y_train: 2d numpy array of observed ground motion residuals for training data
    y_test: 2d numpy array of observed ground motion residuals for testing data
    pre: numpy array of model predictions for training data
    pre_test: numpy array of model predictions for testing data
    period: list of periods
    folder_path: prefix for saving png files (file names are appended directly)

    Returns
    -------
    None. Writes obs_pre_T_<period>.png for each period.
    '''
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set(style="ticks", color_codes=True)
    sns.reset_defaults()
    sns.set_style('whitegrid')
    sns.set_context('talk')
    sns.set_context(context='talk', font_scale=0.7)

    for i, T in enumerate(period):
        # column i of every array belongs to period T
        y = pre.T[i]
        x = y_train.T[i]
        y_testplot = pre_test.T[i]
        x_test = y_test.T[i]

        plt.figure(figsize=(6, 6))
        # symmetric axis limit taken from the training data only
        lim = np.asarray([abs(x), abs(y)]).max()

        for obs, pred, label in ((x, y, 'Training'),
                                 (x_test, y_testplot, 'Testing')):
            plt.scatter(obs, pred, s=1, label=label)
        plt.xlabel('observed')
        plt.ylabel('predicted')
        plt.title('T ' + str(T) + ' s')
        plt.xlim(-lim, lim)
        plt.ylim(-lim, lim)
        plt.legend()
        plt.savefig(folder_path + 'obs_pre_T_' + str(T) + '.png')
        plt.show()

    plt.close('all')
def delta_plot(delta_df, x, y, name, minmax=True, hline=(-0.5, 0.5), vline=(-3, 3)):
    """Scatter the deltas of measure ``x`` against those of measure ``y``.

    Args:
        delta_df: frame with a ``measure`` column; the remaining columns are
            named ``<type>_<depth>`` and hold the delta values.
        x: measure plotted on the x axis.
        y: measure plotted on the y axis.
        name: prefix of the plot title.
        minmax: when true, axis limits and zero lines span the data range
            (padded by 0.01); otherwise ``hline`` / ``vline`` are used.
        hline: (min, max) of the x axis and the horizontal zero line.
        vline: (min, max) of the y axis and the vertical zero line.

    Returns:
        (xy_df, ax): the long-format frame that was plotted and the axes.
    """
    xy_df = delta_df[delta_df['measure'] == x].melt('measure')
    y_df = delta_df[delta_df['measure'] == y].melt('measure')
    xy_df['value2'] = y_df['value'].values
    xy_df['type'] = xy_df['variable'].apply(lambda tag: tag.split('_')[0])
    xy_df['depth'] = xy_df['variable'].apply(
        lambda tag: float(tag.split('_')[1])).astype('float')

    sns.reset_defaults()
    ax = sns.scatterplot(x='value', y='value2', hue='depth', style='type',
                         legend='brief', data=xy_df)
    ax.set_xlabel('delta ' + x.replace('_vec', ''))
    ax.set_ylabel('delta ' + y.replace('_vec', ''))

    if minmax:
        x_lo = xy_df['value'].min() - .01
        x_hi = xy_df['value'].max() + .01
        y_lo = xy_df['value2'].min() - .01
        y_hi = xy_df['value2'].max() + .01
    else:
        x_lo, x_hi = hline
        y_lo, y_hi = vline
    # The horizontal zero line spans the x range, so the matching limit is
    # xlim. This was previously set with set_ylim and then overwritten.
    ax.hlines(0, x_lo, x_hi)
    ax.set_xlim(x_lo, x_hi)
    ax.vlines(0, y_lo, y_hi)
    ax.set_ylim(y_lo, y_hi)

    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    ax.set_title('{}, d{} vs d{}'.format(name, x, y))
    return xy_df, ax
def plotR0(R0: np.ndarray, cantonNames: list, ax=None):
    """Plot the columns of ``R0.T`` as lines, on ``ax`` or on new axes.

    R0: array whose transpose is handed to ``sns.lineplot``.
    cantonNames: legend entries, used when the axes are created here.
    ax: optional axes to draw on; when None, the axes returned by
        ``sns.lineplot`` are used.

    Uses the module-level ``colors`` as palette.

    NOTE(review): the nesting of the statements after the ``else`` was
    reconstructed from a whitespace-collapsed source (legend inside the
    ``else``, labels/axhline/despine common to both paths) -- confirm.
    """
    sns.reset_defaults()
    sns.set(rc={"figure.figsize": (7, 5)}, style="white")  # nicer layout
    if ax is not None:
        # caller-provided axes: draw without a legend
        ax.set(ylim=(0, 5))
        sns.lineplot(data=R0.T, ax=ax, legend=None, dashes=False, palette=colors)
    else:
        ax = sns.lineplot(data=R0.T, palette=colors, dashes=False)
        ax.set(ylim=(0, 5))
        # legend placed outside the plot area, to the upper right
        ax.legend(
            cantonNames,
            frameon=False,
            bbox_to_anchor=(1.0, 1),
            loc="upper left",
            fontsize="xx-small",
        )
    ax.set(xlabel="day", ylabel="R0", title="R0 over time per Canton")
    # dashed reference line at R0 = 1
    ax.axhline(1, color="grey", dashes=[6, 2])
    sns.despine()
def plot_confusion_matrix(predicted_Y_loaded_model, true_Y, saving_path, recall=0, precision=0, f1_score=0):
    """Save a heatmap of the confusion matrix of rounded predictions.

    predicted_Y_loaded_model: iterable of predictions; each is rounded with
        ``np.round`` before comparison.
    true_Y: true labels.
    saving_path: path handed to ``plt.savefig``.
    recall, precision, f1_score: values shown in the plot title.
    """
    rounded = [np.round(prediction) for prediction in predicted_Y_loaded_model]
    cm = confusion_matrix(true_Y, rounded)

    heading = "".join([
        "recall:", str(recall),
        ", precision:", str(precision),
        ",f1_score:", str(f1_score),
    ])

    plt.figure(figsize=(5, 5))
    seaborn.heatmap(cm, annot=True, fmt="d")
    plt.title(heading)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.savefig(saving_path)

    plt.clf()
    seaborn.reset_defaults()
def plot_confusion_matrix(y_true, y_pred, labels=None, title=None, save_dir=None, is_percentage=False, is_show=False):
    """
    Plot a confusion matrix as an annotated heatmap.

    :param y_true: (list or numpy.array) true labels
    :param y_pred: (list or numpy.array) predicted values
    :param labels: (list) label list passed to ``confusion_matrix``; defaults
        to None (the original note describes the default case as binary
        classification, i.e. (0, 1))
    :param title: (str) plot title, also used as the file name; defaults to
        "confusion matrix" when None
    :param save_dir: (str) directory that receives "<title>.png"; nothing is
        saved when None
    :param is_percentage: (bool) show every row as fractions of its row total;
        default False
    :param is_show: (bool) display the figure; default False
    :return: None
    """
    y_true = data_transform(y_true)
    y_pred = data_transform(y_pred)

    if title is None:
        title = "confusion matrix"

    matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)
    if is_percentage:
        # divide every row by its row total
        matrix = (matrix.T / np.sum(matrix, axis=1)).T
        fmt = ".4g"
    else:
        fmt = ".20g"

    sns.set()
    try:
        # Exactly one figure is created and closed here; an additional bare
        # plt.figure() would leave an empty figure open after every call.
        f, ax = plt.subplots()
        try:
            sns.heatmap(matrix, annot=True, ax=ax, fmt=fmt)
            ax.set_title(title)
            ax.set_xlabel("predict")
            ax.set_ylabel("true")

            if save_dir is not None:
                f.savefig(os.path.join(save_dir, f"{title}.png"))
            if is_show:
                plt.show()
        finally:
            plt.close(f)
    finally:
        sns.reset_defaults()
def plot_time_and_percentile(results: dict, filename: str, unit: str):
    """Save a two-panel figure: raw timings per iteration and their percentiles.

    Args:
        results: mapping with the keys ``data`` (timings per iteration),
            ``percentile`` (numpy array indexed by percentile; index 90 is
            read), ``median`` and ``mean``.
        filename: path handed to ``savefig``.
        unit: time unit shown in the y-axis labels.
    """
    percentile, data = results['percentile'], results['data']
    median, mean = results['median'], results['mean']
    # x position of the percentile value closest to the mean
    mean_x = np.abs(percentile - mean).argmin()
    p_90 = percentile[90]
    timedata = pd.DataFrame(data, index=range(len(data)), columns=["timedata"])

    sns.set(style="whitegrid")
    # The figure is created at its final size and closed below, so neither
    # the caller's current figure nor the global figure list is touched.
    fig, (ax1, ax2) = plt.subplots(2, figsize=(6.4, 9.6))
    try:
        ax1.set_title("timedata")
        ax2.set_title("percentile")
        ax1.set(xlabel='iteration')
        ax1.set(ylabel='time ({})'.format(unit))
        ax2.set(xlabel='n-th percentile')
        ax2.set(ylabel='time ({})'.format(unit))

        sns.lineplot(data=timedata, palette="tab10", linewidth=0.15, ax=ax1)

        ax2.plot(np.arange(len(percentile)), percentile, label='percentile')
        ax2.scatter(50, median, linestyle=':',
                    label='median ({0:.2f})'.format(median))
        ax2.scatter(90, p_90, linestyle=':',
                    label='90-th percentile ({0:.2f})'.format(p_90))
        ax2.scatter(mean_x, mean, label='mean ({0:.2f})'.format(mean))
        ax2.legend()
        ax2.autoscale()

        fig.tight_layout()
        fig.savefig(filename)
    finally:
        plt.close(fig)
        sns.reset_defaults()
def plot_forecasts(self, series, forecasts, test):
    """Plot the true series and overlay every forecast in red.

    Forecasts are drawn as lines when ``self._n_seq > 1`` and as scatter
    points otherwise.

    Args:
        series: the full time series; ``series.values`` is plotted as a line.
        forecasts: sequence of forecast sequences; forecast ``i`` starts at
            x position ``len(series) - (test.shape[0] + 2) + i``.
        test: test split; only ``test.shape[0]`` is used, for the offset.
    """
    n_test = test.shape[0] + 2
    sns.set()
    try:
        # Silence warnings only while plotting; a bare filterwarnings("ignore")
        # would stay active for the rest of the process.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            plt.figure(0, figsize=[12, 6])
            plt.plot(series.values, label='True time-series')
            # plot the forecasts
            for i, forecast in enumerate(forecasts):
                off_s = len(series) - n_test + i
                xaxis = list(range(off_s, off_s + len(forecast)))
                # label only the first forecast so the legend has one entry
                lb = 'Forecasted time-series' if i == 0 else None
                if self._n_seq > 1:
                    sns.lineplot(x=xaxis, y=forecast, label=lb, color='r',
                                 hue_order=False)
                else:
                    sns.scatterplot(x=xaxis, y=forecast, label=lb, color='r',
                                    hue_order=False)
            # show the plot
            plt.title('Forecasting in testing set of time-series')
            plt.xlabel('timestep')
            plt.ylabel('Value')
            plt.show()
    finally:
        # always hand back default styling, even when plotting fails
        sns.reset_defaults()
def plot(self, output_directory, *args, **kwargs):
    """Save a CPU utilisation figure (time series and box plot) as ``<name>.png``.

    Args:
        output_directory: directory that receives the image.
        *args, **kwargs: accepted but not used here.

    Returns:
        dict mapping ``'cpu_percent'`` to the path of the written file.
    """
    # NOTE(review): the return value is not used; the call is kept in case
    # report() has side effects -- confirm.
    self.report()

    output_filename = Path(output_directory) / '{}.png'.format(self.name)

    cpu_data_array = np.asarray(self.cpu_percent_data)
    # 1-D data is a single CPU; otherwise the last axis enumerates the CPUs
    n_cpu = 1 if len(cpu_data_array.shape) == 1 else cpu_data_array.shape[-1]
    columns = ['cpu{}'.format(i)
               for i in range(n_cpu)] if n_cpu > 1 else ['cpu']
    cpu_data = pd.DataFrame(cpu_data_array, columns=columns)
    linewidth = 0.75

    sns.set(style="whitegrid")
    # The figure is created at its final size and closed below, so neither
    # the caller's current figure nor the global figure list is touched.
    fig, (ax1, ax2) = plt.subplots(2, figsize=(6.4, 9.6))
    try:
        sns.lineplot(data=cpu_data, palette="tab10", linewidth=linewidth,
                     dashes=False, ax=ax1)
        ax1.set(xlabel='time (x{0:.2f}s)'.format(self.dt),
                ylabel='Utilization (%)')
        ax1.set_title("CPU Utilization (%)")

        ax2.set_title("CPU Utilization (%)")
        ax2.boxplot(cpu_data_array, showfliers=False)
        ax2.autoscale()

        fig.tight_layout()
        fig.savefig(output_filename)
    finally:
        plt.close(fig)
        sns.reset_defaults()

    return {'cpu_percent': output_filename}
def make_density_histo(self):
    """Save a density-normalised histogram of densities within 3 std devs of the mean.

    Reads ``self.filtered_density_pandas['Density']`` and
    ``self.st_dev_density``; writes the plot to ``self.histo_out``.
    """
    print('Making histogram...')

    mean_density = self.filtered_density_pandas['Density'].mean()
    spread = 3 * self.st_dev_density
    lower = mean_density - spread
    upper = mean_density + spread

    # filtered to within 3 stdevs, as otherwise plot is a bit meh
    kept = []
    for density in self.filtered_density_pandas['Density'].to_list():
        if lower < density < upper:
            kept.append(density)
    density_series = pd.Series(data=kept, name="Points per sq m")

    sns.set(style="darkgrid")
    sns.set_color_codes()
    # plt.axes(xbound=(0, 100))
    sns.distplot(density_series.dropna(), color="r", norm_hist=True)
    plt.savefig(self.histo_out)

    # undo the styling and clear the current figure
    for restore in (sns.reset_defaults, sns.reset_orig):
        restore()
    plt.clf()
def createChartsForUnknown():
    """Build the bar chart and per-component HTML tables for tests by 'UnKnown' authors.

    Reads the module-level ``c`` (database cursor), ``streamid``, ``StartDate``
    and ``EndDate``. Writes ``Report/Unknown.svg`` and one
    ``Report/<component>-unknown.html`` file per component.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.
    """
    sns.reset_defaults()
    plt.clf()
    print "\nCreating charts for Unknown...",
    # Number of tests per component in the date range.
    # NOTE(review): the SQL text is assembled by string concatenation from
    # module-level values; confirm those values are trusted.
    sql = "SELECT Component, count(*) FROM Tests WHERE Author = 'UnKnown' and StreamId = " + str(
        streamid
    ) + " and Date BETWEEN '" + StartDate + "' AND '" + EndDate + "' group by Component;"
    c.execute(sql)
    data = c.fetchall()
    if len(data) == 0:
        # only reports the empty result; processing continues below
        print "No Test added prior to " + StartDate
    df2 = pd.DataFrame.from_records(data, columns=['Component', 'Count'])
    X = np.array(df2.Component)
    Y = np.array(df2.Count)
    size = np.shape(X)[0]
    sns.set_style("whitegrid")
    colors = sns.color_palette("cubehelix", len(df2.Component.unique()) + 5)
    # print size
    # One barplot call per component so that each bar gets its own colour and
    # its own url pointing at the component's HTML table.
    for i in range(size):
        g = sns.barplot(y=X[i:i + 1], x=Y[i:i + 1], color=colors[i], order=X, url=X[i] + '-unknown.html', orient='h')
        # write the count next to the bar
        g.text(Y[i] + 0.5, i, Y[i], color='black', ha="center", weight="bold")
        g.tick_params(labelsize=20)
    # sns.despine(left=True)
    plt.title("Tests added by Unknown Authors between " + StartDate + " and " + EndDate, fontsize=20, fontweight=0.5, color='Black')
    plt.savefig("Report/" + 'Unknown.svg', dpi=300, bbox_inches="tight")
    # One HTML table per component listing the individual tests.
    for i in range(size):
        sql = "SELECT Date, Component, Test, TestCase, File FROM Tests WHERE Author = 'UnKnown' and StreamId = " + str(
            streamid
        ) + " and Component='" + X[
            i] + "' AND Date BETWEEN '" + StartDate + "' AND '" + EndDate + "';"
        c.execute(sql)
        data = c.fetchall()
        if len(data) == 0:
            # only reports the empty result; an empty table is still written
            print "Unable to get data fo unknown test " + X[i]
        df2 = pd.DataFrame.from_records(
            data, columns=['Date', 'Component', 'Test', 'TestCase', 'File'])
        # print df2.columns
        htmlString = '<table style="width: 50%;" border="3" cellpadding="20"><tbody><tr style="font-weight: bold; background-color: black; color: white;"><td>Index</td><td>Date</td><td>Component</td><td>Test</td><td>Test Case</td><td>File</td></tr>'
        # one table row per record; the Index column is 1-based
        for j, row in df2.iterrows():
            htmlString += "<tr><td>" + str(j + 1) + "</td>" + "<td>" + row.values[0] + "</td>" + "<td>" + row.values[1] + "</td>" + "<td>" + row.values[2] + "</td>" + "<td>" + row.values[3] + "</td>" + "<td>" + row.values[4] + "</td></tr>"
        htmlString += "</tbody></table>"
        html_file = open("Report/" + X[i] + '-unknown.html', 'w')
        html_file.write(htmlString)
        html_file.close()
        # progress marker, one dot per component
        print ".",
def plot_cv_results(in_cvresult, mname, save_fig=False):
    """Plot the best mean test score (with std error bars) per encoding.

    Args:
        in_cvresult: cross-validation results with the columns
            ``param_alpha``, ``param_encode``, ``mean_test_score``,
            ``std_test_score``, ``mean_train_score`` and ``std_train_score``.
            The frame is copied, not modified.
        mname: model name used in the saved file name.
        save_fig: when true the figure is saved to
            ``paper/figures/CV_score_<mname>``; otherwise it is shown.
    """
    plt.close('all')

    # Extract scores of best alpha parameter and drop training scores
    in_cvresult = in_cvresult.copy()
    in_cvresult['best_alpha'] = ((in_cvresult.set_index('param_alpha').groupby(
        'param_encode')['mean_test_score'].transform('idxmax').rename(
            'best_alpha').reset_index(drop=True)))
    in_cvresult = (in_cvresult.query("param_alpha == best_alpha").drop(
        columns=[
            'param_alpha', 'best_alpha', 'mean_train_score', 'std_train_score'
        ]))

    # Whole-value replacement, assigned back to the column: an in-place
    # replace on the column selection is not guaranteed to reach the frame.
    display_names = {
        'atchley_cluster': 'Atchley clust.',
        'atchley': 'Atchley',
        'onehot': 'One-Hot',
        'reduced_alphabet': 'Reduced Alphabet',
        'word_embedding_cluster': 'Word2Vec Clust.',
        'word_embedding': 'Word2Vec',
        'elmo_embedding_summed': 'ELMo summed',
        'elmo_embedding': 'ELMo',
    }
    in_cvresult['param_encode'] = in_cvresult['param_encode'].replace(
        display_names)
    # fixed display order, selected by index label
    in_cvresult = in_cvresult.loc[[3, 0, 14, 13, 4, 6, 11, 8]]

    sns.set()
    sns.reset_defaults()
    fig, ax = plt.subplots(figsize=[10, 7])
    plt.plot('param_encode',
             'mean_test_score',
             'b.',
             markersize=25,
             data=in_cvresult)
    plt.errorbar('param_encode',
                 'mean_test_score',
                 'std_test_score',
                 linewidth=4,
                 data=in_cvresult,
                 capsize=10,
                 capthick=4)
    plt.xticks(rotation=10)
    ax.grid()
    ax.set_yticks(
        np.arange(round(ax.get_ylim()[0], 2),
                  round(ax.get_ylim()[1], 2) + 0.01, 0.01))
    ax.set_xlabel('Encoding')
    ax.set_ylabel('Accuracy')

    # annotate every point with its score, slightly left of the marker
    for x, score in enumerate(in_cvresult.mean_test_score):
        ax.text(x - 0.4, score, f"{score:.3f}")

    if save_fig:
        fig.tight_layout()
        fig.savefig(f"paper/figures/CV_score_{mname}")
    else:
        plt.show()
def plot_fit_3D(fitted_model, column1, column2, data=None, points=100, scolor='C3', fcolor='C0', cicolor='C1', salpha=0.4, cialpha=0.2, cmap='Oranges', figsize=(12, 9), show_ci=True):
    """Produce 3D scatter plot and overlay fitted model surface.

    Make a 3D scatter plot of the response versus two specified
    predictors and overlay the fit result surface. The distributions
    of the other predictors are marginalised out, ie. they are set to the
    mean values of their respective distributions.

    Args:
        fitted_model: fit result providing ``model`` and ``get_prediction``.
        column1, column2: names of the two predictors on the x and y axes.
        data: frame with the predictors; defaults to the model's ``exog``.
        points: number of grid points handed to
            ``utils.marginalised_range``.
        scolor, fcolor, cicolor: colours of the scatter points, the fit
            surface and the confidence-interval surfaces.
        salpha, cialpha: opacity of the fit surface and of the CI surfaces.
        cmap: colour map of the CI-width contour plot.
        figsize: figure size.
        show_ci: draw the CI surfaces and the CI-width contours.

    NOTE: This resets matplotlib graphics options to the defaults.

    Returns the matplotlib figure and 3D axes objects.
    """
    model = fitted_model.model
    if data is None:
        data = pd.DataFrame(model.exog, columns=model.exog_names)
    marg = utils.marginalised_range((column1, column2), data, points=points)

    sns.reset_defaults()
    fig = plt.figure(figsize=figsize)
    # add_subplot attaches the 3D axes to the figure. Constructing
    # Axes3D(fig) directly no longer does so in current Matplotlib, which
    # leaves the saved/shown figure blank.
    ax = fig.add_subplot(projection='3d')

    # prepare point grids from the ranges of the scatter plot
    xs = marg[column1]
    ys = marg[column2]
    xv, yv = np.meshgrid(xs, ys)
    zv = np.zeros((ys.size, xs.size))
    lv = np.zeros((ys.size, xs.size))
    uv = np.zeros((ys.size, xs.size))

    # compute predictions and CI bounds for the rows in the point grids
    for idx, y in enumerate(yv):
        marg[column2] = y
        pred = fitted_model.get_prediction(marg).summary_frame()
        zv[idx] = pred['mean']
        lv[idx] = pred['mean_ci_lower']
        uv[idx] = pred['mean_ci_upper']

    # 3D scatter plot of the raw data
    ax.scatter(data[column1], data[column2], model.endog, color=scolor)

    # plot the prediction & CI boundary surfaces
    ax.plot_surface(xv, yv, zv, alpha=salpha, color=fcolor)
    if show_ci:
        ax.plot_surface(xv, yv, lv, alpha=cialpha, color=cicolor)
        ax.plot_surface(xv, yv, uv, alpha=cialpha, color=cicolor)
        # add contour plot of the CI width to the bottom of the figure
        ax.contourf(xv, yv, uv - lv, zdir='z', offset=ax.get_zlim()[0],
                    levels=50, antialiased=True, alpha=cialpha * 2, cmap=cmap)

    ax.set_xlabel(column1)
    ax.set_ylabel(column2)
    ax.set_zlabel(model.endog_names)

    # the formula line is added only for models that carry one
    title = f'Fit vs {column1} & {column2}'
    formula = getattr(model, 'formula', None)
    if formula is not None:
        title += f'\n{formula}'
    fig.suptitle(title)

    return fig, ax
@author: raulv """ import Windprof2 as wp import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec import seaborn as sbn import numpy as np import pandas as pd from matplotlib.gridspec import GridSpecFromSubplotSpec as gssp from rv_utilities import add_colorbar, discrete_cmap from datetime import datetime import Meteoframes as mf sbn.reset_defaults() from matplotlib import rcParams rcParams['xtick.major.pad'] = 3 rcParams['ytick.major.pad'] = 3 rcParams['xtick.labelsize'] = 15 rcParams['ytick.labelsize'] = 15 rcParams['axes.labelsize'] = 15 rcParams['legend.handletextpad'] = 0.1 rcParams['legend.handlelength'] = 1. rcParams['legend.fontsize'] = 15 rcParams['mathtext.default'] = 'sf' def cosd(array): return np.cos(np.radians(array))
def sns_reset():
    """Switch back to the seaborn plotting environment from plain matplotlib.

    Resets the defaults, then applies the "white" and "ticks" styles (in that
    order) and the "notebook" context.
    """
    sns.reset_defaults()
    for style_name in ("white", "ticks"):
        sns.set_style(style_name)
    sns.set_context("notebook")
def create_ANN(x_train, y_train, x_test, y_test, feature_names, numlayers,
               units, epochs, transform_method, folder_pathmod):
    '''
    build, compiles, and fits ANN model
    saves trained model files with keras
    saves error figure and model details text file

    Parameters
    ----------
    x_train: 2d numpy array of transformed training data
    y_train: 2d numpy array of training targets
    x_test: 2d numpy array of transformed testing data
    y_test: 2d numpy array of testing targets
    feature_names: array or list of feature names
    numlayers: integer for number of layers
    units: list of hidden units per layer
    epochs: integer number of epochs
    transform_method: name of transformation method of model details file
    folder_pathmod: prefix for saving png files, csv files, the model and the
        model detail text file (file names are appended directly)

    Returns
    -------
    resid_train: array of training targets minus model predictions
    resid_test: array of testing targets minus model predictions
    pre_train: 2d array of model predictions for training data
    pre_test: 2d array of model predictions for testing data
    '''
    import numpy as np
    import pandas as pd
    from keras.models import Sequential
    import matplotlib.pyplot as plt
    from keras import layers
    from keras import optimizers
    import tensorflow.compat.v2 as tf
    tf.enable_v2_behavior()
    import seaborn as sns

    sns.set(style="ticks", color_codes=True)
    sns.reset_defaults()
    sns.set_style('whitegrid')
    sns.set_context(context='talk', font_scale=0.7)

    batch_size = 256

    def build_model():
        # Dense network: sigmoid first layer, linear hidden and output layers.
        model = Sequential()
        model.add(
            layers.Dense(units[0],
                         activation='sigmoid',
                         input_shape=(x_train.shape[1], )))
        for i in range(1, numlayers):
            # open question: add a sigmoid activation here? (it would limit
            # values to between 0 and 1)
            model.add(layers.Dense(units[i]))
        # open question: add a sigmoid activation here? (it would limit
        # values to between 0 and 1)
        model.add(layers.Dense(y_train.shape[1]))
        model.compile(optimizer=optimizers.Adam(lr=2e-3),
                      loss='mse',
                      metrics=['mae', 'mse'])
        return model

    model = build_model()

    # fit the model
    history = model.fit(x_train,
                        y_train,
                        validation_data=(x_test, y_test),
                        epochs=epochs,
                        batch_size=batch_size,
                        verbose=1)
    model.save(folder_pathmod + 'model')

    mae_history = history.history['val_mae']
    mae_history_train = history.history['mae']
    # evaluate() returns the loss followed by the compiled metrics (mae, mse)
    test_mse_score, test_mae_score, tempp = model.evaluate(x_test, y_test)
    # dataframe for saving purposes
    hist_df = pd.DataFrame(history.history)

    plt.figure('Overfitting Test')
    plt.plot(mae_history_train, label='Training Data')
    plt.plot(mae_history, label='Testing Data')
    plt.xlabel('Epoch')
    plt.ylabel('Mean Absolute Error')
    plt.title('Overfitting Test')
    plt.legend()
    print(test_mae_score)
    plt.grid()
    plt.savefig(folder_pathmod + 'error.png')
    plt.show()

    pre_test = np.array(model.predict(x_test))
    pre_train = np.array(model.predict(x_train))

    resid_train = y_train - pre_train
    resid_test = y_test - pre_test

    diff = np.std(resid_train, axis=0)
    difftest = np.std(resid_test, axis=0)

    # write model details to a file; the with-block closes it on errors too
    with open(folder_pathmod + 'model_details.txt', "w+") as file:
        file.write('number training samples ' + str(len(x_train)) + '\n')
        file.write('number testing samples ' + str(len(x_test)) + '\n')
        file.write('data transformation method ' + str(transform_method) +
                   '\n')
        file.write('input feature names ' + str(feature_names) + '\n')
        file.write('number of epochs ' + str(epochs) + '\n')
        model.summary(print_fn=lambda x: file.write(x + '\n'))
        # to_string() must be called: str(hist_df.to_string) would write the
        # repr of the bound method instead of the history table
        file.write('model fit history' + hist_df.to_string() + '\n')
        file.write('stddev train' + str(diff) + '\n')
        file.write('stddev test' + str(difftest) + '\n')

    # write predictions to a file: observed columns followed by predicted ones
    period = [10, 7.5, 5, 4, 3, 2, 1, 0.5, 0.2, 0.1]
    cols = (['obs_' + str(p) for p in period] +
            ['pre_' + str(p) for p in period])
    for filename, observed, predicted in (
            ('train_obs_pre.csv', y_train, pre_train),
            ('test_obs_pre.csv', y_test, pre_test)):
        out = np.concatenate((observed, predicted), axis=1)
        pd.DataFrame(out, columns=cols).to_csv(folder_pathmod + filename)

    return resid_train, resid_test, pre_train, pre_test
# Plot number of reviews for companies company = data["company"].value_counts() # brands.count() plt.figure(figsize=(12,8)) company.plot(kind='bar') plt.title("Number of Reviews for the 6 Companies") """We can see here that amazon has the most number of reviews in the data set , while Netflix hs the least number of reviews submitted.""" # Plot distribution of ratings for each catefgory of ratings given to companies rating_cols = ["overall-ratings", "work-balance-stars", "culture-values-stars", "carrer-opportunities-stars", "comp-benefit-stars", "senior-mangemnet-stars"] sns.reset_defaults() xcol = "company" xlabel = "Company" ylabel = "Count" title = "Vote Count Per Company" nrows = 3 ncols = 2 sns.countplot(x=data[xcol], data=data) #plt.subplot(nrows,ncols, i+1) plt.title(title) plt.xlabel(xlabel) plt.ylabel(ylabel) plt.plot() feature_count = len(rating_cols)
def getVariantRatioTabInFamily(d, max_ratio_cutoff=0.1, mean_read_cutoff=0, draw_fig=False):
    '''
    Build the per-family mismatch table and optionally plot its distributions.

    Some post-transcriptional modifications on tRNAs result in mismatches in
    NGS data. The function
    1) reads the tab-separated variant table at d["variants"] and combines the
       mutations first per tRNA and then per tRNA family, for every sample and
       site (loc, ref);
    2) keeps only the sites that pass the cutoffs below and offers the result
       as a tsv download ('family_mut.tsv');
    3) optionally draws, per sample, histograms of the mismatch ratio and of
       the mismatch location.

    @param d: The data object generated by data_loader.py; d["variants"] must
              be readable by pandas.read_csv and provide the columns
              '#SampleID', 'family', 'tRNA_ID', 'loc', 'ref', 'mut',
              'mut_reads' and 'total_reads'. d is also passed on to
              getSampleDes.
    @param max_ratio_cutoff: only mismatch sites with mismatch ratio at or
              above the cutoff in at least one sample will be reported
    @param mean_read_cutoff: minimal mean number of mutated reads across
              samples; filters out sites with very few reads
    @param draw_fig: whether to draw the histograms for each sample
    @return: the filtered per-family DataFrame
    '''
    v = pd.read_csv(d["variants"], sep="\t")

    # Combine mutations for each tRNA: the mutation reads are summed while
    # total_reads is kept unchanged (mean over the rows of the same site).
    trna_keys = ['#SampleID', 'family', 'tRNA_ID', 'loc', 'ref']
    by_trna = v.groupby(trna_keys)
    gv = by_trna.sum()
    gv['mut_reads'] = by_trna['mut_reads'].sum()
    gv['total_reads'] = by_trna['total_reads'].mean()
    gv['muts'] = by_trna['mut'].apply(','.join)
    gv['mut_num'] = by_trna['mut_reads'].apply(
        lambda x: ','.join(x.astype(int).astype(str)))
    gv = gv.reset_index()

    # Combine mutations for each tRNA family: both the mutation reads and
    # the total reads are summed over the member tRNAs.
    family_keys = ['#SampleID', 'family', 'loc', 'ref']
    by_family = gv.groupby(family_keys)
    fv = by_family.sum()
    fv['mut_reads'] = by_family['mut_reads'].sum()
    fv['total_reads'] = by_family['total_reads'].sum()
    fv['ratio'] = fv['mut_reads'] / fv['total_reads']
    fv['muts'] = by_family['muts'].apply(','.join)
    fv['mut_num'] = by_family['mut_num'].apply(','.join)
    fv['tRNA_IDs'] = by_family['tRNA_ID'].apply(','.join)
    fv['tRNA_num'] = by_family['tRNA_ID'].count()
    # Number of distinct per-mutation read counts listed for the site.
    fv['uni_reads'] = fv['mut_num'].apply(lambda x: len(set(x.split(','))))
    fv = fv.reset_index()

    # transform() broadcasts the per-site statistic (across samples) back
    # onto every row of that site.
    site_keys = ['family', 'loc', 'ref']
    fv['ratio_max'] = fv.groupby(site_keys)['ratio'].transform('max')
    fv['mut_read_mean'] = fv.groupby(site_keys)['mut_reads'].transform('mean')

    # Drop negative locations (the -1 rows) and apply both cutoffs with one
    # combined mask. Chaining `fv.loc[a][b]` applied a mask built on the
    # unfiltered frame (pandas reindex warning); copy() makes the column
    # assignment below operate on an independent frame.
    keep = (
        (fv['loc'] >= 0)
        & (fv['ratio_max'] >= max_ratio_cutoff)
        & (fv['mut_read_mean'] >= mean_read_cutoff)
    )
    fv = fv.loc[keep].copy()

    # Add sample description
    fv['SampleDes'] = fv['#SampleID'].apply(getSampleDes, d=d)

    print("Download tsv here:")
    dl.csv_download_link(fv, 'family_mut.tsv', delete_prompt=False)

    # Skip plotting when nothing passed the filters: there would be no
    # facet rows to lay out.
    if draw_fig and not fv.empty:
        sns.reset_defaults()
        # (column, number of bins, upper x limit) for the two histogram grids.
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # kept here so the function still runs on the version the file targets.
        for column, n_bins, x_max in (('ratio', 20, 1), ('loc', 75, 75)):
            g = sns.FacetGrid(fv, row="SampleDes", height=1.7, aspect=4)
            g.map(sns.distplot, column, kde=False, bins=n_bins)
            for ax in g.axes.flatten():
                ax.set_ylabel('Site Number')
            # Set the limits on the grid itself instead of relying on
            # whichever axes happens to be "current".
            g.set(xlim=(0, x_max))
        plt.show()
    return fv
def save_metrics(self, output_directory):
    """Save confusion-matrix, precision-recall and ROC plots as PNG files.

    Reads ``self.results`` (converted to a DataFrame with the columns
    'truths', 'predictions' and 'scores', one row per class), computes a
    row-normalised confusion matrix over all samples plus per-class
    average precision / ROC AUC, and writes three figures below
    ``self.output_directory``. File names are built from
    ``self.experiment_name`` and ``self.predictor_name``; legend entries
    use ``self.class_names`` when it is not None.

    Args:
        output_directory: NOTE(review): not used by the body -- the files
            are written to ``self.output_directory``; confirm which one
            callers expect before changing it.

    Returns:
        dict mapping 'Confusion Matrix', 'Precision Recall' and
        'ROC Curve' to the path of the corresponding saved image.
    """
    results = pd.DataFrame(self.results)
    n_classes = len(results['truths'])

    ## truths, predictions, scores are mappings
    ## truths : class_label -> class_label,
    ## predictions : class_label -> prediction,
    ## scores : class_label -> score,
    ## each size is (n_classes, n_samples)
    truths = [results['truths'][i] for i in range(n_classes)]
    predictions = [results['predictions'][i] for i in range(n_classes)]
    scores = [results['scores'][i] for i in range(n_classes)]

    ## compute confusion matrix
    y_true_flat, y_pred_flat = [], []
    for class_truths, class_predictions in zip(truths, predictions):
        y_true_flat.extend(class_truths)
        y_pred_flat.extend(class_predictions)
    cm = confusion_matrix(y_true=y_true_flat, y_pred=y_pred_flat)
    # Normalise each row by its number of true samples; rows without any
    # true sample stay 0 instead of becoming NaN through a 0/0 division.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm = np.divide(cm, row_totals,
                   out=np.zeros(cm.shape, dtype=float),
                   where=row_totals != 0)
    df_cm = pd.DataFrame(cm, range(cm.shape[0]), range(cm.shape[1]))

    average_precisions, precisions, recalls = [], [], []
    roc_aucs, fprs, tprs = [], [], []
    for class_truths, class_predictions, class_scores in zip(
            truths, predictions, scores):
        n_samples = len(class_predictions)
        sample_idx = np.arange(n_samples)
        scores_mat = np.zeros((n_samples, n_classes))
        truths_mat = np.zeros_like(scores_mat)
        ## one hot encoding, fill with scores and label
        scores_mat[sample_idx, np.asarray(class_predictions)] = class_scores
        truths_mat[sample_idx, np.asarray(class_truths)] = 1
        y_true = truths_mat.flatten()
        y_score = scores_mat.flatten()

        average_precisions.append(average_precision_score(y_true, y_score))
        precision, recall, _ = precision_recall_curve(y_true, y_score)
        precisions.append(precision)
        recalls.append(recall)

        roc_aucs.append(roc_auc_score(y_true, y_score))
        fpr, tpr, _ = roc_curve(y_true, y_score)
        fprs.append(fpr)
        tprs.append(tpr)

    palette = ['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal']

    def class_label(index):
        # Fall back to a generic name when no class names were provided.
        if self.class_names is None:
            return 'class_{}'.format(index)
        return self.class_names[index]

    def plot_curves(xs, ys, values, metric, xlabel, ylabel, title, suffix):
        # Draw one curve per class on the current figure and save it.
        plt.clf()
        plt.cla()
        ax = plt.gca()
        plt.gcf().set_size_inches((6.4, 4.8))
        lines, labels = [], []
        # A fresh colour cycle per plot keeps the colour of a class the
        # same in the PR and the ROC figure (a shared iterator would
        # continue where the previous plot stopped).
        for i, (x, y, value, color) in enumerate(
                zip(xs, ys, values, cycle(palette))):
            line, = ax.plot(x, y, color=color)
            lines.append(line)
            labels.append(
                '{} ({} :{:.2f})'.format(class_label(i), metric, value))
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.grid()
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend(lines,
                   labels,
                   loc='center left',
                   prop=dict(size=8),
                   bbox_to_anchor=(1., 0.5))
        # No plt.autoscale() here: it re-enables autoscaling and would
        # discard the explicit axis limits set above.
        plt.tight_layout()
        filename = self.output_directory / '{}_{}_{}.png'.format(
            self.experiment_name, self.predictor_name, suffix)
        plt.savefig(filename)
        return filename

    assets = {}

    ## plot confusion matrix
    plt.clf()
    plt.cla()
    plt.gcf().set_size_inches((6.4, 4.8))
    ax = plt.gca()
    sn.set(font_scale=1.4)  # for label size
    try:
        sn.heatmap(
            df_cm,
            annot=True,
            ax=ax,
            annot_kws={"size": 10}  # font size
        )
        plt.autoscale()
        plt.tight_layout()
        filename = self.output_directory / '{}_{}.png'.format(
            self.experiment_name, self.predictor_name)
        plt.savefig(filename)
    finally:
        # Undo the font_scale change even when plotting or saving fails.
        sn.reset_defaults()
    assets.update({
        'Confusion Matrix': filename,
    })

    ## plot pr curve
    assets.update({
        'Precision Recall': plot_curves(
            recalls, precisions, average_precisions, 'ap',
            'recall', 'precision', "Precision Recall Curve", 'pr_curve'),
    })

    ## plot roc auc curve
    assets.update({
        'ROC Curve': plot_curves(
            fprs, tprs, roc_aucs, 'auc',
            'False Positive Rate', 'True Positive Rate',
            'Receiver Operating Characteristic (ROC) Curve', 'roc_curve'),
    })
    return assets