def mttkrp_performance_analysis(backend, modes, threads, gpu=False):
    """Plot the efficiency of every MTTKRP method, one subplot per mode.

    For each mode index a subplot is drawn that shows, for each of the three
    MTTKRP configurations (MTTKRP, TWOSTEP0, TWOSTEP1), the value
    ``FLOPS / TIME / mfps`` against ``RANK``, where ``mfps`` is ``GPU_FPS``
    when ``gpu`` is set and ``CPU_FPS[str(threads)]`` otherwise (presumably the
    peak FLOP/s of the device -- confirm against where those tables are
    defined).

    The figure is written to ``plot_output_path`` with a name built from the
    backend, thread count and modes.

    Args:
        backend: BLAS backend name; used for the benchmark lookup, the title
            and the output file name.
        modes: Sequence of tensor mode sizes; one subplot is created per entry.
        threads: Number of threads of the benchmark run.
        gpu: Whether to read the GPU benchmark and normalise by ``GPU_FPS``.

    Returns:
        The created matplotlib figure.
    """
    # squeeze=False always yields a 2-D array of Axes, so a single mode no
    # longer produces a bare (non-indexable) Axes object.
    fig, axes_grid = plt.subplots(len(modes), 1, sharex='all', squeeze=False)
    ax = axes_grid[:, 0]
    fig.set_size_inches(w=fig_size_in['width'], h=5)

    mfps = GPU_FPS if gpu else CPU_FPS[str(threads)]

    mttkrp_configurations = [
        MttkrpMethod.MTTKRP, MttkrpMethod.TWOSTEP0, MttkrpMethod.TWOSTEP1
    ]
    # The benchmark of a method covers all modes, so load each one only once
    # instead of once per (mode, method) pair.
    benchmarks = [
        read_bench(backend, threads, modes, method, gpu)
        for method in mttkrp_configurations
    ]

    for midx in range(len(modes)):
        mode_ax = ax[midx]

        for dic in benchmarks:
            df = dic['data']
            dm = df.loc[df['MODE'] == midx]
            y = dm['FLOPS'] / dm['TIME'] / mfps
            label, color = get_label_and_color(dic['mttkrp_method'], midx,
                                               modes)
            mode_ax.plot(dm['RANK'], y, '-o', label=label, c=color,
                         markersize=markersize, linewidth=linewidth)

        mode_ax.set_ylim([0, 1])
        mode_ax.set_yticks(np.arange(0, 1.1, step=0.2))
        mode_ax.set_ylabel('Efficiency')
        mode_ax.grid(True)
        mode_ax.set_xscale('log')
        mode_ax.set_title('Mode {}'.format(midx))
        mode_ax.tick_params(labelright=True)
        mode_ax.legend(loc='upper left')

    # The x axis is shared, so only the bottom subplot is labelled.
    ax[-1].set_xlabel('Components')
    ax[-1].set_xticks([1, 10, 100, 1000])
    ax[-1].set_xticklabels(['1', '10', '100', '1000'])

    fig.suptitle('CALS Implementations (BLAS: ' + backend + ', Threads: ' +
                 str(threads) + ', Modes: ' + modes_title_string(modes) + ')')
    plt.tight_layout()
    fig.savefig(plot_output_path + 'MTTKRP_Perf_' + backend + '_threads_' +
                str(threads) + '_modes_' + modes_string(modes) + fig_format)
    return fig
def speedup_plot(backend, threads, modes, ax, count):
    """Plot the speedup of CALS over ALS for 1 to 20 components.

    For every ``r`` in 1..20 the data set ``'speedup_<r>'`` is read and the
    ratio ``sum(ALS time) / sum(CALS 'ITERATION')`` is recorded. When
    ``threads == 24`` and the data set holds a CUDA DataFrame, the ratio
    ``sum(ALS time) / max(CUDA 'TOTAL')`` is recorded as well and drawn as a
    second curve on a logarithmic y axis.

    Args:
        backend: BLAS backend name forwarded to ``read_data``.
        threads: Number of threads of the measured runs.
        modes: Tensor mode sizes (tuple); also selects which subplot gets the
            x ticks/label and the legend.
        ax: Matplotlib Axes to draw on.
        count: Index into the file-level ``colors`` sequence for the CPU curve.

    Returns:
        The Axes that was passed in.
    """
    x = np.arange(1, 21, 1)
    y = []
    # CUDA speedups together with the ranks they belong to, so the curve stays
    # plottable even if CUDA data is only present for some of the ranks.
    xc = []
    yc = []
    for r in range(1, 21):
        dic = read_data(backend, threads, modes, 'speedup_{}'.format(r))
        cals_cuda = dic['calscudadata']

        tcals = dic['calsdata']['ITERATION'].sum()
        logger.info("Total CALS time: %s", tcals)

        ttime, _, _, _, _ = extract_als_data(dic['alsdata'])
        tals = np.sum(ttime)
        logger.info("Total ALS time: %s", tals)
        y.append(tals / tcals)

        if threads == 24 and isinstance(cals_cuda, pd.DataFrame):
            xc.append(r)
            yc.append(tals / cals_cuda['TOTAL'].max())

    label = '1 thread' if threads == 1 else '{} threads'.format(threads)

    if yc:
        # CUDA speedups span orders of magnitude: fixed logarithmic axis.
        log_ticks = [1, 10, 100]
        ax.set_yscale('log')
        ax.set_yticks(log_ticks)
        ax.set_yticklabels([str(i) for i in log_ticks])
        ax.set_ylim([0.9, 110])
    else:
        top = np.max(np.array(y))
        yticks = np.array([1] + list(np.arange(3, top + 3, 2)))
        # Several curves share this Axes; only ever grow the tick range.
        if np.max(yticks) > np.max(np.array(ax.get_yticks())):
            ax.set_yticks(yticks)
            ax.set_yticklabels([str(i) for i in list(yticks)])
            ax.set_ylim([0, top + 0.1 * top])
    ax.set_ylabel('Speedup')

    if modes == (300, 300, 300):
        xticks = [1, 5, 10, 15, 20]
        ax.set_xticks(xticks)
        ax.set_xticklabels([str(i) for i in xticks])
        ax.set_xlabel('Components')
    ax.set_title(modes_title_string(dic['modes']))
    # The visibility flag is passed positionally: its keyword was renamed from
    # `b` to `visible` in Matplotlib 3.5, so the positional form is portable.
    ax.grid(True, which='both', axis='y')

    ax.plot(x, y, '-o', color=colors[count], label=label,
            markersize=markersize, linewidth=linewidth)
    if yc:
        ax.plot(xc, yc, '-o', color='C2', label='CUDA',
                markersize=markersize, linewidth=linewidth)
    if modes == (100, 100, 100):
        ax.legend(ncol=2)

    return ax
def performance_plot_both(dic, ax=None, print_all=False):
    """Plot the efficiency of CALS, ALS, OMP ALS and TTB over the computation.

    The x axis is the cumulative FLOP count; the y axis is
    ``FLOPS / time / mfps`` with ``mfps = CPU_FPS[threads]``. The thread count
    is taken from the first ``'NUM_THREADS'`` entry of the CALS data. A summary
    of the FLOP counts and timings is printed to stdout.

    Args:
        dic: Result dictionary. The keys read here are ``'alsdata'``,
            ``'alsompdata'``, ``'calsdata'``, ``'ttbdata'`` and ``'modes'``;
            ``'backend'`` and ``'threads'`` are read only when the figure is
            saved.
        ax: Axes to draw on. When ``None`` a new figure is created and saved
            to ``plot_output_path`` at the end.
        print_all: Force legend, y label, x tick labels and title regardless
            of the modes/threads-specific layout rules below.
    """
    als_df = dic['alsdata']
    als_omp = dic['alsompdata']
    cals_df = dic['calsdata']
    ttb_l = dic['ttbdata']

    # Remember whether this call owns the figure: `ax` is rebound below, so it
    # cannot be used afterwards to decide whether the plot has to be saved.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1)
        fig.set_size_inches(w=4.68596, h=3.5)

    threads = str(cals_df['NUM_THREADS'][0])
    mfps = CPU_FPS[threads]
    gemm = GEMM[modes_string(dic['modes'])][threads]
    modes = dic['modes']

    ttime, _, flops, _, ittime = extract_als_data(als_df)
    flop_cumsum_als = np.cumsum(flops)

    ttime_omp, _, flops_omp, _, _ = extract_als_data(als_omp)
    flop_cumsum_als_omp = np.cumsum(flops_omp)

    flop_cumsum_cals = cals_df['FLOPS'].cumsum()

    # NOTE(review): the "Total" / "Iteration sum" labels are kept exactly as
    # they were; confirm they are attached to the intended quantities.
    print()
    print(
        'CALS Flops: {:>14}, Total: {:>8.2f}, Iteration sum: {:>8.2f}'.format(
            list(flop_cumsum_cals)[-1], cals_df['ITERATION'].sum(),
            cals_df['TOTAL'].max()))
    print('OALS Flops: {:>14}, Total: {:>8.2f}'.format(
        list(flop_cumsum_als_omp)[-1], ttime_omp.max()))
    print(
        ' ALS Flops: {:>14}, Total: {:>8.2f}, Iteration sum: {:>8.2f}'.format(
            list(flop_cumsum_als)[-1], ittime.sum(), ttime.sum()))
    print()

    ax.step(flop_cumsum_cals, cals_df['FLOPS'] / cals_df['ITERATION'] / mfps,
            '-', label='CALS', color='C0', markersize=markersize,
            linewidth=linewidth)

    print('{} {} {} {}'.format(flops_omp[-1], ttime_omp.max(), mfps,
                               flops_omp[-1] / ttime_omp.max() / mfps))
    if threads != '1':
        # OMP ALS is drawn as one horizontal line at its overall efficiency.
        val = flop_cumsum_als_omp[-1] / ttime_omp.max() / mfps
        ax.step([flop_cumsum_als_omp[0], flop_cumsum_als_omp[-1]], [val, val],
                '-', label='OMP ALS', color='C6', markersize=markersize,
                linewidth=linewidth)

    ax.step(flop_cumsum_als, flops / ttime / mfps, '-', label='ALS',
            color='C1', markersize=markersize, linewidth=linewidth)

    if ttb_l:
        ax.step(flop_cumsum_als, flops / np.array(ttb_l) / mfps, '-',
                label='TTB', color='C4', markersize=markersize,
                linewidth=linewidth)

    plot_gemm(gemm, ax, flop_cumsum_als)

    # Plot total distance as xticks
    flop_cumsum_cals = np.array(flop_cumsum_cals)
    total_flops = flop_cumsum_cals[-1]
    ax.set_xticks([0, 0.33 * total_flops, 0.66 * total_flops, total_flops])
    if threads == '24' or print_all:
        ax.set_xticklabels(['0', '.33', '.66', '1'])

    plot_x_ranks(ax, als_df)

    if (modes in ((100, 100, 100), (299, 301, 41))
            and threads == '12') or print_all:
        ax.legend()

    # The original condition was `A or (A and threads == '1')`, which is
    # logically just `A`.
    if modes == (100, 100, 100) or print_all:
        ax.set_ylabel('Efficiency (Threads: {})'.format(threads))
    else:
        ax.tick_params(labelleft=False, left=True)

    if modes == (299, 301, 41):
        ax.set_ylabel('Efficiency (Threads: {})'.format(threads))
        ax.tick_params(labelleft=True, left=True)
    elif threads == '24':
        ax.set_xlabel('Total computation')

    if threads == "1" or print_all:
        ax.set_title(modes_title_string(modes))

    ax.set_xlim([-0.02 * total_flops, total_flops + 0.02 * total_flops])
    ax.set_ylim([0, 1])
    ax.set_yticks(ticks=np.arange(0, 1.1, step=0.1))
    ax.grid(True, axis='y')

    # Save only when the figure was created here. The previous check,
    # `if ax is None`, was unreachable because `ax` had been reassigned.
    if fig is not None:
        fig.savefig(plot_output_path + 'ALS_v_CALS_' + dic['backend'] +
                    '_modes_' + modes_string(modes) + '_threads_' +
                    str(dic['threads']) + fig_format)
def plot_best_mttkrp(backend, modes, threads, best, ax=None, c=None,
                     f_ylabel=True, f_xlabel=True, f_legend=True, gpu=False):
    """Plot the efficiency of the best MTTKRP, accumulated over all modes.

    ``best`` holds, for every mode, a sequence of
    ``(rank, flops, time, method)`` tuples. FLOPs and times are summed
    element-wise over the modes and the efficiency
    ``sum(flops) / sum(time) / mfps`` is plotted against the ranks of the
    first mode. ``mfps`` is ``CPU_FPS[str(threads)]``, or ``7e12`` when ``gpu``
    is set.

    Args:
        backend: BLAS backend name (used in the output file name only).
        modes: Tensor mode sizes; used for the title and the file name.
        threads: Number of threads of the measured runs.
        best: Non-empty sequence with one entry per mode (see above).
        ax: Axes to draw on. When ``None`` a new figure is created and saved
            to ``plot_output_path``.
        c: Line colour forwarded to ``ax.plot``.
        f_ylabel: Whether to set the y label.
        f_xlabel: Whether to set the x tick labels and the x label.
        f_legend: Whether to draw the legend.
        gpu: Normalise by the GPU rate instead of the CPU rate.

    Returns:
        The Axes that was drawn on.

    Raises:
        ValueError: If ``best`` is empty.
    """
    mfps = CPU_FPS[str(threads)]
    if gpu:
        mfps = 7e12

    per_mode = list(best)
    if not per_mode:
        raise ValueError('best must contain the data of at least one mode')

    # Accumulate the FLOPS and the TIME per mode, in order to calculate the
    # efficiency of MTTKRP for all modes. Out-of-place addition lets NumPy
    # promote the dtype if integer and float columns are mixed.
    x = np.array([rank for rank, _, _, _ in per_mode[0]])
    y_time = 0
    y_flops = 0
    for data in per_mode:
        y_time = y_time + np.array([time for _, _, time, _ in data])
        y_flops = y_flops + np.array([flops for _, flops, _, _ in data])
    y = y_flops / y_time / mfps

    # Remove rank values from 11 to 19 (to conform to log scale)
    keep = ~np.isin(x, np.arange(11, 20))
    x, y = x[keep], y[keep]

    label = '1 thread' if threads == 1 else '{} threads'.format(threads)

    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    ax.plot(x, y, '-o', color=c, label=label, markersize=markersize,
            linewidth=linewidth)

    ###################
    # Figure formatting
    ###################
    # y axis
    ax.set_ylim([0, 1])
    ax.set_yticks(np.arange(0, 1.1, step=0.2))
    if f_ylabel:
        ax.set_ylabel('Efficiency')

    # x axis
    x_ticks = [1, 10, 100, 1000]
    ax.set_xscale('log')
    ax.set_xticks(x_ticks)
    if f_xlabel:
        ax.set_xticklabels([str(i) for i in x_ticks])
        ax.set_xlabel('Components')

    # rest
    ax.tick_params(labelleft=True, left=True)
    # The visibility flag is passed positionally: its keyword was renamed from
    # `b` to `visible` in Matplotlib 3.5, so the positional form is portable.
    ax.grid(True, axis='y')
    if f_legend:
        ax.legend(loc='upper left')
    ax.set_title(modes_title_string(modes))

    # Only a figure created by this call is written to disk.
    if fig is not None:
        fig.savefig(plot_output_path + 'MTTKRP_Best_Benchmark_' + backend +
                    '_threads_' + str(threads) + '_modes_' +
                    modes_string(modes) + fig_format)
    return ax
old_lim[1] += 0.1 * old_lim[1] ax[i].set_ylim(old_lim) for p in ax[i].patches: if p.get_height() < 5: height = round(p.get_height(), 1) else: height = int(round(p.get_height())) if not p.get_height() == 0: ax[i].annotate( str(height), xy=(p.get_x() + p.get_width() / 2, height), xytext=(0, 2), # 3 points vertical offset textcoords="offset points", ha='center', va='bottom') ax[i].set_title(modes_title_string(modes)) ax[i].set_ylabel('Time in seconds') title = 'Single Threaded Execution' if n_threads == 24: if gpu: title = 'Offloading of the MTTKRP to the GPU' else: title = 'Multi Threaded Execution (24 threads)' fig.suptitle(title) ax[-1].set_xlabel('Components') fig.tight_layout() specifier = str(n_threads) if gpu: specifier = 'GPU' plt.savefig(plot_output_path + 'artif_' + specifier + fig_format)
if th == 24 and isinstance(als_omp_cuda, pd.DataFrame): ttime_omp_cu, _, _, _, _ = extract_als_data(als_omp_cuda) df.at['CUDA', 'OMP ALS'] = np.max(ttime_omp_cu) if th == 24 and isinstance(cals_cuda, pd.DataFrame): print('CUDA Iteration - CUDA Total: {:0.3f}'.format( cals_cuda['ITERATION'].sum() - cals_cuda['TOTAL'].max())) df.at['CUDA', 'CALS'] = cals_cuda['ITERATION'].sum() index = ['1 thread', '24 threads', 'CUDA'] df.index = index print(df.to_latex(float_format="{:0.2f}".format, na_rep='-')) df.plot.bar(ax=ax, color=['C4', 'C1', 'C6', 'C0', 'C3'], rot=0) ax.set_title(modes_title_string(modes)) ax.set_ylabel('Time in seconds') # ax.set_yscale('log') # yticks = [1, 10, 100, 1000, 10000] # ax.set_yticks(yticks) # ax.set_yticklabels([str(i) for i in yticks]) # ax.set_ylim([0.9, 110]) old_lim = list(ax.get_ylim()) old_lim[1] += 0.05 * old_lim[1] ax.set_ylim(old_lim) for p in ax.patches: height = int(round(p.get_height())) if not p.get_height() == 0: ax.annotate( str(height),
cals = dic['calsdata'] ttb = dic['ttbdata'] cuda = dic['calscudadata'] # results['ALS'].extend([als['TOTAL'].sum()]) # results['OMP ALS'].extend([alsomp['TOTAL'].max()]) # results['CALS'].extend([cals['ITERATION'].sum()]) results['ALS CUDA'].extend([alscuda['TOTAL'].sum()]) results['OMP ALS CUDA'].extend([alsompcuda['TOTAL'].max()]) if th == 24 and isinstance(cuda, pd.DataFrame): print('CUDA Iteration - CUDA Total: {:0.3f}'.format( cuda['ITERATION'].sum() - cuda['TOTAL'].max())) results['CALS CUDA'].extend([cuda['TOTAL'].max()]) index = [modes_title_string(i) for i in modes_list] df = pd.DataFrame(results, index=index) # df.plot.bar(ax=ax, color=['C1', 'C6', 'C0', '#5fd35f', '#2ca02c', '#165016'], rot=0) df.plot.bar(ax=ax, color=['#5fd35f', '#2ca02c', '#165016'], rot=0) ax.set_ylabel('Time in seconds') old_lim = list(ax.get_ylim()) old_lim[1] += 0.05 * old_lim[1] ax.set_ylim(old_lim) for p in ax.patches: height = round(p.get_height(), 1) if not p.get_height() == 0: ax.annotate( str(height), xy=(p.get_x() + p.get_width() / 2, height), xytext=(0, 1), # 3 points vertical offset textcoords="offset points",