예제 #1
0
def mttkrp_performance_analysis(backend, modes, threads, gpu=False):
    """Plot the efficiency of every MTTKRP method per mode and save the figure.

    One subplot is created per entry of ``modes``. For each MTTKRP method
    (``MTTKRP``, ``TWOSTEP0``, ``TWOSTEP1``) the benchmark rows whose ``MODE``
    column equals the subplot index are plotted as ``FLOPS / TIME / peak``
    against ``RANK`` on a logarithmic x axis.

    Args:
        backend: BLAS backend name; used to locate the benchmark data and in
            the figure title and output file name.
        modes: Sequence of tensor mode sizes; its length sets the number of
            subplots.
        threads: Thread count; selects the CPU peak from ``CPU_FPS`` and is
            forwarded to ``read_bench``.
        gpu: When true, ``GPU_FPS`` is used as the peak instead of the CPU
            value and the flag is forwarded to ``read_bench``.

    Returns:
        The matplotlib figure, which has also been written to
        ``plot_output_path``.
    """
    # squeeze=False always yields a 2-D array of axes, so indexing by mode
    # also works when there is only a single mode (plain subplots() would
    # return a bare Axes object in that case).
    fig, axes = plt.subplots(len(modes), 1, sharex='all', squeeze=False)
    axes = axes[:, 0]
    fig.set_size_inches(w=fig_size_in['width'], h=5)

    mfps = GPU_FPS if gpu else CPU_FPS[str(threads)]
    mttkrp_configurations = [
        MttkrpMethod.MTTKRP, MttkrpMethod.TWOSTEP0, MttkrpMethod.TWOSTEP1
    ]

    # The read_bench arguments are identical for every mode index, so each
    # method's data is loaded once instead of once per (mode, method) pair.
    # NOTE(review): assumes read_bench is a pure read with no per-call state.
    benchmarks = [
        read_bench(backend, threads, modes, method, gpu)
        for method in mttkrp_configurations
    ]

    for midx in range(len(modes)):
        mode_ax = axes[midx]
        for dic in benchmarks:
            df = dic['data']
            dm = df.loc[df['MODE'] == midx]
            y = dm['FLOPS'] / dm['TIME'] / mfps
            label, color = get_label_and_color(dic['mttkrp_method'], midx,
                                               modes)
            mode_ax.plot(dm['RANK'],
                         y,
                         '-o',
                         label=label,
                         c=color,
                         markersize=markersize,
                         linewidth=linewidth)

        # Axis formatting is independent of the plotted method, so it is
        # applied once per subplot after all lines (and their legend labels)
        # are in place.
        mode_ax.set_ylim([0, 1])
        mode_ax.set_yticks(np.arange(0, 1.1, step=0.2))
        mode_ax.set_ylabel('Efficiency')
        mode_ax.grid(True)
        mode_ax.set_xscale('log')
        mode_ax.set_title('Mode {}'.format(midx))
        mode_ax.tick_params(labelright=True)
        mode_ax.legend(loc='upper left')

    # The x axis is shared, so ticks and label are only set on the bottom plot.
    axes[-1].set_xlabel('Components')
    axes[-1].set_xticks([1, 10, 100, 1000])
    axes[-1].set_xticklabels(['1', '10', '100', '1000'])
    fig.suptitle('CALS Implementations (BLAS: ' + backend + ', Threads: ' +
                 str(threads) + ', Modes: ' + modes_title_string(modes) + ')')

    plt.tight_layout()
    fig.savefig(plot_output_path + 'MTTKRP_Perf_' + backend + '_threads_' +
                str(threads) + '_modes_' + modes_string(modes) + fig_format)
    return fig
예제 #2
0
def speedup_plot(backend, threads, modes, ax, count):
    """Plot the speedup of CALS over ALS for 1 to 20 components on ``ax``.

    For every ``r`` in 1..20 the data set ``'speedup_<r>'`` is read and the
    speedup is computed as total ALS time divided by the summed CALS
    ``ITERATION`` time. When ``threads == 24`` and CUDA data is available as
    a DataFrame, a second series (ALS time over the maximum CUDA ``TOTAL``)
    is collected and the y axis is switched to a logarithmic scale.

    Args:
        backend: BLAS backend name, forwarded to ``read_data``.
        threads: Thread count; forwarded to ``read_data`` and used for the
            legend label.
        modes: Tensor mode sizes; forwarded to ``read_data`` and used to
            decide which subplot receives x tick labels and the legend.
        ax: Matplotlib axes to draw on.
        count: Index into the module-level ``colors`` sequence for the CPU
            series.

    Returns:
        The axes that were drawn on.
    """
    x = np.arange(1, 21, 1)
    y = []
    yc = []
    for r in range(1, 21):
        dic = read_data(backend, threads, modes, 'speedup_{}'.format(r))

        als = dic['alsdata']
        cals = dic['calsdata']
        cals_cuda = dic['calscudadata']

        tcals = cals['ITERATION'].sum()
        logger.info("Total CALS time: %s", tcals)

        ttime, _, _, _, _ = extract_als_data(als)
        tals = np.sum(ttime)
        logger.info("Total ALS time: %s", tals)

        y.append(tals / tcals)

        if threads == 24 and isinstance(cals_cuda, pd.DataFrame):
            tccals = cals_cuda['TOTAL'].max()
            yc.append(tals / tccals)

    label = '1 thread' if threads == 1 else '{} threads'.format(threads)

    if not yc:
        # Linear axis: ticks at 1, 3, 5, ... up to just past the largest
        # speedup. The axis is only widened, never shrunk, so that several
        # series drawn on the same axes keep the largest range.
        top = np.max(np.array(y))
        yticks = np.array([1] + list(np.arange(3, top + 3, 2)))
        max_y = np.max(np.array(ax.get_yticks()))
        if np.max(yticks) > max_y:
            ax.set_yticks(yticks)
            ax.set_yticklabels([str(i) for i in list(yticks)])
            ax.set_ylim([0, top + 0.1 * top])
            ax.set_ylabel('Speedup')
    else:
        # CUDA series present: fixed logarithmic axis.
        yticks = [1, 10, 100]
        ax.set_yscale('log')
        ax.set_yticks(yticks)
        ax.set_yticklabels([str(i) for i in yticks])
        ax.set_ylim([0.9, 110])
        ax.set_ylabel('Speedup')

    if modes == (300, 300, 300):
        xticks = [1, 5, 10, 15, 20]
        ax.set_xticks(xticks)
        ax.set_xticklabels([str(i) for i in xticks])
        ax.set_xlabel('Components')

    # 'dic' is the data set read in the last loop iteration.
    ax.set_title(modes_title_string(dic['modes']))

    # The visibility flag is passed positionally: the 'b' keyword was
    # deprecated in Matplotlib 3.5 (renamed to 'visible') and later removed,
    # while the positional form works with both old and new releases.
    ax.grid(True, which='both', axis='y')
    ax.plot(x,
            y,
            '-o',
            color=colors[count],
            label=label,
            markersize=markersize,
            linewidth=linewidth)
    if yc:
        ax.plot(x,
                yc,
                '-o',
                color='C2',
                label='CUDA',
                markersize=markersize,
                linewidth=linewidth)

    if modes == (100, 100, 100):
        ax.legend(ncol=2)

    return ax
예제 #3
0
def performance_plot_both(dic, ax=None, print_all=False):
    """Plot the efficiency of CALS, ALS, OMP ALS and (optionally) TTB.

    Efficiency is ``FLOPS / time / peak`` with the peak taken from
    ``CPU_FPS`` for the thread count found in the CALS data. The x axis is
    the cumulative FLOP count, labelled as a fraction of the CALS total.

    Args:
        dic: Experiment data. The keys read here are ``'alsdata'``,
            ``'alsompdata'``, ``'calsdata'``, ``'ttbdata'`` and ``'modes'``;
            ``'backend'`` and ``'threads'`` are additionally read when the
            figure is saved.
        ax: Axes to draw on. When ``None`` a new figure is created and saved
            to ``plot_output_path`` after plotting.
        print_all: Force tick labels, legend, y label and title regardless of
            the mode/thread combination.
    """
    als_df = dic['alsdata']
    als_omp = dic['alsompdata']
    cals_df = dic['calsdata']
    ttb_l = dic['ttbdata']
    modes = dic['modes']

    # Remember whether the figure was created here: 'ax' is rebound below, so
    # it cannot be used later to decide whether the plot has to be saved.
    own_fig = None
    if ax is None:
        own_fig, ax = plt.subplots(1, 1)
        own_fig.set_size_inches(w=4.68596, h=3.5)

    threads = str(cals_df['NUM_THREADS'][0])

    mfps = CPU_FPS[threads]
    gemm = GEMM[modes_string(modes)][threads]

    ttime, _, flops, _, ittime = extract_als_data(als_df)
    flop_cumsum_als = np.cumsum(flops)

    ttime_omp, _, flops_omp, _, _ = extract_als_data(als_omp)
    flop_cumsum_als_omp = np.cumsum(flops_omp)

    flop_cumsum_cals = cals_df['FLOPS'].cumsum()

    # NOTE(review): the "Total" / "Iteration sum" labels below look swapped
    # relative to the values passed in; output is kept unchanged - confirm.
    print()
    print(
        'CALS Flops: {:>14}, Total: {:>8.2f}, Iteration sum: {:>8.2f}'.format(
            list(flop_cumsum_cals)[-1], cals_df['ITERATION'].sum(),
            cals_df['TOTAL'].max()))
    print('OALS Flops: {:>14}, Total: {:>8.2f}'.format(
        list(flop_cumsum_als_omp)[-1], ttime_omp.max()))
    print(
        ' ALS Flops: {:>14}, Total: {:>8.2f}, Iteration sum: {:>8.2f}'.format(
            list(flop_cumsum_als)[-1], ittime.sum(), ttime.sum()))
    print()

    ax.step(flop_cumsum_cals,
            cals_df['FLOPS'] / cals_df['ITERATION'] / mfps,
            '-',
            label='CALS',
            color='C0',
            markersize=markersize,
            linewidth=linewidth)

    print('{} {} {} {}'.format(flops_omp[-1], ttime_omp.max(), mfps,
                               flops_omp[-1] / ttime_omp.max() / mfps))

    if threads != '1':
        # OMP ALS is drawn as one flat line at its overall efficiency.
        val = flop_cumsum_als_omp[-1] / ttime_omp.max() / mfps
        ax.step([flop_cumsum_als_omp[0], flop_cumsum_als_omp[-1]], [val, val],
                '-',
                label='OMP ALS',
                color='C6',
                markersize=markersize,
                linewidth=linewidth)

    ax.step(flop_cumsum_als,
            flops / ttime / mfps,
            '-',
            label='ALS',
            color='C1',
            markersize=markersize,
            linewidth=linewidth)

    if ttb_l:
        ax.step(flop_cumsum_als,
                flops / np.array(ttb_l) / mfps,
                '-',
                label='TTB',
                color='C4',
                markersize=markersize,
                linewidth=linewidth)

    plot_gemm(gemm, ax, flop_cumsum_als)

    # Tick the x axis at fractions of the total CALS FLOP count.
    flop_cumsum_cals = np.array(flop_cumsum_cals)
    total_flops = flop_cumsum_cals[-1]
    ax.set_xticks([0, 0.33 * total_flops, 0.66 * total_flops, total_flops])

    if threads == '24' or print_all:
        ax.set_xticklabels(['0', '.33', '.66', '1'])

    plot_x_ranks(ax, als_df)

    if ((modes == (100, 100, 100) or modes == (299, 301, 41)) and
            threads == '12') or print_all:
        ax.legend()

    # The original test also had a clause "(100, 100, 100) and threads == '1'",
    # which is implied by the first clause and has been dropped.
    if modes == (100, 100, 100) or print_all:
        ax.set_ylabel('Efficiency (Threads: {})'.format(threads))
    else:
        ax.tick_params(labelleft=False, left=True)

    if modes == (299, 301, 41):
        ax.set_ylabel('Efficiency (Threads: {})'.format(threads))
        ax.tick_params(labelleft=True, left=True)
    elif threads == '24':
        ax.set_xlabel('Total computation')

    if threads == "1" or print_all:
        ax.set_title(modes_title_string(modes))

    ax.set_xlim([-0.02 * total_flops, total_flops + 0.02 * total_flops])

    ax.set_ylim([0, 1])
    ax.set_yticks(ticks=np.arange(0, 1.1, step=0.1))

    ax.grid(True, axis='y')

    # Save only when the figure was created by this call. The original test
    # here ("ax is None") could never be true because 'ax' had already been
    # rebound, so the figure was silently never written.
    if own_fig is not None:
        own_fig.savefig(plot_output_path + 'ALS_v_CALS_' + dic['backend'] +
                        '_modes_' + modes_string(modes) + '_threads_' +
                        str(dic['threads']) + fig_format)
예제 #4
0
def plot_best_mttkrp(backend,
                     modes,
                     threads,
                     best,
                     ax=None,
                     c=None,
                     f_ylabel=True,
                     f_xlabel=True,
                     f_legend=True,
                     gpu=False):
    """Plot the combined efficiency of the best MTTKRP results over all modes.

    FLOPS and time are summed element-wise across the entries of ``best`` and
    the efficiency ``sum(FLOPS) / sum(TIME) / peak`` is plotted against the
    rank on a logarithmic x axis.

    Args:
        backend: BLAS backend name; only used in the output file name.
        modes: Tensor mode sizes; used for the title and output file name.
        threads: Thread count; selects the CPU peak from ``CPU_FPS`` and is
            used in the legend label and output file name.
        best: Non-empty iterable with one sequence per mode, each holding
            ``(rank, flops, time, method)`` tuples. The rank values of the
            first entry are used as x coordinates.
        ax: Axes to draw on. When ``None`` a new figure is created and saved
            to ``plot_output_path``.
        c: Line colour passed to ``ax.plot``.
        f_ylabel: Whether to set the y axis label.
        f_xlabel: Whether to set the x tick labels and x axis label.
        f_legend: Whether to draw the legend.
        gpu: When true, a fixed peak of ``7e12`` is used instead of the CPU
            peak.

    Returns:
        The axes that were drawn on.

    Raises:
        ValueError: If ``best`` is empty.
    """
    # Only consult CPU_FPS on the CPU path, so a GPU run does not depend on
    # the thread count being present in that table.
    mfps = 7e12 if gpu else CPU_FPS[str(threads)]

    # Accumulate the FLOPS and the TIME per mode, in order to calculate the
    # efficiency of MTTKRP for all modes.
    x = y_time = y_flops = None
    for data in best:
        times = np.array([time for _, _, time, _ in data])
        flop_counts = np.array([flops for _, flops, _, _ in data])
        if x is None:
            x = np.array([rank for rank, _, _, _ in data])
            y_time, y_flops = times, flop_counts
        else:
            # Plain addition (not +=) so that mixing integer and float
            # columns promotes the dtype instead of raising a casting error.
            y_time = y_time + times
            y_flops = y_flops + flop_counts
    if x is None:
        raise ValueError('best must contain at least one mode')
    y = y_flops / y_time / mfps

    # Remove rank values from 11 to 19 (to conform to log scale)
    keep = ~np.isin(x, np.arange(11, 20))
    x, y = x[keep], y[keep]

    label = '1 thread' if threads == 1 else '{} threads'.format(threads)

    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    ax.plot(x,
            y,
            '-o',
            color=c,
            label=label,
            markersize=markersize,
            linewidth=linewidth)

    ###################
    # Figure formatting
    ###################

    # y axis
    ax.set_ylim([0, 1])
    ax.set_yticks(np.arange(0, 1.1, step=0.2))
    if f_ylabel:
        ax.set_ylabel('Efficiency')

    # x axis
    x_ticks = [1, 10, 100, 1000]
    ax.set_xscale('log')
    ax.set_xticks(x_ticks)

    if f_xlabel:
        ax.set_xticklabels([str(i) for i in x_ticks])
        ax.set_xlabel('Components')

    # rest
    ax.tick_params(labelleft=True, left=True)
    # The visibility flag is passed positionally: the 'b' keyword was
    # deprecated in Matplotlib 3.5 (renamed to 'visible') and later removed,
    # while the positional form works with both old and new releases.
    ax.grid(True, axis='y')

    if f_legend:
        ax.legend(loc='upper left')

    ax.set_title(modes_title_string(modes))
    # Only a figure created by this call is written to disk.
    if fig:
        fig.savefig(plot_output_path + 'MTTKRP_Best_Benchmark_' + backend +
                    '_threads_' + str(threads) + '_modes_' +
                    modes_string(modes) + fig_format)
    return ax
예제 #5
0
            old_lim[1] += 0.1 * old_lim[1]
            ax[i].set_ylim(old_lim)
            for p in ax[i].patches:
                if p.get_height() < 5:
                    height = round(p.get_height(), 1)
                else:
                    height = int(round(p.get_height()))
                if not p.get_height() == 0:
                    ax[i].annotate(
                        str(height),
                        xy=(p.get_x() + p.get_width() / 2, height),
                        xytext=(0, 2),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center',
                        va='bottom')
            ax[i].set_title(modes_title_string(modes))
            ax[i].set_ylabel('Time in seconds')
        title = 'Single Threaded Execution'
        if n_threads == 24:
            if gpu:
                title = 'Offloading of the MTTKRP to the GPU'
            else:
                title = 'Multi Threaded Execution (24 threads)'
        fig.suptitle(title)
        ax[-1].set_xlabel('Components')
        fig.tight_layout()
        specifier = str(n_threads)
        if gpu:
            specifier = 'GPU'
        plt.savefig(plot_output_path + 'artif_' + specifier + fig_format)
예제 #6
0
            if th == 24 and isinstance(als_omp_cuda, pd.DataFrame):
                ttime_omp_cu, _, _, _, _ = extract_als_data(als_omp_cuda)
                df.at['CUDA', 'OMP ALS'] = np.max(ttime_omp_cu)

            if th == 24 and isinstance(cals_cuda, pd.DataFrame):
                print('CUDA Iteration - CUDA Total: {:0.3f}'.format(
                    cals_cuda['ITERATION'].sum() - cals_cuda['TOTAL'].max()))
                df.at['CUDA', 'CALS'] = cals_cuda['ITERATION'].sum()

        index = ['1 thread', '24 threads', 'CUDA']
        df.index = index

        print(df.to_latex(float_format="{:0.2f}".format, na_rep='-'))
        df.plot.bar(ax=ax, color=['C4', 'C1', 'C6', 'C0', 'C3'], rot=0)
        ax.set_title(modes_title_string(modes))
        ax.set_ylabel('Time in seconds')
        # ax.set_yscale('log')
        # yticks = [1, 10, 100, 1000, 10000]
        # ax.set_yticks(yticks)
        # ax.set_yticklabels([str(i) for i in yticks])
        # ax.set_ylim([0.9, 110])

        old_lim = list(ax.get_ylim())
        old_lim[1] += 0.05 * old_lim[1]
        ax.set_ylim(old_lim)
        for p in ax.patches:
            height = int(round(p.get_height()))
            if not p.get_height() == 0:
                ax.annotate(
                    str(height),
예제 #7
0
            cals = dic['calsdata']
            ttb = dic['ttbdata']
            cuda = dic['calscudadata']

            # results['ALS'].extend([als['TOTAL'].sum()])
            # results['OMP ALS'].extend([alsomp['TOTAL'].max()])
            # results['CALS'].extend([cals['ITERATION'].sum()])

            results['ALS CUDA'].extend([alscuda['TOTAL'].sum()])
            results['OMP ALS CUDA'].extend([alsompcuda['TOTAL'].max()])
            if th == 24 and isinstance(cuda, pd.DataFrame):
                print('CUDA Iteration - CUDA Total: {:0.3f}'.format(
                    cuda['ITERATION'].sum() - cuda['TOTAL'].max()))
                results['CALS CUDA'].extend([cuda['TOTAL'].max()])

    index = [modes_title_string(i) for i in modes_list]
    df = pd.DataFrame(results, index=index)
    # df.plot.bar(ax=ax, color=['C1', 'C6', 'C0', '#5fd35f', '#2ca02c', '#165016'], rot=0)
    df.plot.bar(ax=ax, color=['#5fd35f', '#2ca02c', '#165016'], rot=0)
    ax.set_ylabel('Time in seconds')
    old_lim = list(ax.get_ylim())
    old_lim[1] += 0.05 * old_lim[1]
    ax.set_ylim(old_lim)
    for p in ax.patches:
        height = round(p.get_height(), 1)
        if not p.get_height() == 0:
            ax.annotate(
                str(height),
                xy=(p.get_x() + p.get_width() / 2, height),
                xytext=(0, 1),  # 3 points vertical offset
                textcoords="offset points",