예제 #1
0
def update(meta = None):
    if not meta: return

    #Job.objects.all().delete()

    # Only need to populate lariat cache once
    jobid = meta.json.keys()[0]

    ld = lariat_utils.LariatData(jobid,
                                 end_epoch = meta.json[jobid]['end_epoch'],
                                 directory = sys_path_append.lariat_path,
                                 daysback = 2)
        
    for jobid, json in meta.json.iteritems():

        if Job.objects.filter(id = jobid).exists(): continue  
        ld = lariat_utils.LariatData(jobid,
                                     olddata = ld.ld)
        json['user'] = ld.user
        json['exe'] = ld.exc.split('/')[-1]
        json['cwd'] = ld.cwd
        json['run_time'] = meta.json[jobid]['end_epoch'] - meta.json[jobid]['start_epoch']
        json['threads'] = ld.threads
        try:
            job_model, created = Job.objects.get_or_create(**json) 
        except:
            print "Something wrong with json",json
    return 
예제 #2
0
def ls4_update(meta=None):
    if not meta: return

    #LS4Job.objects.all().delete()

    # Only need to populate lariat cache once
    jobid = meta.json.keys()[0]

    ld = lariat_utils.LariatData(jobid,
                                 end_epoch=meta.json[jobid]['end_epoch'],
                                 directory=sys_path_append.lariat_path,
                                 daysback=2)

    for jobid, json in meta.json.iteritems():

        if LS4Job.objects.filter(id=jobid).exists(): continue
        ld = lariat_utils.LariatData(jobid, olddata=ld.ld)

        if json['exit_status'] != 0: json['status'] = 'TIMEOUT/CANCELLED'
        else: json['status'] = 'COMPLETED'
        if json['failed'] != 0: json['status'] = 'FAILED'

        json['nodes'] = str(int(json['slots']) / 12)
        json['cores'] = str(
            int(json['granted_pe'].rstrip('way')) * int(json['nodes']))
        json['run_time'] = meta.json[jobid]['end_epoch'] - meta.json[jobid][
            'start_epoch']

        jsondb = {}
        jsondb['id'] = json['id']
        jsondb['project'] = json['account']
        jsondb['start_time'] = json['start_time']
        jsondb['end_time'] = json['end_time']
        jsondb['start_epoch'] = json['start_epoch']
        jsondb['end_epoch'] = json['end_epoch']
        jsondb['run_time'] = json['run_time']
        jsondb['queue'] = json['queue']
        jsondb['name'] = json['name']
        jsondb['status'] = json['status']
        jsondb['nodes'] = json['nodes']
        jsondb['cores'] = json['cores']
        jsondb['path'] = json['path']
        jsondb['date'] = json['date']
        jsondb['user'] = json['owner']

        # LD
        jsondb['exe'] = ld.exc.split('/')[-1]
        jsondb['cwd'] = ld.cwd
        jsondb['threads'] = ld.threads

        try:
            job_model, created = LS4Job.objects.get_or_create(**jsondb)
        except:
            print "Something wrong with json", jsondb
    return
예제 #3
0
파일: gen.py 프로젝트: tdrjnr/tacc_stats
 def __init__(self,jobid,k1,k2,aggregate=True,stats=None):
     """Build the ts (time-series) and ld (lariat) objects for a job.

     jobid     -- job identifier; also used in the EOF error message
     k1, k2    -- key lists selecting which stats to load
     aggregate -- load summed series (TSPLSum) when True, raw (TSPLBase)
                  otherwise
     stats     -- optional pre-loaded job data forwarded to tspl

     Parse failures (TSPLException, EOFError) are swallowed; callers must
     be prepared for ``self.ts`` to be missing afterwards.
     """
     ## Build ts and ld object for a job  
     
     self.k1=k1
     self.k2=k2
     self.jobid=jobid
     self.aggregate=aggregate
     
     try:
         if self.aggregate:
             self.ts=tspl.TSPLSum(jobid,self.k1,self.k2,job_data=stats)
         else:
             self.ts=tspl.TSPLBase(jobid,self.k1,self.k2,job_data=stats)
             
         # NOTE(review): assumes an ``ld`` attribute already exists (e.g. a
         # class-level default); otherwise this raises AttributeError, which
         # is NOT caught by the handlers below -- confirm the class body.
         if not self.ld:
             self.ld=lariat_utils.LariatData()
         
         # Fill/refresh the lariat record for this job.
         self.ld.get_job(self.ts.j.id,
                         end_epoch=self.ts.j.end_time,
                         daysback=3,
                         directory=lariat_path)
         return
     except tspl.TSPLException as e:
         return
     except EOFError as e:
         print 'End of file found reading: ' + jobid
         return
예제 #4
0
def do_check(f, jobs):
    """Record the executable path of the job in file *f* into *jobs*.

    Entries are keyed by job id.  Files that fail to parse, or jobs that
    do not pass the sanity check (at least one hour long, 1-32 way), are
    silently skipped.
    """
    try:
        series = tspl.TSPLSum(f, ['amd64_core'], ['SSE_FLOPS'])
    except tspl.TSPLException:
        return

    # Require >= 1 hour of runtime and a wayness in 1..32.
    if not tspl_utils.checkjob(series, 3600, range(1, 33)):
        return

    lariat = lariat_utils.LariatData(series.j.id, series.j.end_time,
                                     analyze_conf.lariat_path)
    jobs[series.j.id] = lariat.exc
예제 #5
0
def getcode(file, code, output_dir):
    try:
        ts = tspl.TSPLBase(file, ['lnet'], ['rx_bytes'])
    except tspl.TSPLException as e:
        return

    ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time,
                                 analyze_conf.lariat_path)

    ename = ld.exc.split('/')[-1]
    ename = ld.comp_name(ename, ld.equiv_patterns)

    if ename == code:
        print ts.j.id, ename, ts.wayness
        masterplot.master_plot(file,
                               output_dir=output_dir,
                               mintime=1,
                               wayness=ts.wayness)
예제 #6
0
def plot_ratios(ts, tmid, ratio, ratio2, rate, var, fig, ax, full):
    """Plot imbalance ratios (top panel) and per-host rates (bottom panel)
    for one job, then save the figure.

    ts     -- time-series object (provides j.id, j.end_time, owner, ...)
    tmid   -- interval midpoint times, in seconds
    ratio  -- std-dev-based imbalance ratio per interval
    ratio2 -- max-diff-based imbalance ratio per interval
    rate   -- iterable of per-host rate arrays
    var    -- scalar imbalance metric shown in the figure title
    full   -- suffix appended to the output filename
    """
    # Compute y-axis min and max, expand the limits by 10%
    ymin = min(numpy.minimum(ratio, ratio2))
    ymax = max(numpy.maximum(ratio, ratio2))
    ymin, ymax = tspl_utils.expand_range(ymin, ymax, 0.1)

    # Lariat data is only used here to add the executable name to the title.
    ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time,
                                 analyze_conf.lariat_path)

    print '---------------------'
    ax[0].plot(tmid / 3600, ratio)
    ax[0].hold = True
    ax[0].plot(tmid / 3600, ratio2)
    ax[0].legend(('Std Dev', 'Max Diff'), loc=4)
    ax[1].hold = True
    ymin1 = 0.  # This is wrong in general, but we don't want the min to be > 0.
    ymax1 = 0.
    # Bottom panel: one line per host, tracking the global y-range.
    for v in rate:
        ymin1 = min(ymin1, min(v))
        ymax1 = max(ymax1, max(v))
        ax[1].plot(tmid / 3600, v)

    ymin1, ymax1 = tspl_utils.expand_range(ymin1, ymax1, 0.1)

    # Title: job info, executable (when known) and the imbalance metric V.
    title = ts.title
    if ld.exc != 'unknown':
        title += ', E: ' + ld.exc.split('/')[-1]
    title += ', V: %(V)-8.3g' % {'V': var}
    plt.suptitle(title)
    ax[0].set_xlabel('Time (hr)')
    ax[0].set_ylabel('Imbalance Ratios')
    ax[1].set_xlabel('Time (hr)')
    ax[1].set_ylabel('Total ' + ts.label(ts.k1[0], ts.k2[0]) + '/s')
    ax[0].set_ylim(bottom=ymin, top=ymax)
    ax[1].set_ylim(bottom=ymin1, top=ymax1)

    fname = '_'.join(
        ['graph', ts.j.id, ts.owner, ts.k1[0], ts.k2[0], 'imbalance' + full])
    fig.savefig(fname)
    plt.close()
예제 #7
0
def mem_usage(file):
    try:
        ts = tspl.TSPLSum(file, ['mem'], ['MemUsed'])
    except tspl.TSPLException as e:
        print e
        return []

    ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time,
                                 analyze_conf.lariat_path)
    mem_max = 0.
    for host in ts.j.hosts.keys():
        mem_max = max(numpy.max(ts.data[0][host]), mem_max)

    mem_per_core = mem_max / (1024. * 1024. * 1024. * float(ts.wayness))

    print ts.j.id, ': ', mem_per_core, ts.wayness, ld.threads
    if (int(ts.wayness) * int(ld.threads)) > 16:
        print ts.j.id, 'used more than one thread per core!'

    if (int(ts.wayness)*int(ld.threads)) <= 16 and \
        (int(ts.wayness)*int(ld.threads)) > 0 :
        return [mem_per_core]
    else:
        return []
def compute_imbalance(file, k1, k2, thresh, lariat_dict):
    try:
        ts = tspl.TSPLBase(file, k1, k2)
    except tspl.TSPLException as e:
        return
    except EOFError as e:
        print 'End of file found reading: ' + file
        return

    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600, 16, ignore_qs):  # 1 hour, 16way only
        return
    elif ts.numhosts < 2:  # At least 2 hosts
        print ts.j.id + ': 1 host'
        return

    if lariat_dict == None:
        ld = lariat_utils.LariatData(ts.j.id,
                                     end_epoch=ts.j.end_time,
                                     daysback=3,
                                     directory=analyze_conf.lariat_path)
    else:
        ld = lariat_utils.LariatData(ts.j.id, olddata=lariat_dict)

    if ld.wayness == -1:
        print 'Unknown wayness: ', ts.j.id
        return
    elif ld.wayness != ts.wayness:
        print 'Lariat and TACC Stats disagree about wayness. Skipping: ', ts.j.id
        return

    tmid = (ts.t[:-1] + ts.t[1:]) / 2.0
    rng = range(1, len(tmid))  # Throw out first and last
    tmid = tmid[rng]

    for h in ts.data[0].keys():
        host_data = ts.data[0][h]
        maxval = numpy.zeros(len(rng))
        minval = numpy.ones(len(rng)) * 1e100
        rate = []
        for v in host_data:
            rate.append(numpy.diff(v)[rng] / numpy.diff(ts.t)[rng])
            maxval = numpy.maximum(maxval, rate[-1])
            minval = numpy.minimum(minval, rate[-1])

        vals = []
        mean = []
        std = []
        for j in range(len(rng)):
            vals.append([])
            for v in rate:
                vals[j].append(v[j])
            mean.append(scipy.stats.tmean(vals[j]))
            std.append(scipy.stats.tstd(vals[j]))

        ratio = numpy.divide(std, mean)

        var = scipy.stats.tmean(ratio)

        if abs(var) > thresh:
            print ts.j.id + ': ' + str(var)
            return file
예제 #9
0
def compute_ratio(file, lariat_dict=None):
    try:
        ts = tspl.TSPLSum(file, [
            'intel_snb_imc', 'intel_snb_imc', 'intel_snb', 'intel_snb',
            'intel_snb', 'intel_snb', 'intel_snb'
        ], [
            'CAS_READS', 'CAS_WRITES', 'LOAD_L1D_ALL', 'SIMD_D_256',
            'SSE_D_ALL', 'STALLS', 'CLOCKS_UNHALTED_CORE'
        ])

    except tspl.TSPLException as e:
        return

    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600., range(1, 33), ignore_qs):
        return

    tmid = (ts.t[:-1] + ts.t[1:]) / 2.0

    if lariat_dict == None:
        ld = lariat_utils.LariatData(ts.j.id,
                                     end_epoch=ts.j.end_time,
                                     daysback=3,
                                     directory=analyze_conf.lariat_path)
    else:
        ld = lariat_utils.LariatData(ts.j.id, olddata=lariat_dict)

    if ld.exc == 'unknown' or ld.wayness != ts.wayness:  # try loading older lariat
        ld = lariat_utils.LariatData(ts.j.id,
                                     end_epoch=ts.j.end_time,
                                     daysback=3,
                                     directory=analyze_conf.lariat_path,
                                     olddata=ld.ld)
    if ld.exc == 'unknown' or ld.wayness != ts.wayness:  # Still nothing; return
        return

    read_rate = numpy.zeros_like(tmid)
    write_rate = numpy.zeros_like(tmid)
    l1_rate = numpy.zeros_like(tmid)
    avx_rate = numpy.zeros_like(tmid)
    sse_rate = numpy.zeros_like(tmid)
    stall_rate = numpy.zeros_like(tmid)
    clock_rate = numpy.zeros_like(tmid)

    for host in ts.j.hosts.keys():
        read_rate += numpy.diff(ts.assemble([0], host, 0)) / numpy.diff(ts.t)
        write_rate += numpy.diff(ts.assemble([1], host, 0)) / numpy.diff(ts.t)
        l1_rate += numpy.diff(ts.assemble([2], host, 0)) / numpy.diff(ts.t)
        avx_rate += numpy.diff(ts.assemble([3], host, 0)) / numpy.diff(ts.t)
        sse_rate += numpy.diff(ts.assemble([4], host, 0)) / numpy.diff(ts.t)
        stall_rate += numpy.diff(ts.assemble([5], host, 0)) / numpy.diff(ts.t)
        clock_rate += numpy.diff(ts.assemble([6], host, 0)) / numpy.diff(ts.t)

    if float(ts.numhosts * int(ts.wayness) * int(ld.threads)) == 0:
        print 'No tasks in', ts.j.id, ' skipping'
        return

    read_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    write_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    l1_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    avx_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    sse_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    stall_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    clock_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))

    try:
        data_ratio = (read_rate + write_rate) / l1_rate
    except RuntimeWarning:
        print 'Division by zero, skipping:', ts.j.id
        return
    flops = avx_rate + sse_rate
    try:
        flops_ratio = (flops - numpy.min(flops)) / (numpy.max(flops) -
                                                    numpy.min(flops))
    except RuntimeWarning:
        print 'Division by zero, skipping:', ts.j.id
        return
    try:
        stall_ratio = stall_rate / clock_rate
    except RuntimeWarning:
        print 'Division by zero, skipping:', ts.j.id
        return

    mean_data_ratio = numpy.mean(data_ratio)
    mean_stall_ratio = numpy.mean(stall_ratio)
    mean_mem_rate = numpy.mean(read_rate + write_rate) * 64.0
    if mean_stall_ratio > 1.:
        return
    elif mean_mem_rate > 75. * 1000000000. / 16.:
        return

    ename = ld.exc.split('/')[-1]
    ename = ld.comp_name(ename, ld.equiv_patterns)
    ##  if mean_mem_rate > 2e9: # Put a print in here and investigate bad jobs
    ##    return
    return (ts.j.id, ts.su, ename, mean_data_ratio, mean_stall_ratio,
            mean_mem_rate)
예제 #10
0
def main():
    """Scan job files in parallel and report top executables whose jobs
    show a high CPI.

    NOTE(review): depends on a module-level ``do_work`` helper defined
    elsewhere in this file; results are expected to be 7-tuples
    (f_stall, mem_rate, cpi, ename, jid, user, su).
    """

    parser = argparse.ArgumentParser(description='Look for imbalance between'
                                     'hosts for a pair of keys')
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?',
                        default='jobs')
    parser.add_argument('-p',
                        help='Set number of processes',
                        nargs=1,
                        type=int,
                        default=[1])

    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)

    procs = min(len(filelist), n.p[0])

    # Seed the lariat cache once from the first job file so that every
    # pool worker can reuse it (passed below as lariat_dict=ld.ld).
    job = pickle.load(open(filelist[0]))
    jid = job.id
    epoch = job.end_time

    ld = lariat_utils.LariatData(jid,
                                 end_epoch=epoch,
                                 daysback=3,
                                 directory=analyze_conf.lariat_path)

    if procs < 1:
        print 'Must have at least one file'
        exit(1)

    pool = multiprocessing.Pool(processes=procs)

    partial_work = functools.partial(do_work,
                                     mintime=3600.,
                                     wayness=16,
                                     lariat_dict=ld.ld)

    results = pool.map(partial_work, filelist)

    print len(results)

    # Accumulate total SUs per executable name.
    sus = {}
    for (f_stall, mem_rate, cpi, ename, jid, user, su) in results:
        if f_stall is None:
            continue
        if ename in sus:
            sus[ename] += su
        else:
            sus[ename] = su

    d = collections.Counter(sus)

    # Top 50 executables by SUs.  NOTE(review): Python 2 idiom --
    # zip(...)[0] is not subscriptable in Python 3, and this raises
    # IndexError when there are no valid results.
    enames = zip(*d.most_common(50))[0]

    for k, v in d.most_common(50):
        print k, v

    # Report jobs from the top executables with CPI above 1.0.
    for (f_stall, mem_rate, cpi, ename, jid, user, su) in results:
        if (f_stall is None) or (not ename in enames):
            continue
        cpec = 1. / (1. - f_stall)
        if cpi > 1.0:  # and cpec > 2.0:
            print jid, ename, cpi, cpec, user, sus[ename]
예제 #11
0
def do_compute(file):
    try:
        ts = tspl.TSPLSum(file, [
            'intel_snb_imc', 'intel_snb_imc', 'intel_snb', 'intel_snb',
            'intel_snb', 'intel_snb', 'intel_snb'
        ], [
            'CAS_READS', 'CAS_WRITES', 'LOAD_L1D_ALL', 'SIMD_D_256',
            'SSE_D_ALL', 'STALLS', 'CLOCKS_UNHALTED_CORE'
        ])

    except tspl.TSPLException as e:
        return

    if not tspl_utils.checkjob(ts, 0, 16):
        return
    elif ts.numhosts < 2:
        print ts.j.id + ': 1 host'
        return

    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600., range(1, 33), ignore_qs):
        return

    ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time,
                                 '/scratch/projects/lariatData')
    if ld.exc == 'unknown':
        return

    tmid = (ts.t[:-1] + ts.t[1:]) / 2.0

    read_rate = numpy.zeros_like(tmid)
    write_rate = numpy.zeros_like(tmid)
    l1_rate = numpy.zeros_like(tmid)
    avx_rate = numpy.zeros_like(tmid)
    sse_rate = numpy.zeros_like(tmid)
    stall_rate = numpy.zeros_like(tmid)
    clock_rate = numpy.zeros_like(tmid)

    for host in ts.j.hosts.keys():
        read_rate += numpy.diff(ts.assemble([0], host, 0)) / numpy.diff(ts.t)
        write_rate += numpy.diff(ts.assemble([1], host, 0)) / numpy.diff(ts.t)
        l1_rate += numpy.diff(ts.assemble([2], host, 0)) / numpy.diff(ts.t)
        avx_rate += numpy.diff(ts.assemble([3], host, 0)) / numpy.diff(ts.t)
        sse_rate += numpy.diff(ts.assemble([4], host, 0)) / numpy.diff(ts.t)
        stall_rate += numpy.diff(ts.assemble([5], host, 0)) / numpy.diff(ts.t)
        clock_rate += numpy.diff(ts.assemble([6], host, 0)) / numpy.diff(ts.t)

    read_rate /= ts.numhosts
    write_rate /= ts.numhosts
    l1_rate /= ts.numhosts
    avx_rate /= ts.numhosts
    sse_rate /= ts.numhosts
    stall_rate /= ts.numhosts
    clock_rate /= ts.numhosts

    data_ratio = (read_rate + write_rate) / l1_rate
    flops = avx_rate + sse_rate
    flops_ratio = (flops - numpy.min(flops)) / (numpy.max(flops) -
                                                numpy.min(flops))
    stall_ratio = stall_rate / clock_rate

    mean_data_ratio = numpy.mean(data_ratio)
    mean_stall_ratio = numpy.mean(stall_ratio)
    mean_flops = numpy.mean(flops)

    ename = ld.exc.split('/')[-1]
    ename = ld.comp_name(ename, ld.equiv_patterns)
    mean_mem_rate = numpy.mean(read_rate + write_rate)
    if mean_mem_rate > 2e9:  # Put a print in here and investigate bad jobs
        return

    return ','.join([
        ts.j.id, ts.owner, ename,
        str(mean_mem_rate),
        str(mean_stall_ratio),
        str(mean_data_ratio),
        str(mean_flops)
    ])
예제 #12
0
def main():
    """Group per-job ratio summaries by executable and plot distributions.

    Runs compute_ratio over every job file in a process pool, prints jobs
    that stall heavily despite low memory traffic, then writes three
    figures: box_mdr, msr_v_mdr and msr_v_mem.
    """
    # Flagging thresholds: memory rate at or below mem_rate_thresh combined
    # with a stall ratio above stall_thresh is printed as an outlier below.
    mem_rate_thresh = 0.5 * 75 * 1000000000 / 16
    stall_thresh = 0.5
    parser = argparse.ArgumentParser(description='Correlations')
    parser.add_argument('-p',
                        help='Set number of processes',
                        nargs=1,
                        type=int,
                        default=[1])
    parser.add_argument('-n',
                        help='Set number of executables to catalog',
                        nargs=1,
                        type=int,
                        default=[15])
    parser.add_argument('-s',
                        help='Use SUs instead of job counts',
                        action='store_true')
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?',
                        default='jobs')

    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)

    # Seed the lariat cache once from the first job; workers reuse ld.ld.
    job = pickle.load(open(filelist[0]))
    jid = job.id
    epoch = job.end_time

    ld = lariat_utils.LariatData(jid,
                                 end_epoch=epoch,
                                 daysback=3,
                                 directory=analyze_conf.lariat_path)

    if n.p[0] < 1:
        print 'Must have at least one file'
        exit(1)

    partial_compute = functools.partial(compute_ratio, lariat_dict=ld.ld)

    pool = multiprocessing.Pool(processes=n.p[0])

    res = pool.map(partial_compute, filelist)
    pool.close()
    pool.join()

    # Per-executable arrays: mean data ratio (mdr), mean stall ratio (msr),
    # mean memory rate (mmr), and accumulated SUs.
    mdr = {}
    msr = {}
    mmr = {}
    sus = {}
    for tup in res:
        try:
            (jobid, su, ename, mean_data_ratio, mean_stall_ratio,
             mean_mem_rate) = tup
        except TypeError as e:
            # compute_ratio returns None for skipped jobs; unpacking fails.
            continue
        if ename in mdr:
            mdr[ename] = numpy.append(mdr[ename],
                                      numpy.array([mean_data_ratio]))
            msr[ename] = numpy.append(msr[ename],
                                      numpy.array([mean_stall_ratio]))
            mmr[ename] = numpy.append(mmr[ename], numpy.array([mean_mem_rate]))
            sus[ename] += su
        else:
            mdr[ename] = numpy.array([mean_data_ratio])
            msr[ename] = numpy.array([mean_stall_ratio])
            mmr[ename] = numpy.array([mean_mem_rate])
            sus[ename] = su
        # Outlier: low memory traffic but high stall fraction.
        if (mean_mem_rate <= mem_rate_thresh) and \
           (mean_stall_ratio > stall_thresh) :
            print ename, jobid, mean_mem_rate / 1000000000, mean_stall_ratio

    # Find top codes by SUs
    top_count = {}
    for k in mdr.keys():
        if n.s:
            top_count[k] = sus[k]  # by sus
        else:
            top_count[k] = len(mdr[k])  # by count

    d = collections.Counter(top_count)

    # Keep only the most common executables; data/memory rates are plotted
    # on a log10 scale.
    mdr2 = {}
    msr2 = {}
    mmr2 = {}
    for k, v in d.most_common(n.n[0]):
        print k, v
        mdr2[k] = numpy.log10(mdr[k])
        msr2[k] = msr[k]
        mmr2[k] = numpy.log10(mmr[k])


#  for k in mdr.keys():
#    if len(mdr[k]) < 5:
#      continue
#    mdr2[k]=mdr[k]

    # Box widths scale with each executable's popularity.
    x = [top_count[k] for k in mdr2.keys()]

    l = len(mdr2.keys())
    y = numpy.linspace(0.10, 0.95, l)
    widths = numpy.interp(x, numpy.linspace(5.0, float(max(x)), l), y)

    # Figure 1: box plot of log data ratio per executable.
    fig, ax = plt.subplots(1, 1, figsize=(8, 8), dpi=80)
    plt.subplots_adjust(hspace=0.35, bottom=0.25)

    ax.boxplot(mdr2.values(), widths=widths)
    xtickNames = plt.setp(ax, xticklabels=mdr2.keys())
    plt.setp(xtickNames, rotation=45, fontsize=8)
    ax.set_ylabel(r'log(DRAM BW/L1 Fill Rate)')

    fname = 'box_mdr'
    fig.savefig(fname)
    plt.close()

    # Figure 2: stall fraction vs log data ratio, one marker style per
    # executable.
    markers = itertools.cycle(('o', 'x', '+', '^', 's', '8', 'p', 'h', '*',
                               'D', '<', '>', 'v', 'd', '.'))

    colors = itertools.cycle(('b', 'g', 'r', 'c', 'm', 'k', 'y'))

    fig, ax = plt.subplots(1, 1, figsize=(10, 8), dpi=80)

    for k in mdr2.keys():
        ax.plot(mdr2[k],
                msr2[k],
                marker=markers.next(),
                markeredgecolor=colors.next(),
                linestyle='',
                markerfacecolor='None')
        ax.hold = True

    # Shrink the axes to make room for the legend on the right.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    ax.legend(mdr2.keys(),
              bbox_to_anchor=(1.05, 1),
              loc=2,
              borderaxespad=0.,
              numpoints=1)

    ax.set_xlabel('log(DRAM BW/L1 Fill Rate)')
    ax.set_ylabel('Stall Fraction')

    fname = 'msr_v_mdr'
    fig.savefig(fname)
    plt.close()

    # Figure 3: stall fraction vs log memory rate, with threshold guides.
    markers = itertools.cycle(('o', 'x', '+', '^', 's', '8', 'p', 'h', '*',
                               'D', '<', '>', 'v', 'd', '.'))

    colors = itertools.cycle(('b', 'g', 'r', 'c', 'm', 'k', 'y'))

    fig, ax = plt.subplots(1, 1, figsize=(10, 8), dpi=80)

    for k in mdr2.keys():
        ax.plot(mmr2[k],
                msr2[k],
                marker=markers.next(),
                markeredgecolor=colors.next(),
                linestyle='',
                markerfacecolor='None')
        ax.hold = True

    # Vertical dashed guide at the memory-rate threshold.
    ax.plot(numpy.log10([mem_rate_thresh, mem_rate_thresh]), [
        0.95 * min(numpy.concatenate(msr2.values())),
        1.05 * max(numpy.concatenate(msr2.values()))
    ], 'r--')

    # NOTE(review): this print looks like leftover debugging of the
    # ax.plot call that immediately follows.
    print[
        min(numpy.concatenate(mmr2.values())),
        max(numpy.concatenate(mmr2.values()))
    ], [stall_thresh, stall_thresh], 'r--'
    # Horizontal dashed guide at the stall threshold.
    ax.plot([
        min(numpy.concatenate(mmr2.values())),
        max(numpy.concatenate(mmr2.values()))
    ], [stall_thresh, stall_thresh], 'r--')

    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    ax.legend(mdr2.keys(),
              bbox_to_anchor=(1.05, 1),
              loc=2,
              borderaxespad=0.,
              numpoints=1)

    ax.set_xlabel('log(DRAM BW)')
    ax.set_ylabel('Stall Fraction')

    fname = 'msr_v_mem'
    fig.savefig(fname)
    plt.close()
예제 #13
0
def main():
    """Find jobs with a high Lustre (llite) metadata rate and, for each one
    above the threshold, plot every per-operation rate over time.
    """

    parser = argparse.ArgumentParser(description='Look for high meta data rate'\
                                     ' to Lustre')
    parser.add_argument('-t',
                        metavar='thresh',
                        help='Treshold metadata rate',
                        nargs=1,
                        default=[100000.])
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?',
                        default='jobs')

    n = parser.parse_args()
    thresh = float(n.t[0])
    print thresh

    filelist = tspl_utils.getfilelist(n.filearg)

    #  k1=['llite', 'llite', 'llite', 'llite', 'llite',
    #      'llite', 'llite', 'llite', 'llite', 'llite',
    #      'llite', 'llite', 'llite', 'llite', 'llite',
    #      'llite', 'llite', 'llite', 'llite', 'llite',
    #      'llite', 'llite', 'llite', 'llite', 'llite',
    #      'llite']
    #  k2=['open','close','mmap','seek','fsync','setattr',
    #      'truncate','flock','getattr','statfs','alloc_inode',
    #      'setxattr','getxattr',' listxattr',
    #      'removexattr', 'inode_permission', 'readdir',
    #      'create','lookup','link','unlink','symlink','mkdir',
    #      'rmdir','mknod','rename',]
    # Parallel key lists: one 'llite' schema entry in k1 per metadata
    # operation named in k2 (23 of each).
    # NOTE(review): ' listxattr' keeps a leading space, inherited from the
    # commented-out list above -- possibly a typo in the schema key.
    k1 = [
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
        'llite',
    ]
    k2 = [
        'open',
        'close',
        'mmap',
        'fsync',
        'setattr',
        'truncate',
        'flock',
        'getattr',
        'statfs',
        'alloc_inode',
        'setxattr',
        ' listxattr',
        'removexattr',
        'readdir',
        'create',
        'lookup',
        'link',
        'unlink',
        'symlink',
        'mkdir',
        'rmdir',
        'mknod',
        'rename',
    ]

    for file in filelist:
        try:
            ts = tspl.TSPLSum(file, k1, k2)

        except tspl.TSPLException as e:
            continue

        # Skip jobs shorter than an hour or outside 1..32 wayness.
        if not tspl_utils.checkjob(ts, 3600., range(1, 33)):
            continue

        # Midpoints of the sample intervals.
        tmid = (ts.t[:-1] + ts.t[1:]) / 2.0

        # Lariat data only contributes the executable name to the title.
        ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time, 'lariatData')

        # Total rate of all metadata operations, averaged per host.
        meta_rate = numpy.zeros_like(tmid)

        for k in ts.j.hosts.keys():
            meta_rate += numpy.diff(ts.assemble(range(0, len(k1)), k,
                                                0)) / numpy.diff(ts.t)

        meta_rate /= float(ts.numhosts)

        if numpy.max(meta_rate) > thresh:
            title = ts.title
            if ld.exc != 'unknown':
                title += ', E: ' + ld.exc.split('/')[-1]

            fig, ax = plt.subplots(1, 1, figsize=(10, 8), dpi=80)
            plt.subplots_adjust(hspace=0.35)
            plt.suptitle(title)

            markers = ('o', 'x', '+', '^', 's', '8', 'p', 'h', '*', 'D', '<',
                       '>', 'v', 'd', '.')

            colors = ('b', 'g', 'r', 'c', 'm', 'k', 'y')

            # One line per counter series, cycling colors/markers.
            cnt = 0
            for v in ts.data:
                for host in v:
                    for vals in v[host]:
                        rate = numpy.diff(vals) / numpy.diff(ts.t)
                        c = colors[cnt % len(colors)]
                        m = markers[cnt % len(markers)]
                        #            print cnt,(cnt % len(colors)), (cnt % len(markers)), k2[cnt], c, m

                        ax.plot(tmid / 3600.,
                                rate,
                                marker=m,
                                markeredgecolor=c,
                                linestyle='-',
                                color=c,
                                markerfacecolor='None',
                                label=k2[cnt])
                        ax.hold = True
                # NOTE(review): cnt advances once per *host*, not per key,
                # so label=k2[cnt] may mislabel series on multi-host jobs
                # -- confirm the intended nesting.
                cnt = cnt + 1

            ax.set_ylabel('Meta Data Rate (op/s)')
            tspl_utils.adjust_yaxis_range(ax, 0.1)

            # Deduplicate legend entries by label.
            handles, labels = ax.get_legend_handles_labels()
            new_handles = {}
            for h, l in zip(handles, labels):
                new_handles[l] = h

            # Shrink the axes to make room for the legend on the right.
            box = ax.get_position()
            ax.set_position([box.x0, box.y0, box.width * 0.9, box.height])
            ax.legend(new_handles.values(),
                      new_handles.keys(),
                      prop={'size': 8},
                      bbox_to_anchor=(1.05, 1),
                      borderaxespad=0.,
                      loc=2)

            fname = '_'.join(['metadata', ts.j.id, ts.owner])

            fig.savefig(fname)
            plt.close()
예제 #14
0
def master_plot(file,
                mode='lines',
                threshold=False,
                output_dir='.',
                prefix='graph',
                mintime=3600,
                wayness=16,
                header='Master',
                lariat_dict=None,
                wide=False,
                job_stats=None):
    """Build the 6-panel 'master' overview figure for one job file.

    file        -- path to a job file readable by tspl.TSPLSum
    mode        -- 'lines', 'hist' or 'percentile'; selects the plot helper
    threshold   -- when truthy, shown in the title as 'V: <value>'
    output_dir  -- not referenced in this body (kept for caller
                   compatibility)
    prefix      -- first component of the returned figure filename
    mintime     -- minimum job length (s) for tspl_utils.checkjob
    wayness     -- wayness filter passed to checkjob
    header      -- title header text; None selects the untitled filename
    lariat_dict -- None: load lariat data from disk; 'pass': construct
                   with no data; otherwise a pre-loaded dict to reuse
    wide        -- use the wide two-column layout instead of one column
    job_stats   -- optional pre-loaded stats forwarded to tspl.TSPLSum

    Returns (fig, fname) or None when the job is skipped.
    NOTE(review): the figure is never saved here and plt.close() runs
    before the return -- presumably the caller saves via the returned
    (fig, fname); confirm at call sites.
    """
    # Parallel per-PMC-type key lists: k1 holds schema names, k2 the
    # matching counter names, consumed pairwise by tspl.TSPLSum.
    k1 = {
        'amd64': [
            'amd64_core', 'amd64_core', 'amd64_sock', 'lnet', 'lnet', 'ib_sw',
            'ib_sw', 'cpu'
        ],
        'intel': [
            'intel_pmc3', 'intel_pmc3', 'intel_pmc3', 'lnet', 'lnet', 'ib_ext',
            'ib_ext', 'cpu', 'mem', 'mem'
        ],
        'intel_snb': [
            'intel_snb_imc', 'intel_snb_imc', 'intel_snb', 'lnet', 'lnet',
            'ib_sw', 'ib_sw', 'cpu', 'intel_snb', 'intel_snb', 'mem', 'mem'
        ],
    }

    k2 = {
        'amd64': [
            'SSE_FLOPS', 'DCSF', 'DRAM', 'rx_bytes', 'tx_bytes', 'rx_bytes',
            'tx_bytes', 'user'
        ],
        'intel': [
            'MEM_LOAD_RETIRED_L1D_HIT', 'FP_COMP_OPS_EXE_X87',
            'INSTRUCTIONS_RETIRED', 'rx_bytes', 'tx_bytes', 'port_recv_data',
            'port_xmit_data', 'user', 'MemUsed', 'AnonPages'
        ],
        'intel_snb': [
            'CAS_READS', 'CAS_WRITES', 'LOAD_L1D_ALL', 'rx_bytes', 'tx_bytes',
            'rx_bytes', 'tx_bytes', 'user', 'SSE_D_ALL', 'SIMD_D_256',
            'MemUsed', 'AnonPages'
        ],
    }

    try:
        print file
        ts = tspl.TSPLSum(file, k1, k2, job_stats)
    except tspl.TSPLException as e:
        return

    ignore_qs = []  #'gpu','gpudev','vis','visdev']
    if not tspl_utils.checkjob(ts, mintime, wayness, ignore_qs):
        return

    # Three ways to obtain lariat data: load from disk, skip entirely
    # ('pass'), or reuse a cache supplied by the caller.
    if lariat_dict == None:
        ld = lariat_utils.LariatData(ts.j.id,
                                     end_epoch=ts.j.end_time,
                                     daysback=3,
                                     directory=analyze_conf.lariat_path)
    elif lariat_dict == "pass":
        ld = lariat_utils.LariatData(ts.j.id)
    else:
        ld = lariat_utils.LariatData(ts.j.id, olddata=lariat_dict)

    # Use the lariat wayness when it is known and smaller than TACC Stats'.
    wayness = ts.wayness
    if ld.wayness != -1 and ld.wayness < ts.wayness:
        wayness = ld.wayness

    if wide:
        fig, ax = plt.subplots(6, 2, figsize=(15.5, 12), dpi=110)

        # Make 2-d array into 1-d, and reorder so that the left side is blank
        ax = my_utils.flatten(ax)
        ax_even = ax[0:12:2]
        ax_odd = ax[1:12:2]
        ax = ax_odd + ax_even

        for a in ax_even:
            a.axis('off')
    else:
        fig, ax = plt.subplots(6, 1, figsize=(8, 12), dpi=110)

    # Select the panel-plotting helper by mode.
    if mode == 'hist':
        plot = plot_thist
    elif mode == 'percentile':
        plot = plot_mmm
    else:
        plot = plot_lines

    if ts.pmc_type == 'intel_snb':
        # Plot key 1
        plot(ax[0], ts, [8, 9], 3600., 1e9, ylabel='Total AVX +\nSSE Ginst/s')

        # Plot key 2
        plot(ax[1],
             ts, [0, 1],
             3600.,
             1.0 / 64.0 * 1024. * 1024. * 1024.,
             ylabel='Total Mem BW GB/s')

        #Plot key 3
        #plot(ax[2],ts,[2],3600.,1.0/64.0*1e9, ylabel='L1 BW GB/s')
        # Negative index presumably subtracts that key's series
        # (MemUsed - AnonPages here) -- confirm against the plot helper.
        plot(ax[2],
             ts, [10, -11],
             3600.,
             1024.0 * 1024.0 * 1024.0,
             ylabel='Memory Usage GB',
             do_rate=False)
    elif ts.pmc_type == 'intel':
        plot(ax[0], ts, [1], 3600., 1e9, ylabel='FP Ginst/s')
        plot(ax[2],
             ts, [8, -9],
             3600.,
             1024.0 * 1024.0 * 1024.0,
             ylabel='Memory Usage GB',
             do_rate=False)
    else:
        #Fix this to support the old amd plots
        print ts.pmc_type + ' not supported'
        return

    # Plot lnet sum rate
    plot(ax[3], ts, [3, 4], 3600., 1024.**2, ylabel='Total lnet MB/s')

    # Plot remaining IB sum rate
    if ts.pmc_type == 'intel_snb':
        plot(ax[4],
             ts, [5, 6, -3, -4],
             3600.,
             1024.**2,
             ylabel='Total (ib_sw-lnet) MB/s')
    elif ts.pmc_type == 'intel':
        plot(ax[4],
             ts, [5, 6, -3, -4],
             3600.,
             1024.**2,
             ylabel='Total (ib_ext-lnet) MB/s')

    #Plot CPU user time
    plot(ax[5],
         ts, [7],
         3600.,
         wayness * 100.,
         xlabel='Time (hr)',
         ylabel='Total cpu user\nfraction')

    print ts.j.id + ': '

    plt.subplots_adjust(hspace=0.35)
    # Wide layout: summary text in the (blank) left column; otherwise the
    # header/threshold/lariat info goes into the suptitle.
    if wide:
        left_text = header + '\n' + my_utils.summary_text(ld, ts)
        text_len = len(left_text.split('\n'))
        fontsize = ax[0].yaxis.label.get_size()
        linespacing = 1.2
        fontrate = float(fontsize * linespacing) / 72. / 15.5
        yloc = .8 - fontrate * (text_len - 1
                                )  # this doesn't quite work. fontrate is too
        # small by a small amount
        plt.figtext(.05, yloc, left_text, linespacing=linespacing)
        fname = '_'.join([prefix, ts.j.id, ts.owner, 'wide_master'])
    elif header != None:
        title = header + '\n' + ts.title
        if threshold:
            title += ', V: %(v)-6.1f' % {'v': threshold}
        title += '\n' + ld.title()
        plt.suptitle(title)
        fname = '_'.join([prefix, ts.j.id, ts.owner, 'master'])
    else:
        fname = '_'.join([prefix, ts.j.id, ts.owner, 'master'])

    if mode == 'hist':
        fname += '_hist'
    elif mode == 'percentile':
        fname += '_perc'

    plt.close()

    return fig, fname
예제 #15
0
def main():

    parser = argparse.ArgumentParser(description='Look for imbalance between'
                                     'hosts for a pair of keys')
    parser.add_argument('threshold',
                        help='Treshold ratio for std dev:mean',
                        nargs='?',
                        default=0.25)
    parser.add_argument('key1',
                        help='First key',
                        nargs='?',
                        default='amd64_core')
    parser.add_argument('key2',
                        help='Second key',
                        nargs='?',
                        default='SSE_FLOPS')
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?',
                        default='jobs')
    parser.add_argument('-p',
                        help='Set number of processes',
                        nargs=1,
                        type=int,
                        default=[1])
    parser.add_argument('-o',
                        help='Output directory',
                        nargs=1,
                        type=str,
                        default=['.'],
                        metavar='output_dir')
    #  parser.add_argument('-f', help='Set full mode', action='store_true')
    #  parser.add_argument('-n', help='Disable plots', action='store_true')
    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)
    procs = min(len(filelist), n.p[0])

    job = pickle.load(open(filelist[0]))
    jid = job.id
    epoch = job.end_time

    ld = lariat_utils.LariatData(jid,
                                 end_epoch=epoch,
                                 daysback=3,
                                 directory=analyze_conf.lariat_path)

    if procs < 1:
        print 'Must have at least one file'
        exit(1)

    pool = multiprocessing.Pool(processes=procs)

    partial_imbal = functools.partial(compute_imbalance,
                                      k1=[n.key1],
                                      k2=[n.key2],
                                      thresh=float(n.threshold),
                                      lariat_dict=ld.ld)
    res = pool.map(partial_imbal, filelist)

    pool.close()
    pool.join()

    flagged_jobs = [r for r in res if r]

    print flagged_jobs
    print len(flagged_jobs)

    if len(flagged_jobs) != 0:
        pool = multiprocessing.Pool(processes=min(n.p[0], len(flagged_jobs)))
        pool.map(do_mp,
                 zip(flagged_jobs,
                     [n.o[0]
                      for x in flagged_jobs]))  # Pool.starmap should exist....
        pool.close()
        pool.join()
예제 #16
0
def main():

  parser = argparse.ArgumentParser(description='Look for imbalance between'
                                   'hosts for a pair of keys')
  parser.add_argument('filearg', help='File, directory, or quoted'
                      ' glob pattern', nargs='?',default='jobs')
  parser.add_argument('-p', help='Set number of processes',
                      nargs=1, type=int, default=[1])

  n=parser.parse_args()

  filelist=tspl_utils.getfilelist(n.filearg)

  procs  = min(len(filelist),n.p[0])

  job=pickle.load(open(filelist[0]))
  jid=job.id
  epoch=job.end_time

  ld=lariat_utils.LariatData(jid,end_epoch=epoch,daysback=3,directory=analyze_conf.lariat_path)
  
  if procs < 1:
    print 'Must have at least one file'
    exit(1)
    
  pool = multiprocessing.Pool(processes=procs)

  partial_work=functools.partial(do_work,mintime=3600.,wayness=16,lariat_dict=ld.ld)

  results=pool.map(partial_work,filelist)


  fig1,ax1=plt.subplots(1,1,figsize=(20,8),dpi=80)
  fig2,ax2=plt.subplots(1,1,figsize=(20,8),dpi=80)

  maxx=0.
  for state in [ True, False ]:
    stalls=[]
    misses=[]
    cpis=[]
    enames=[]

    for (s,m,cpi,ename,flag) in results:
      if (s != None and m > 0. and m < 1.0 and flag==state):
        stalls.extend([s])
        misses.extend([m])
        cpis.extend([cpi])
        enames.extend([ename])
        

    markers = itertools.cycle(('o','x','+','^','s','8','p',
                               'h','*','D','<','>','v','d','.'))

    colors  = itertools.cycle(('b','g','r','c','m','k','y'))

    
    fmt={}
    for e in enames:
      if not e in fmt:
        fmt[e]=markers.next()+colors.next()
    
    for (s,c,e) in zip(stalls,cpis,enames):
      #      ax1.plot(numpy.log10(1.-(1.-s)),numpy.log10(c),
      maxx=max(maxx,1./(1.-s))
      ax1.plot((1./(1.-s)),(c),
               marker=fmt[e][0],
               markeredgecolor=fmt[e][1],
                linestyle='', markerfacecolor='None',
               label=e)
      ax1.hold=True
      ax2.plot((1./(1.-s)),(c),
               marker=fmt[e][0],
               markeredgecolor=fmt[e][1],
                linestyle='', markerfacecolor='None',
               label=e)
      ax2.hold=True

    #ax.plot(numpy.log10(stalls),numpy.log10(cpis),fmt)
    #ax.plot(numpy.log10(1.0/(1.0-numpy.array(stalls))),numpy.log10(cpis),fmt)

  ax1.set_xscale('log')
  ax1.set_xlim(left=0.95,right=1.05*maxx)
  ax1.set_yscale('log')
  
  box = ax1.get_position()
  ax1.set_position([box.x0, box.y0, box.width * 0.45, box.height])
  box = ax2.get_position()
  ax2.set_position([box.x0, box.y0, box.width * 0.45, box.height])

  handles=[]
  labels=[]
  for h,l in zip(*ax1.get_legend_handles_labels()):
    if l in labels:
      continue
    else:
      handles.extend([h])
      labels.extend([l])
    
  
  ax1.legend(handles,labels,bbox_to_anchor=(1.05, 1),
            loc=2, borderaxespad=0., numpoints=1,ncol=4)
  ax1.set_xlabel('log(Cycles per Execution Cycle)')
  ax1.set_ylabel('log(CPI)')

  handles=[]
  labels=[]
  for h,l in zip(*ax2.get_legend_handles_labels()):
    if l in labels:
      continue
    else:
      handles.extend([h])
      labels.extend([l])
    
  
  ax2.legend(handles,labels,bbox_to_anchor=(1.05, 1),
            loc=2, borderaxespad=0., numpoints=1,ncol=4)
  ax2.set_xlabel('Cycles per Execution Cycle')
  ax2.set_ylabel('CPI')

  fname='miss_v_stall_log'
  fig1.savefig(fname)

  fname='miss_v_stall'
  fig2.savefig(fname)

  plt.close()
예제 #17
0
def main():

    parser = argparse.ArgumentParser(
        description='Plot important stats for jobs')
    parser.add_argument('-m',
                        help='Plot mode: lines, hist, percentile',
                        nargs=1,
                        type=str,
                        default=['lines'],
                        metavar='mode')
    parser.add_argument('-o',
                        help='Output directory',
                        nargs=1,
                        type=str,
                        default=['.'],
                        metavar='output_dir')
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?',
                        default='jobs')
    parser.add_argument('-p',
                        help='Set number of processes',
                        nargs=1,
                        type=int,
                        default=[1])
    parser.add_argument('-s',
                        help='Set minimum time in seconds',
                        nargs=1,
                        type=int,
                        default=[3600])
    parser.add_argument('-w', help='Set wide plot format', action='store_true')
    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)
    procs = min(len(filelist), n.p[0])

    job = pickle.load(open(filelist[0]))
    jid = job.id
    epoch = job.end_time

    ld = lariat_utils.LariatData(jid,
                                 end_epoch=epoch,
                                 daysback=3,
                                 directory=analyze_conf.lariat_path)

    if procs < 1:
        print 'Must have at least one file'
        exit(1)

    pool = multiprocessing.Pool(processes=procs)

    partial_master = functools.partial(mp_wrapper,
                                       mode=n.m[0],
                                       threshold=False,
                                       output_dir=n.o[0],
                                       prefix='graph',
                                       mintime=n.s[0],
                                       wayness=[x + 1 for x in range(16)],
                                       lariat_dict=ld.ld,
                                       wide=n.w)

    pool.map(partial_master, filelist)

    pool.close()
    pool.join()