Пример #1
0
def update_jct(workload, update_global=False):
    # type: (Workload, bool) -> None
    """Read the JCT of a finished workload from its output file and store it.

    Raises ValueError if the workload never started or exited non-zero.
    When ``update_global`` is set, the value is also recorded in the global
    workload database.
    """
    proc = workload.proc
    if proc is None or proc.returncode != 0:
        raise ValueError(f'Workload {workload.name} not started or terminated in error')

    measured = parse_output_float(workload.output_file, r'^JCT: ([0-9.]+) .*')
    workload.geometry.jct = measured
    if update_global:
        WTL.from_name(workload.name).add_geometry(workload.rcfg, workload.executor,
                                                  ResourceGeometry(jct=measured))
Пример #2
0
def do_mem(logdir, network, batch_size):
    """Measure memory allocation behavior of `network` at `batch_size`.

    NOTE(review): the inherited docstring said "basic JCT" but this measures
    memory: it runs the workload either on plain TF (filtering allocator log
    lines into `alloc.output`) or on a Salus server with the `memop` log
    config, depending on FLAGS.use_salus.

    Returns the final destination directory holding the results.
    """
    batch_num = 20
    # speech gets fewer iterations — presumably because it is much slower; TODO confirm
    if network == "speech":
        batch_num = 5

    logger.info(f'Measuring memory for {network}_{batch_size} for {batch_num} iter')

    # results land under a per-executor subdirectory named after the canonical workload
    ex = "salus" if FLAGS.use_salus else "tf"
    final_dst = logdir / ex / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    # NOTE(review): atomic_directory presumably stages writes in a temp dir
    # committed to final_dst on clean exit — verify against its definition
    with atomic_directory(final_dst) as outputdir:
        if not FLAGS.use_salus:
            logger.info('    Running on TF')
            wl = WTL.create(network, batch_size, batch_num, Executor.TF)
            # verbose allocator logging so +/- allocation lines appear in the output
            wl.env['TF_CPP_MIN_VLOG_LEVEL'] = '1'
            wl.env['TF_CPP_MIN_LOG_LEVEL'] = ''
            run_tf(outputdir, wl)
            # filter and move file to a more convenient name;
            # NOTE: only the first file found is processed (unconditional break below)
            for f in pathlib.Path(outputdir).iterdir():
                with f.with_name('alloc.output').open('w') as file:
                    # keep only lines containing "] +" or "] -" (allocation events)
                    grep = execute(['egrep', r"] (\+|-)", f.name], stdout=file, cwd=str(f.parent))
                    grep.wait()
                f.unlink()
                break
        else:
            scfg = maybe_forced_preset(presets.AllocProf)
            # the "memop" log config makes the server emit memory-operation logs
            scfg.logconf = "memop"
            scfg.output_dir = outputdir
            server = SalusServer(scfg)
            with server.run():
                logger.info('    Running on Salus')
                WTL.block_run(network, batch_size, batch_num, Executor.Salus, outputdir / 'rpc.output')

    return final_dst
Пример #3
0
def gen_workload_list(selection):
    # type: (str) -> Iterable[Tuple[WTL, RunConfig]]
    """Build (workload, RunConfig) pairs from the command-line selection.

    An empty selection means every known workload (minus a blacklist) at every
    available batch size; otherwise `selection` is a comma-separated list of
    canonical names like 'alexnet_25'.
    """
    if selection:
        pairs = []
        for cname in unique((cname for cname in selection.split(',')), stable=True):
            if '_' not in cname:
                raise UsageError(f"Not a canonical name: {cname}")
            wname, bs = cname.split('_', 1)
            bs = try_with_default(int, bs, ValueError)(bs)
            pairs.append((WTL.from_name(wname), bs))
        names = pairs
    else:
        blacklist = ['speech', 'seq2seq', 'mnistlg', 'mnistsf', 'mnistcv']
        names = (
            (wl, size)
            for key, wl in WTL.known_workloads.items()
            for size in wl.available_batch_sizes()
            if key not in blacklist
        )

    # Find all available batch_num with JCT and mem data
    return (
        (wtl, RunConfig(size, num, None))
        for wtl, size in names
        for num in wtl.available_batch_nums(size)
    )
Пример #4
0
def do_jct(logdir, network, batch_size):
    """Measure the basic JCT (job completion time) of a workload.

    Runs the workload on TF, optionally on TFDist, and (unless MPS is on)
    on Salus, writing each run's output under the canonical workload
    directory. With FLAGS.resume, runs whose output file already exists in
    final_dst are skipped.

    Returns the final destination directory holding the results.
    """
    batch_num = 20

    final_dst = logdir / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    with atomic_directory(final_dst) as outputdir:
        logger.info(f'Measuring basic JCT for {batch_num} iterations')
        # output file names carry a '-mps' suffix when running under MPS
        mps_name = '-mps' if FLAGS.is_mps else ''
        if not (final_dst/'gpu{}.output'.format(mps_name)).exists() or not FLAGS.resume:
            logger.info('    Running on TF')
            WTL.block_run(network, batch_size, batch_num, Executor.TF, outputdir / 'gpu{}.output'.format(mps_name))

        if FLAGS.do_tfdist:
            if not (final_dst/'tfdist{}.output'.format(mps_name)).exists() or not FLAGS.resume:
                with TFDistServer().run():
                    logger.info('    Running on TFDist')
                    WTL.block_run(network, batch_size, batch_num, Executor.TFDist, outputdir / 'tfdist{}.output'.format(mps_name))

        if FLAGS.is_mps:
            # the Salus measurement is deliberately skipped under MPS
            logger.info('    Skipping Salus jct when MPS is on')
            return final_dst

        if not (final_dst / 'rpc.output').exists() or not FLAGS.resume:
            scfg = maybe_forced_preset(presets.MostEfficient)
            scfg.output_dir = outputdir
            server = SalusServer(scfg)
            with server.run():
                logger.info('    Warming up Salus')
                # always use 20 batch num when warming up
                WTL.block_run(network, batch_size, 20, Executor.Salus, outputdir / 'rpc-warm.output')

                logger.info('    Running on Salus')
                WTL.block_run(network, batch_size, batch_num, Executor.Salus, outputdir / 'rpc.output')

    return final_dst
Пример #5
0
 def getbs(name):
     """Expand a workload name into (name, batch_size) pairs.

     'net_25' yields exactly [('net', 25)]; a bare name is paired with every
     available batch size of that workload.
     """
     if '_' not in name:
         sizes = WTL.from_name(name).available_batch_sizes()
         return zip([name] * len(sizes), sizes)
     base, size = name.split('_')
     return [(base, int(size))]
Пример #6
0
 def getbs(name):
     """Map a workload name to (name, batch_size) pairs.

     'net_25' yields exactly [('net', 25)]; a bare name is paired with the
     first available batch size only.
     """
     if '_' in name:
         base, size = name.split('_')
         return [(base, int(size))]
     # using a single batch size is enough
     sizes = WTL.from_name(name).available_batch_sizes()
     return [(name, next(iter(sizes)))]
Пример #7
0
 def expandbs(name):
     """Expand a workload name into (name, batch_size) pairs.

     'net_25' yields exactly [('net', 25)]; a bare name is paired with every
     available batch size, filtered by the enclosing `batch_size` list when
     that is not None.
     """
     if '_' in name:
         base, size = name.split('_')
         return [(base, int(size))]
     avail = WTL.from_name(name).available_batch_sizes()
     chosen = avail if batch_size is None else [bs for bs in batch_size if bs in avail]
     return zip([name] * len(chosen), chosen)
Пример #8
0
def find_geometry(w, field):
    """Return the geometry value `field` for workload `w`.

    When missing, fall back to the value recorded under another batch_num,
    caching it on the workload before returning. Returns None if no batch_num
    has data for the field.

    :type w: Workload
    :type field: str
    """
    cached = w.geometry[field]
    if cached is not None:
        return cached

    # check for another bn
    for other_bn in w.wtl.available_batch_nums(w.batch_size):
        geom = WTL.from_name(w.name).geometry(RunConfig(w.batch_size, other_bn, None), w.executor)
        value = geom[field]
        if value is not None:
            w.geometry[field] = value
            return value

    return None
Пример #9
0
def select_workloads(argv):
    # type: (Iterable[str]) -> Iterable[(str, TBatchSize)]
    """Pick (name, batch_size) pairs from the command line.

    Without arguments, every known workload is selected; otherwise the
    arguments are comma-separated workload names, deduplicated in order.
    """
    if argv:
        names = unique((n for piece in argv for n in piece.split(',')), stable=True)
    else:
        names = WTL.known_workloads.keys()

    # TODO: return directly WTL instances
    pairs = []
    for name in names:
        for batch_size in WTL.from_name(name).available_batch_sizes():
            pairs.append((name, batch_size))
    return pairs
Пример #10
0
def do_mem(logdir, network, batch_size):
    """Run `network` once on TF with model-checkpoint saving enabled.

    NOTE(review): the inherited docstring said "basic JCT", but this run
    saves a model checkpoint (SALUS_SAVE_MODEL below) into a legacy
    checkpoint models directory.

    Returns the final destination directory holding the results.
    """
    batch_num = 10

    logger.info(f'Saving model checkpoint for {network}_{batch_size} for {batch_num} iter')

    final_dst = logdir / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))

    with atomic_directory(final_dst) as outputdir:
        logger.info('    Running on TF')
        wl = WTL.create(network, batch_size, batch_num, Executor.TF)
        # ask the benchmark to save its model during the run
        wl.env['SALUS_SAVE_MODEL'] = '1'

        # hard-coded shared checkpoint location — presumably specific to the
        # experiment machine; TODO confirm before reuse elsewhere
        model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
        model_dir = model_dir.expanduser().resolve()
        wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)

        run_tf(outputdir, wl)
    return final_dst
Пример #11
0
def do_mem(logdir, network, batch_size):
    """Run `network` once on TF with model saving enabled and keep the
    filtered allocation log.

    NOTE(review): the inherited docstring said "basic JCT", but this run
    saves a model (SALUS_SAVE_MODEL) and extracts allocation (+/-) lines
    from the raw output into `alloc.output`.

    Returns the final destination directory holding the results.
    """
    batch_num = 10

    logger.info(f'Saving model checkpoint for {network}_{batch_size} for {batch_num} iter')

    final_dst = logdir / 'tf' / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))

    with atomic_directory(final_dst) as outputdir:
        logger.info('    Running on TF')
        wl = WTL.create(network, batch_size, batch_num, Executor.TF)
        # ask the benchmark to save its model during the run
        wl.env['SALUS_SAVE_MODEL'] = '1'
        run_tf(outputdir, wl)
        # filter and move file to a more convenient name;
        # NOTE: only the first file found is processed (unconditional break below)
        for f in pathlib.Path(outputdir).iterdir():
            with f.with_name('alloc.output').open('w') as file:
                # keep only lines containing "] +" or "] -" (allocation events)
                grep = execute(['egrep', r"] (\+|-)", f.name], stdout=file, cwd=str(f.parent))
                grep.wait()
            f.unlink()
            break
    return final_dst
Пример #12
0
def main(argv):
    # type: (Sequence[str]) -> None
    """Run increasing numbers of concurrent alexnet workloads on Salus until
    the average JCT degrades beyond FLAGS.break_when relative to the
    single-workload baseline.

    With explicit `argv`, instead runs the parsed actions once and returns.
    """
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'pack'
    scfg.disable_adc = True

    if argv:
        run_seq(scfg.copy(output_dir=FLAGS.save_dir),
                *parse_actions_from_cmd(argv))
        return

    wtl = WTL.from_name('alexnet')
    rcfg = RunConfig(25, 2726, None)

    # check if we have reference JCT
    reference_jct = wtl.geometry(rcfg, Executor.Salus).jct

    if reference_jct is None:
        # no baseline recorded: start at 1 so the first run can establish one
        start_from = 1
        logger.warning(f"No reference JCT data available for `{wtl.canonical_name(rcfg)}'")
    else:
        start_from = 2
        report(1, reference_jct, 1)

    logger.info(f'Will stop when JCT degradation larger than {FLAGS.break_when}')
    for concurrent in range(start_from, FLAGS.uplimit):
        # run them at once
        logger.info(f'Running {concurrent} workloads together')
        workloads = [wtl.create_from_rcfg(rcfg) for _ in range(concurrent)]
        run_seq(scfg.copy(output_dir=FLAGS.save_dir / f"{concurrent}"), *workloads)

        # calculate average jct
        for w in workloads:
            update_jct(w)
        jcts = [w.geometry.jct for w in workloads]
        avgjct = np.mean(jcts)  # type: float
        # Bug fix: when no reference JCT existed, the original divided by None
        # (TypeError) on the first iteration. Adopt the first measured average
        # (the concurrent == start_from run) as the baseline instead.
        if reference_jct is None:
            reference_jct = avgjct
        ratio = avgjct / reference_jct
        report(concurrent, avgjct, ratio)
        if ratio > FLAGS.break_when:
            break