def case1():
    """Run ``inception3eval`` on TFDist once for every entry in ``rates``.

    For each rate a workload with batch size 1 and 500 batches is created,
    its evaluation environment variables are set, and the run is saved under
    ``FLAGS.save_dir/case1/<rate>``.
    """
    for rate in rates:
        workload = WTL.create("inception3eval", 1, 500, executor=Executor.TFDist)

        # The interval handed to the benchmark is the reciprocal of the rate.
        interval = str(1 / rate)
        workload.env['SALUS_TFBENCH_EVAL_INTERVAL'] = interval
        workload.env['SALUS_TFBENCH_EVAL_RAND_FACTOR'] = '1'
        workload.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'false'

        out_dir = FLAGS.save_dir / 'case1' / str(rate)
        run_tfdist(out_dir, workload)
def case1():
    """Run ``inception3eval`` (batch size 50, 50 batches) in two setups.

    First a TF-executor job and a TFDist-executor job are passed together to
    ``run_tfdist``; afterwards a single job with the default executor is run
    through ``run_seq`` using the (possibly forced) MostEfficient preset.
    All output goes to ``FLAGS.save_dir/case1``.
    """
    scfg = maybe_forced_preset(presets.MostEfficient)

    out_dir = FLAGS.save_dir / 'case1'
    tf_job = WTL.create("inception3eval", 50, 50, executor=Executor.TF)
    tfdist_job = WTL.create("inception3eval", 50, 50, executor=Executor.TFDist)
    run_tfdist(out_dir, tf_job, tfdist_job)

    seq_cfg = scfg.copy(output_dir=FLAGS.save_dir / 'case1')
    default_job = WTL.create("inception3eval", 50, 50)
    run_seq(seq_cfg, default_job)
def case3(argv):
    """Run one ``resnet50`` job on TFDist, then 300 of them with the pack scheduler.

    ``argv`` is accepted for the command-dispatch signature but is not read.
    Output for both phases goes to ``FLAGS.save_dir/<this function's name>``.
    """
    model = 'resnet50'
    bs = 50
    bn = 500
    # Use this function's own name as the output sub-directory.
    name = inspect.currentframe().f_code.co_name
    out_dir = FLAGS.save_dir / name

    # First run a single job alone to get its JCT.
    baseline = WTL.create(model, bs, bn, executor=Executor.TFDist)
    run_tfdist(out_dir, baseline)

    # Then create 300 identical jobs and run them with the 'pack' scheduler.
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'pack'
    jobs = []
    for _ in range(300):
        jobs.append(WTL.create(model, bs, bn))
    run_seq(scfg.copy(output_dir=out_dir), *jobs)
def tfdist(argv):
    # type: (Sequence[str]) -> None
    """Run an inference job next to a background ``inception4`` training job on TFDist.

    ``argv`` is read as ``[name, batch_size, ...]``. One experiment is run per
    batch size; every experiment uses two FIFOs in a fresh temporary directory
    to hold both jobs until they are ready, then starts the training job and,
    ten seconds later, the inference job.
    """
    # NOTE(review): the model name is only taken from argv when at least one
    # more argument follows it; a lone name is ignored and the default is
    # used -- confirm this is intended.
    name = argv[0] if len(argv) > 1 else "alexneteval"
    # Alternative sweeps left by the author: [1, 2, 4, 8, 16, 32] and
    # [1024, 1536, 2048, 4096].
    batch_sizes = [int(v) for v in argv[1:]] or [1, 2, 4, 8]
    batch_num = 300

    def make_fifo(directory, basename):
        """Create a FIFO called *basename* in *directory*; return its path as str."""
        fifo_path = str(pathlib.Path(directory) / basename)
        os.mkfifo(fifo_path)
        return fifo_path

    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # Background training job. Per the original note, the batch number
            # has no effect here and only distinguishes different runs.
            train_wl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TFDist)
            # Make sure it runs long enough.
            train_wl.env['SALUS_ITER_SECONDS'] = '300'
            # FIFO used to signal the training job.
            train_fifo = make_fifo(td, 'fifotrain')
            train_wl.env['SALUS_WAIT_FOR_SIGNAL'] = train_fifo

            # Foreground inference job.
            infer_wl = WTL.create(name, bs, batch_num, executor=Executor.TFDist)
            set_env(infer_wl)
            infer_wl.env['SALUS_ITER_SECONDS'] = '150'
            infer_fifo = make_fifo(td, 'fifo')
            infer_wl.env['SALUS_WAIT_FOR_SIGNAL'] = infer_fifo

            out_dir = FLAGS.save_dir / "tfdist" / (name + "-inception4")
            actions = [
                train_wl,  # start the background job
                infer_wl,  # start the foreground job
                # wait for both jobs to be ready
                RunFn(lambda *_args, **_kwargs: wait_on_pipe(train_fifo)),
                RunFn(lambda *_args, **_kwargs: wait_on_pipe(infer_fifo)),
                # start the training job
                RunFn(lambda *_args, **_kwargs: release_on_pipe(train_fifo)),
                # wait 10 seconds
                Pause(10),
                # release the inference job
                RunFn(lambda *_args, **_kwargs: release_on_pipe(infer_fifo)),
            ]
            # The original note says run_seq joins all jobs at the end of the
            # sequence; presumably run_tfdist does the same -- TODO confirm.
            run_tfdist(out_dir, *actions)
def case3():
    """Run ``inception3eval`` (batch size 1, 1000 batches) on TFDist.

    Output is saved under ``FLAGS.save_dir/case3``.
    """
    out_dir = FLAGS.save_dir / 'case3'
    workload = WTL.create("inception3eval", 1, 1000, executor=Executor.TFDist)
    run_tfdist(out_dir, workload)
def test():
    """Smoke run: ``inception4`` then ``inception3`` on TFDist, one batch each.

    The two workloads are separated by ``Pause.Wait`` in the action sequence
    and the output is saved directly under ``FLAGS.save_dir``.
    """
    actions = (
        WTL.create("inception4", 25, 1, executor=Executor.TFDist),
        Pause.Wait,
        WTL.create("inception3", 50, 1, executor=Executor.TFDist),
    )
    run_tfdist(FLAGS.save_dir, *actions)