def make_experiment(exp_name, zip_project=False, track_git=True):
    # create experiment :: need to add any information here that is relevant.
    assert not exp_exists(exp_name), \
        """experiment with this name already exists, either remove the existing version or rename the new launch."""
    arg_dict = {}
    expID = make_id(32)
    arg_dict['expID'] = expID
    arg_dict['script'] = sys.argv[0]  # TODO: check that this is the correct approach.

    _logger.configure(LOG_ROOT, prefix=exp_name)

    if zip_project:
        dir_zip(PROJECT_ROOT, output_file='source.zip',
                excludes=["*.ckpt*", "*tmp_dir*", "*.mp4", "*.png", "*data*", "*.pkl", "*.git*"])
        shutil.move('source.zip', os.path.join(LOG_ROOT, exp_name + '/' + 'source.zip'))

    if track_git:
        arg_dict['gitcommit'] = get_gitcommit()
        arg_dict['gitbranch'] = get_gitbranch()

    _logger.log_params(Args={'expID': expID})

    timestamp = datetime.datetime.now().strftime("%x :: %X")
    exp_doc = {
        'expID': expID,
        'expName': exp_name,
        'logdir': os.path.join(LOG_ROOT, exp_name),
        'time': timestamp
    }
    coll = get_exp_coll()
    coll.insert(exp_doc)
    return expID

def adapt_and_test():
    import os
    import dill
    from playground.maml.maml_torch.maml_multi_step import FunctionalMLP

    logger.configure(log_directory=Args.log_dir, prefix=Args.log_prefix)
    logger.log_params(Args=vars(Args))

    # load weights
    with open(os.path.join(Args.log_dir, Args.log_prefix, Args.weight_path), 'rb') as f:
        weights = dill.load(f)

    model = FunctionalMLP(1, 1)
    losses = DefaultBear(list)
    for amp, task in amp_tasks:
        model.params.update({
            k: t.tensor(v, requires_grad=True, dtype=t.double).to(device)
            for k, v in weights[0].items()
        })
        sgd = t.optim.SGD(model.parameters(), lr=Args.learning_rate)
        proper = t.tensor(task.proper()).to(device)
        samples = t.tensor(task.samples(Args.k_shot)).to(device)
        for grad_ind in range(Args.grad_steps):
            # evaluate on the full task without tracking gradients
            with t.no_grad():
                xs, labels = proper
                ys = model(xs.unsqueeze(-1))
                loss = model.criteria(ys, labels.unsqueeze(-1))
                logger.log(grad_ind, loss=loss.item(), silent=grad_ind != Args.grad_steps - 1)
                losses[f"amp-{amp:.2f}-loss"].append(loss.item())
            # adaptation step on the k-shot samples
            xs, labels = samples
            ys = model(xs.unsqueeze(-1))
            loss = model.criteria(ys, labels.unsqueeze(-1))
            sgd.zero_grad()
            loss.backward()
            sgd.step()

    # losses = np.array([v for k, v in losses.items()])
    import matplotlib.pyplot as plt

    fig = plt.figure()
    plt.title('Learning Curves')
    for amp, task in amp_tasks:
        plt.plot(losses[f"amp-{amp:.2f}-loss"], label=f"amp {amp:.2f}")
    plt.legend()
    logger.log_pyplot(None, key="losses/learning_curves_amp.png", fig=fig)
    plt.close()

    average_losses = np.array([losses[f"amp-{amp:.2f}-loss"] for amp, task in amp_tasks])
    fig = plt.figure()
    plt.title('Learning Curves Averaged amp ~ [5 - 10]')
    plt.plot(average_losses.mean(0))
    plt.ylim(0, 28)
    logger.log_pyplot(None, key="losses/learning_curves_amp_all.png", fig=fig)
    plt.close()

def test(setup):
    d = Color(3.1415926, 'red')
    s = "{:.1}".format(d)
    print(s)
    logger.log_params(G=dict(some_config="hey"))
    logger.log(step=0, some=Color(0.1, 'yellow'))
    logger.log(step=1, some=Color(0.28571, 'yellow', lambda v: "{:.5f}%".format(v * 100)))
    logger.log(step=2, some=Color(0.85, 'yellow', percent))
    logger.log({"some_var/smooth": 10}, some=Color(0.85, 'yellow', percent), step=3)
    logger.log(step=4, some=Color(10, 'yellow'))

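# The `percent` formatter referenced above is not defined in this snippet. A minimal
# sketch of what such a formatter could look like (the name and exact behavior are
# assumed here for illustration, not taken from the source):
def percent(value):
    """Format a 0-1 fraction as a percentage string, e.g. 0.85 -> '85.0%'."""
    return "{:.1f}%".format(value * 100)
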
def launch_maml_mlp(log_prefix=None, **_G):
    G.log_prefix = log_prefix or f'{now:%Y-%m-%d}/debug-maml-baselines/sinusoid-maml-mlp'
    G.update(_G)
    logger.configure(log_directory=G.log_dir, prefix=G.log_prefix)
    logger.log_params(G=vars(G))
    np.random.seed(G.seed)
    t.manual_seed(G.seed)
    t.cuda.manual_seed(G.seed)
    maml(test_fn=standard_sine_test)

def launch_maml_lstm(log_prefix=None, **_G):
    G.log_prefix = log_prefix or f'{now:%Y-%m-%d}/debug-maml-baselines/sinusoid-maml-lstm'
    G.update(_G)
    logger.configure(log_directory=G.log_dir, prefix=G.log_prefix)
    logger.log_params(G=vars(G))
    np.random.seed(G.seed)
    t.manual_seed(G.seed)
    t.cuda.manual_seed(G.seed)
    auto_rnn = FunctionalLSTM(1, 1, 10)
    maml(model=auto_rnn, test_fn=standard_sine_test)

def launch_reptile_auto_gru(log_prefix=None, **_G):
    G.log_prefix = log_prefix or f'{now:%Y-%m-%d}/debug-maml-baselines/sinusoid-reptile-auto-gru'
    G.update(_G)
    logger.configure(log_directory=G.log_dir, prefix=G.log_prefix)
    logger.log_params(G=vars(G))
    np.random.seed(G.seed)
    t.manual_seed(G.seed)
    t.cuda.manual_seed(G.seed)
    auto_rnn = FunctionalAutoGRU(1, 1, 10)
    reptile(model=auto_rnn, test_fn=standard_sine_test)

def run_e_maml():
    # print(config.RUN.log_directory)
    # if config.G.run_mode == "e_maml":
    #     print('{G.inner_alg} E-MAML'.format(G=config.G))
    # elif config.G.run_mode == "maml":
    #     print('{G.inner_alg} Vanilla MAML'.format(G=config.G))
    # todo: let's take control of the log directory away from the train script. It should all be set from outside.
    logger.configure(log_directory=config.RUN.log_directory, prefix=f"run_maml-{config.G.seed}")
    logger.log_params(RUN=vars(config.RUN), G=vars(config.G),
                      Reporting=vars(config.Reporting), DEBUG=vars(config.DEBUG))

    import sys
    print(" ".join(sys.argv))

    tasks = MetaRLTasks(env_name=config.G.env_name,
                        batch_size=config.G.n_parallel_envs,
                        start_seed=config.G.start_seed,
                        task_seed=config.G.task_seed,
                        log_directory=(config.RUN.log_directory + "/{seed}") if config.G.render else None,
                        max_steps=config.G.env_max_timesteps)
    test_tasks = MetaRLTasks(env_name=config.G.env_name,
                             batch_size=config.G.n_parallel_envs,
                             start_seed=config.G.test_start_seed,
                             task_seed=config.G.test_task_seed,
                             log_directory=(config.RUN.log_directory + "/{seed}") if config.G.render else None,
                             max_steps=config.G.env_max_timesteps) \
        if config.G.eval_test_interval else ExitStack()

    # with Dashboard(config.RUN.prefix, server=config.Reporting.plot_server,
    #                port=config.Reporting.plot_server_port) as dash, U.single_threaded_session(), tasks, test_tasks:
    with U.make_session(num_cpu=config.G.n_cpu), tasks, test_tasks:
        # logger.on_dumpkvs(make_plot_fn(dash))
        maml = E_MAML(ob_space=tasks.envs.observation_space, act_space=tasks.envs.action_space)

        summary = tf.summary.FileWriter(config.RUN.log_directory, tf.get_default_graph())
        summary.flush()

        trainer = Trainer()
        U.initialize()
        trainer.train(tasks=tasks, maml=maml, test_tasks=test_tasks)
        # logger.clear_callback()

    tf.reset_default_graph()

def torch_upload():
    from ml_logger import logger
    import numpy as np

    logger.configure(root_dir="http://54.71.92.65:9080",
                     prefix="geyang/ml_logger-debug/test-1",
                     register_experiment=True)
    logger.log_params(args={})

    with logger.Sync():
        import os
        import torch
        from pycurl import Curl
        from tempfile import NamedTemporaryFile

        logger.remove('upload/example.pt')

        with NamedTemporaryFile(delete=True) as f:
            torch.save(np.ones([10_000_000]), f)
            # torch.save(np.ones([1000_000]), f)
            logger.print(f.name)

            c = Curl()
            c.setopt(c.URL, logger.root_dir)
            # proxy = os.environ.get('HTTP_PROXY')
            # c.setopt(c.PROXY, proxy)
            # logger.print('proxy:', proxy)
            c.setopt(c.TIMEOUT, 100000)
            c.setopt(c.HTTPPOST, [
                ('file', (
                    c.FORM_FILE, f.name,
                    c.FORM_FILENAME, logger.prefix + '/upload/example.pt',
                    c.FORM_CONTENTTYPE, 'plain/text',
                )),
            ])
            c.perform()
            c.close()
            logger.print('done')

    # logger.remove(".")
    # a = np.ones([1, 1, 100_000_000 // 4])
    # logger.print(f"the size of the tensor is {a.size}")
    # data = dict(key="ok", large=a)
    # logger.torch_save(data, f"save/data-{logger.now('%H.%M.%S')}.pkl")
    logger.print('done')

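# For reference, the multipart upload above can also be sketched with `requests`
# instead of pycurl. This is a minimal sketch, assuming the ml_logger server accepts
# multipart form posts with a `file` field at its root URL, as the pycurl call above
# implies; the helper name and default server are illustrative:
def upload_with_requests(local_path, remote_path, server="http://54.71.92.65:9080"):
    import requests  # third-party dependency, assumed available

    with open(local_path, 'rb') as f:
        # field name, remote filename, and content type mirror the pycurl form post above
        files = {'file': (remote_path, f, 'plain/text')}
        resp = requests.post(server, files=files, timeout=100000)
    resp.raise_for_status()
    return resp
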
def run_e_maml(_G=None):
    import baselines.common.tf_util as U

    if _G is not None:
        config.G.update(_G)

    for k, v in [
            *vars(config.RUN).items(),
            *vars(config.G).items(),
            *vars(config.Reporting).items(),
            *vars(config.DEBUG).items()
    ]:
        comet_logger.log_parameter(k, v)

    # todo: let's take control of the log directory away from the train script. It should all be set from outside.
    logger.configure(log_directory=config.RUN.log_dir, prefix=config.RUN.log_prefix)
    logger.log_params(RUN=vars(config.RUN), G=vars(config.G),
                      Reporting=vars(config.Reporting), DEBUG=vars(config.DEBUG))
    logger.log_file(__file__)

    tasks = MetaRLTasks(env_name=config.G.env_name,
                        batch_size=config.G.n_parallel_envs,
                        start_seed=config.G.start_seed,
                        log_directory=(config.RUN.log_directory + "/{seed}") if config.G.render else None,
                        max_steps=config.G.env_max_timesteps)

    # sess_config = tf.ConfigProto(log_device_placement=config.Reporting.log_device_placement)
    # with tf.Session(config=sess_config), tf.device('/gpu:0'), tasks:
    graph = tf.Graph()
    with graph.as_default(), U.make_session(num_cpu=config.G.n_cpu), tasks:
        maml = E_MAML(ob_space=tasks.envs.observation_space, act_space=tasks.envs.action_space)
        comet_logger.set_model_graph(tf.get_default_graph())
        # writer = tf.summary.FileWriter(logdir='/opt/project/debug-graph', graph=graph)
        # writer.flush()
        # exit()
        trainer = Trainer()
        U.initialize()
        trainer.train(tasks=tasks, maml=maml)
        logger.flush()

    tf.reset_default_graph()

def _fit_by_cv_and_eval(estimator_key, conf_dict):
    estimator, param_grid, param_dict_init = _initialize_model_cv(estimator_key, conf_dict, verbose=True)

    # 1) perform cross-validation hyperparam search to select params
    selected_params = estimator.fit_by_cv(X_train, Y_train, param_grid=param_grid,
                                          n_folds=n_folds, n_jobs=n_jobs_inner,
                                          random_state=rds)
    logger.log("%s selected params:" % estimator_key)
    logger.log_params(**selected_params)

    # 2) evaluate selected params with different initializations
    param_dict_init.update(selected_params)
    logger.log("evaluating %s parameters with %i seeds" % (estimator_key, len(eval_seeds)))
    scores = _evaluate_params(estimator.__class__, param_dict_init,
                              X_train, Y_train, X_valid, Y_valid, seeds=eval_seeds)

    cv_result_dict[estimator_key] = {'selected_params': selected_params,
                                     'scores': scores,
                                     'eval_seeds': eval_seeds}
    logger.log("evaluation scores for %s: %s" % (estimator_key, str(scores)))

def launch(**_G):
    import matplotlib
    matplotlib.use('Agg')

    G.update(_G)

    import numpy as np
    np.random.seed(G.seed)
    t.manual_seed(G.seed)
    t.cuda.manual_seed(G.seed)

    logger.configure(log_directory=G.log_dir, prefix=G.log_prefix)
    logger.log_params(G=vars(G))

    model = Model(**vars(G))
    from playground.maml.maml_torch.tasks import Sine
    maml_supervised(model, Sine, **vars(G))

def run_maml(_G=None):
    if _G is not None:
        G.update(_G)

    for k, v in vars(G).items():
        comet_logger.log_parameter(k, v)

    # todo: let's take control of the log directory away from the train script. It should all be set from outside.
    logger.configure(log_directory=G.log_dir, prefix=G.log_prefix)
    logger.log_params(G=vars(G))
    logger.log_file(__file__)

    tasks = MetaRLTasks(env_name=G.env_name, batch_size=G.n_parallel_envs,
                        start_seed=G.start_seed, max_steps=G.env_max_timesteps)
    env = tasks.sample()
    print(env)

def launch_training():
    from playground.maml.maml_torch.maml_multi_step import maml, G

    np.random.seed(G.seed)
    t.manual_seed(G.seed)
    t.cuda.manual_seed(G.seed)

    from datetime import datetime
    now = datetime.now()
    G.log_prefix = f"{now:%Y-%m-%d}/new-maml-torch/out-of-distribution"
    G.n_epochs = 70000  # from cbfinn universality paper
    G.n_gradient_steps = 5
    G.test_grad_steps = [1, 5]
    G.test_interval = 5
    G.save_interval = 100  # save the weights every 100 epochs.

    logger.configure(log_directory=G.log_dir, prefix=G.log_prefix)
    logger.log_params(G=vars(G))
    maml(test_fn=all_tests)

def start_run(exp_name, params, runID=None):
    coll = get_exp_coll()
    if runID is None:
        runID = make_id(8)
    run_tstamp = datetime.datetime.now().strftime("%x :: %X")
    script_txt = ' '.join(sys.argv)
    run_doc = {
        'runID': runID,
        'params': params,
        'runpath': os.path.join(LOG_ROOT, exp_name + '/' + runID),
        'script': sys.argv[0],
        'command': script_txt,
        'timestamp': run_tstamp
    }
    _logger.configure(LOG_ROOT, prefix=exp_name + '/' + runID)
    coll.update_one({'expName': exp_name}, {'$push': {'runs': run_doc}})
    _logger.log_params(Args=params)
    return _logger

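# Hypothetical launch-script usage of the two helpers above, `make_experiment` and
# `start_run`. The experiment name and parameters are illustrative only and not taken
# from the source:
#
#   expID = make_experiment('cartpole-baseline', zip_project=True)
#   run_logger = start_run('cartpole-baseline', params=dict(lr=1e-3, seed=100))
#   run_logger.log(step=0, loss=1.0)
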
def thunk(*args, **kwargs):
    import traceback
    from ml_logger import logger

    assert not (args and ARGS), \
        f"can not use positional arguments at both thunk creation and run time.\n" \
        f"_args: {args}\n" \
        f"ARGS: {ARGS}\n"

    logger.configure(root_dir=RUN.server, prefix=PREFIX,
                     register_experiment=False, max_workers=10)
    logger.log_params(host=dict(hostname=logger.hostname),
                      run=dict(status="running", startTime=logger.now(), job_id=logger.job_id))

    import time
    try:
        _KWARGS = {**KWARGS}
        _KWARGS.update(**kwargs)
        results = fn(*(args or ARGS), **_KWARGS)

        logger.log_line("========== execution is complete ==========")
        logger.log_params(run=dict(status="completed", completeTime=logger.now()))
        logger.flush()
        time.sleep(3)
    except Exception as e:
        tb = traceback.format_exc()
        with logger.SyncContext():  # make sure the upload finishes before termination.
            logger.print(tb, color="red")
            logger.log_text(tb, filename="traceback.err")
            logger.log_params(run=dict(status="error", exitTime=logger.now()))
            logger.flush()
        time.sleep(3)
        raise e

    return results

def _(*args, **kwargs):
    import traceback
    from ml_logger import logger

    assert not (args and ARGS), f"can not use positional arguments at both thunk creation and " \
                                f"run time.\n_args: {args}\nARGS: {ARGS}"

    logger.configure(log_directory=RUN.server, prefix=PREFIX,
                     register_experiment=False, max_workers=10)
    logger.log_params(host=dict(hostname=logger.hostname),
                      run=dict(status="running", startTime=logger.now()))
    try:
        _KWARGS = KWARGS.copy()
        _KWARGS.update(kwargs)
        fn(*(args or ARGS), **_KWARGS)

        logger.log_line("========== execution is complete ==========")
        logger.log_params(run=dict(status="completed", completeTime=logger.now()))
    except Exception as e:
        import time
        time.sleep(1)
        tb = traceback.format_exc()
        with logger.SyncContext():  # make sure the upload finishes before termination.
            logger.log_text(tb, filename="traceback.err")
            logger.log_params(run=dict(status="error", exitTime=logger.now()))
            logger.log_line(tb)
            logger.flush()
        time.sleep(30)
        raise e

    import time
    time.sleep(30)

args = [
    dict(x=x, y=y, filename=f"images/{x:0.3f},{y:0.3f}.png")
    for x in np.linspace(-0.25, 0.25, 128)
]
logger.log_data(args, 'index.pkl')
for p in tqdm(args):
    x, y, filename = p['x'], p['y'], p['filename']
    env.set_state(np.array([x, y, 0, 0]), np.array([0, 0, 0, 0]))
    # env.do_simulation([0, 0], 1)  # PointMass does not need this.
    image = env.render('grey', width=20, height=20)
    frames.append(image)
    logger.log_image(image, filename)

print('saving video')
logger.log_video(frames, f"{env_name}.mp4")
print('done')


if __name__ == "__main__":
    import os
    from ml_logger import logger

    logger.log_params(some_namespace=dict(layer=10, learning_rate=0.0001))
    exit()
    # logger.configure(log_directory="/tmp/learning-to-imitate", prefix="envs")
    logger.configure(log_directory=os.path.abspath("../datasets"), prefix="")
    [render(env) for env in envs]

def train():
    from moleskin import moleskin as M

    M.tic('Full Run')

    if G.model == "lenet":
        model = Conv2d()
    elif G.model == 'mlp':
        model = Mlp()
    else:
        raise NotImplementedError('only lenet and mlp are allowed')
    model.train()
    print(model)

    G.log_prefix = f"mnist_{type(model).__name__}"
    logger.configure(log_directory=G.log_dir, prefix=G.log_prefix)
    logger.log_params(G=vars(G), Model=dict(architecture=str(model)))

    from torchvision import datasets, transforms
    trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
    train_set = datasets.MNIST(root=G.data_dir, train=True, transform=trans, download=True)
    test_set = datasets.MNIST(root=G.data_dir, train=False, transform=trans, download=True)

    train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=G.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=G.batch_size, shuffle=False)

    celoss = nn.CrossEntropyLoss()
    # note: despite the name, this is plain SGD with momentum, not Adam.
    adam = optim.SGD(model.parameters(), lr=G.learning_rate, momentum=0.9)

    for epoch in range(G.n_epochs):
        for it, (x, target) in enumerate(train_loader):
            adam.zero_grad()
            ys = model(x)
            loss = celoss(ys, target)
            loss.backward()
            adam.step()

            if it % G.test_interval == 0:
                with h.Eval(model), torch.no_grad():
                    accuracy = h.Average()
                    for x, label in test_loader:
                        acc = h.cast(h.one_hot_to_int(model(x).detach()) == label, float).sum() / len(x)
                        accuracy.add(acc.detach().numpy())
                    logger.log(float(epoch) + it / len(train_loader), accuracy=accuracy.value)
        M.split("epoch")
        # logger.log(epoch, it=it, loss=loss.detach().numpy())

    M.toc('Full Run')

def train(deps=None, **kwargs):
    from ml_logger import logger
    from dmc_gen.config import Args

    Args._update(deps, **kwargs)
    logger.log_params(Args=vars(Args))

    utils.set_seed_everywhere(Args.seed)
    wrappers.VideoWrapper.prefix = wrappers.ColorWrapper.prefix = DMCGEN_DATA

    # Initialize environments
    image_size = 84 if Args.algo == 'sac' else 100
    env = wrappers.make_env(
        domain_name=Args.domain,
        task_name=Args.task,
        seed=Args.seed,
        episode_length=Args.episode_length,
        action_repeat=Args.action_repeat,
        image_size=image_size,
    )
    test_env = wrappers.make_env(domain_name=Args.domain,
                                 task_name=Args.task,
                                 seed=Args.seed + 42,
                                 episode_length=Args.episode_length,
                                 action_repeat=Args.action_repeat,
                                 image_size=image_size,
                                 mode=Args.eval_mode)

    # Prepare agent
    cropped_obs_shape = (3 * Args.frame_stack, 84, 84)
    agent = make_agent(algo=Args.algo,
                       obs_shape=cropped_obs_shape,
                       act_shape=env.action_space.shape,
                       args=Args).to(Args.device)
    if Args.load_checkpoint:
        print('Loading from checkpoint:', Args.load_checkpoint)
        logger.load_module(agent, path="models/*.pkl",
                           wd=Args.load_checkpoint,
                           map_location=Args.device)

    replay_buffer = utils.ReplayBuffer(obs_shape=env.observation_space.shape,
                                       action_shape=env.action_space.shape,
                                       capacity=Args.train_steps,
                                       batch_size=Args.batch_size)

    episode, episode_reward, episode_step, done = 0, 0, 0, True
    logger.start('train')
    for step in range(Args.start_step, Args.train_steps + 1):
        if done:
            if step > Args.start_step:
                logger.store_metrics({'dt_epoch': logger.split('train')})
                logger.log_metrics_summary(dict(step=step), default_stats='mean')

            # Evaluate agent periodically
            if step % Args.eval_freq == 0:
                logger.store_metrics(episode=episode)
                with logger.Prefix(metrics="eval/"):
                    evaluate(env, agent, Args.eval_episodes,
                             save_video=f"videos/{step:08d}_train.mp4")
                with logger.Prefix(metrics="test/"):
                    evaluate(test_env, agent, Args.eval_episodes,
                             save_video=f"videos/{step:08d}_test.mp4")
                logger.log_metrics_summary(dict(step=step), default_stats='mean')

            # Save agent periodically
            if step > Args.start_step and step % Args.save_freq == 0:
                with logger.Sync():
                    logger.save_module(agent, f"models/{step:06d}.pkl")
                if Args.save_last:
                    logger.remove(f"models/{step - Args.save_freq:06d}.pkl")
                # torch.save(agent, os.path.join(model_dir, f'{step}.pt'))

            logger.store_metrics(episode_reward=episode_reward,
                                 episode=episode + 1, prefix="train/")

            obs = env.reset()
            episode_reward, episode_step, done = 0, 0, False
            episode += 1

        # Sample action for data collection
        if step < Args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.Eval(agent):
                action = agent.sample_action(obs)

        # Run training update
        if step >= Args.init_steps:
            num_updates = Args.init_steps if step == Args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, step)

        # Take step
        next_obs, reward, done, _ = env.step(action)
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        replay_buffer.add(obs, action, reward, next_obs, done_bool)
        episode_reward += reward
        obs = next_obs
        episode_step += 1

    logger.print(f'Completed training for {Args.domain}_{Args.task}/{Args.algo}/{Args.seed}')

fn = lambda x: np.random.rand() + (1 + 0.001 * x) * np.sin(x * 0.1 / np.pi)
fn_1 = lambda x: np.random.rand() + (1 + 0.001 * x) * np.sin(x * 0.04 / np.pi)

for username in ["episodeyang", "amyzhang"]:
    for project in ['cpc-belief', 'playground']:
        for i in range(10):
            prefix = f"{DEBUG_DIR}/{username}/{project}/{'mdp/' if i < 5 else '/'}experiment_{i:02d}"
            logger.remove(prefix)
            logger.configure(prefix=prefix)
            logger.log_params(Args=dict(lr=10 ** (-2 - i),
                                        weight_decay=0.001,
                                        gradient_clip=0.9,
                                        env_id="GoalMassDiscreteIdLess-v0",
                                        seed=int(i * 100)))
            for ep in range(500 + 1):
                logger.log_metrics(epoch=ep, sine=fn(ep), slow_sine=fn_1(ep))
                logger.flush()
                if ep % 10 == 0:
                    logger.log_image(face('gray'), f"figures/gray_{ep:04d}.png")
                    logger.log_image(face('rgb'), f"figures/rgb_{ep:04d}.png")
            logger.log_image(face('gray'), "figures/face_gray.png")
            logger.log_image(face('rgb'), "figures/face_rgb.png")

def instr(fn, *ARGS, __file=False, __silent=False, **KWARGS):
    """
    thunk for configuring the logger. The reason why this is not a decorator is

    :param fn: function to be called
    :param *ARGS: positional arguments for the call
    :param __file: console mode; bypasses file-related logging
    :param __silent: do not print
    :param **KWARGS: keyword arguments for the call
    :return: a thunk that can be called without parameters
    """
    from ml_logger import logger

    if __file:
        caller_script = pJoin(os.getcwd(), __file)
    else:
        launch_module = inspect.getmodule(inspect.stack()[1][0])
        __file = launch_module.__file__
        caller_script = abspath(__file)

    # note: for scripts in the `plan2vec` module this also works -- b/c we truncate at a fixed depth.
    script_path = logger.truncate(caller_script, depth=len(__file__.split('/')) - 1)
    file_stem = logger.stem(script_path)
    file_name = basename(file_stem)
    RUN(file_name=file_name, file_stem=file_stem, now=logger.now())
    PREFIX = RUN.PREFIX

    # todo: there should be a better way to log these.
    # todo: we shouldn't need to log to the same directory, and the directory for the run shouldn't be fixed.
    logger.configure(
        root_dir=RUN.server,
        prefix=PREFIX,
        asynchronous=False,  # use sync logger
        max_workers=4,
        register_experiment=False)
    if RUN.restart:
        with logger.Sync():
            logger.remove(".")
    logger.upload_file(caller_script)

    # the tension is between creation vs run. Code snapshots are shared, but runs need to be unique.
    _ = dict()
    if ARGS:
        _['args'] = ARGS
    if KWARGS:
        _['kwargs'] = KWARGS

    logger.log_params(
        run=logger.run_info(status="created", script_path=script_path),
        revision=logger.rev_info(),
        fn=logger.fn_info(fn),
        **_,
        silent=__silent)

    logger.print('taking diff, if this step takes too long, check if your '
                 'uncommitted changes are too large.', color="green")
    logger.diff()
    if RUN.readme:
        logger.log_text(RUN.readme, "README.md", dedent=True)

    import jaynes  # now set the job name to prefix
    if jaynes.RUN.config and jaynes.RUN.mode != "local":
        runner_class, runner_args = jaynes.RUN.config['runner']
        if 'name' in runner_args:  # ssh mode does not have 'name'.
            runner_args['name'] = pJoin(file_name, RUN.JOB_NAME)
        del logger, jaynes, runner_args, runner_class
        if not __file:
            cprint('Set up job name', "green")

    def thunk(*args, **kwargs):
        import traceback
        from ml_logger import logger

        assert not (args and ARGS), \
            f"can not use positional arguments at both thunk creation and run time.\n" \
            f"_args: {args}\n" \
            f"ARGS: {ARGS}\n"

        logger.configure(root_dir=RUN.server, prefix=PREFIX,
                         register_experiment=False, max_workers=10)
        logger.log_params(host=dict(hostname=logger.hostname),
                          run=dict(status="running", startTime=logger.now(), job_id=logger.job_id))

        import time
        try:
            _KWARGS = {**KWARGS}
            _KWARGS.update(**kwargs)
            results = fn(*(args or ARGS), **_KWARGS)

            logger.log_line("========== execution is complete ==========")
            logger.log_params(run=dict(status="completed", completeTime=logger.now()))
            logger.flush()
            time.sleep(3)
        except Exception as e:
            tb = traceback.format_exc()
            with logger.SyncContext():  # make sure the upload finishes before termination.
                logger.print(tb, color="red")
                logger.log_text(tb, filename="traceback.err")
                logger.log_params(run=dict(status="error", exitTime=logger.now()))
                logger.flush()
            time.sleep(3)
            raise e

        return results

    return thunk

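# A hedged usage sketch for `instr` above: wrap the entry point into a parameter-free
# thunk, then hand it to the jaynes launcher that the snippet already imports. The
# `train_fn` name and the exact jaynes calls are assumptions based on how `jaynes.RUN`
# is referenced above, not confirmed by the source:
#
#   thunk = instr(train_fn, seed=100)
#   import jaynes
#   jaynes.config(mode="local")
#   jaynes.run(thunk)
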
def thunk(fn, *ARGS, __prefix="", __timestamp='%H.%M/%S.%f', **KWARGS): """ thunk for configuring the logger. The reason why this is not a decorator is :param fn: function to be called :param *ARGS: position arguments for the call :param __prefix: logging prefix for this run, default to "", where it does not do much. :param __timestamp: bool, default to True, whether post-fix with time stamps. :param **KWARGS: keyword arguments for the call :return: a thunk that can be called without parameters """ from ml_logger import logger caller_script = abspath(inspect.getmodule(inspect.stack()[1][0]).__file__) # note: for scripts in the `plan2vec` module this also works -- b/c we truncate fixed depth. script_path = logger.truncate(caller_script, depth=len(__file__.split('/')) - 1) _ = [logger.now(__timestamp)] if __timestamp else [] PREFIX = join(RUN.prefix, logger.stem(script_path), __prefix, *_) # todo: there should be a better way to log these. # todo: we shouldn't need to log to the same directory, and the directory for the run shouldn't be fixed. logger.configure( log_directory=RUN.server, prefix=PREFIX, asynchronous=False, # use sync logger max_workers=4, register_experiment=False) # the tension is in between creation vs run. Code snapshot are shared, but runs need to be unique. logger.log_params( run=logger.run_info(status="created", script_path=script_path), revision=logger.rev_info(), fn=logger.fn_info(fn), ) logger.log_params(args=ARGS, kwargs=KWARGS) logger.diff(silent=True) import jaynes # now set the job name to prefix if jaynes.RUN.mode != "local": runner_class, runner_args = jaynes.RUN.config['runner'] if 'name' in runner_args: # ssh mode does not have 'name'. runner_args['name'] = PREFIX.replace("geyang/", "") # destroy my traces. del logger, jaynes, runner_args, runner_class cprint(f'{__file__}: Set up job name', "green") def _(*args, **kwargs): import traceback from ml_logger import logger assert not (args and ARGS), f"can not use position argument at both thunk creation as well as " \ f"run.\n_args: {args}\nARGS: {ARGS}" logger.configure(log_directory=RUN.server, prefix=PREFIX, register_experiment=False, max_workers=10) logger.log_params(host=dict(hostname=logger.hostname), run=dict(status="running", startTime=logger.now())) try: _KWARGS = KWARGS.copy() _KWARGS.update(kwargs) fn(*(args or ARGS), **_KWARGS) logger.log_line("========= execution is complete ==========") logger.log_params( run=dict(status="completed", completeTime=logger.now())) except Exception as e: import time time.sleep(1) tb = traceback.format_exc() with logger.SyncContext( ): # Make sure uploaded finished before termination. logger.log_text(tb, filename="traceback.err") logger.log_params( run=dict(status="error", exitTime=logger.now())) logger.log_line(tb) logger.flush() time.sleep(30) raise e import time time.sleep(30) return _