def _make_context(self, *args, **kwargs):
    """Build the ExperimentContext for one run of the wrapped function.

    All run arguments must be passed by keyword so that they can be
    recorded to ``variant.json``.

    Args:
        args (list): Must be empty; positional arguments are rejected.
        kwargs (dict): Keyword arguments for the wrapped function. These
            are dumped to ``variant.json`` in the run directory.

    Returns:
        ExperimentContext: Context describing the run directory and the
            snapshot configuration.

    Raises:
        ValueError: If any positional arguments were supplied.

    """
    if args:
        raise ValueError('garage.experiment currently only supports '
                         'keyword arguments')
    # Resolve the experiment name, optionally augmented with the variant
    # parameters.
    exp_name = self.name
    if exp_name is None:
        exp_name = self.function.__name__
    if self.name_parameters:
        exp_name = self._augment_name(exp_name, kwargs)
    # Default layout: ./data/local/<prefix>/<name>, unless an explicit
    # log_dir was configured.
    run_dir = self.log_dir
    if run_dir is None:
        run_dir = '{data}/local/{prefix}/{name}'.format(
            data=os.path.join(os.getcwd(), 'data'),
            prefix=self.prefix,
            name=exp_name)
    run_dir = _make_sequential_log_dir(run_dir)
    # Record the variant and the repository metadata alongside the logs.
    dump_json(os.path.join(run_dir, 'variant.json'), kwargs)
    git_root_path, metadata = get_metadata()
    dump_json(os.path.join(run_dir, 'metadata.json'), metadata)
    if git_root_path and self.archive_launch_repo:
        make_launcher_archive(git_root_path=git_root_path, log_dir=run_dir)
    # Wire up every dowel output sink for this run.
    sinks = (dowel.TextOutput(os.path.join(run_dir, 'debug.log')),
             dowel.CsvOutput(os.path.join(run_dir, 'progress.csv')),
             dowel.TensorBoardOutput(run_dir, x_axis='TotalEnvSteps'),
             dowel.StdOutput())
    for sink in sinks:
        logger.add_output(sink)
    logger.push_prefix('[{}] '.format(exp_name))
    logger.log('Logging to {}'.format(run_dir))
    return ExperimentContext(snapshot_dir=run_dir,
                             snapshot_mode=self.snapshot_mode,
                             snapshot_gap=self.snapshot_gap)
def __enter__(self):
    """Attach the dowel logger outputs for this experiment.

    Returns:
        The context object itself, so it can be bound in a ``with``
        statement.
    """
    csv_path = os.path.join(self.log_dir, 'progress.csv')
    txt_path = os.path.join(self.log_dir, 'debug.log')
    # Register every output sink, then tag all subsequent log lines with
    # the experiment name.
    sinks = (dowel.TextOutput(txt_path),
             dowel.CsvOutput(csv_path),
             dowel.TensorBoardOutput(self.log_dir),
             dowel.StdOutput())
    for sink in sinks:
        logger.add_output(sink)
    logger.push_prefix('[%s] ' % self.exp_name)
    return self
def _make_context(cls, options, **kwargs):
    """Build the ExperimentContext for one run of the wrapped function.

    All run arguments must be keyword arguments so that they can be
    recorded to ``variant.json``.

    Args:
        options (dict): Options to `wrap_experiment` itself. See the
            function documentation for details.
        kwargs (dict): Keyword arguments for the wrapped function. These
            are dumped to ``variant.json`` in the run directory.

    Returns:
        ExperimentContext: Context describing the run directory and the
            snapshot configuration.

    """
    exp_name = options['name']
    if exp_name is None:
        exp_name = options['function'].__name__
    exp_name = cls._augment_name(options, exp_name, kwargs)
    run_dir = options['log_dir']
    if run_dir is None:
        # Default layout: ./data/local/<prefix>/<name>
        run_dir = '{data}/local/{prefix}/{name}'.format(
            data=os.path.join(os.getcwd(), 'data'),
            prefix=options['prefix'],
            name=exp_name)
    if options['use_existing_dir']:
        os.makedirs(run_dir, exist_ok=True)
    else:
        run_dir = _make_sequential_log_dir(run_dir)
    # Record the variant and the repository metadata alongside the logs.
    dump_json(os.path.join(run_dir, 'variant.json'), kwargs)
    git_root_path, metadata = get_metadata()
    dump_json(os.path.join(run_dir, 'metadata.json'), metadata)
    if git_root_path and options['archive_launch_repo']:
        make_launcher_archive(git_root_path=git_root_path, log_dir=run_dir)
    # Wire up every dowel output sink for this run.
    sinks = (dowel.TextOutput(os.path.join(run_dir, 'debug.log')),
             dowel.CsvOutput(os.path.join(run_dir, 'progress.csv')),
             dowel.TensorBoardOutput(run_dir, x_axis='TotalEnvSteps'),
             dowel.StdOutput())
    for sink in sinks:
        logger.add_output(sink)
    logger.push_prefix('[{}] '.format(exp_name))
    logger.log('Logging to {}'.format(run_dir))
    return ExperimentContext(snapshot_dir=run_dir,
                             snapshot_mode=options['snapshot_mode'],
                             snapshot_gap=options['snapshot_gap'])
def restore_training(log_dir, exp_name, args, env_saved=True, env=None):
    """Restore the 'last' snapshot in ``log_dir`` and resume training.

    Args:
        log_dir (str): Directory containing the snapshot to load; also
            used as the log directory for the resumed run.
        exp_name (str): Experiment name, used as the logger prefix.
        args: Parsed command-line arguments. This function reads
            ``eval_during_training``, ``n_eval_episodes``, ``eval_greedy``,
            ``eval_epoch_freq``, ``clip_grad_norm`` and ``n_epochs``.
        env_saved (bool): If True, the environment is taken from the
            snapshot; otherwise the ``env`` argument is used.
        env: Environment to use when ``env_saved`` is False.

    Returns:
        The result of ``runner._algo.train(runner)``.
    """
    # Timestamp (unix seconds) + hostname make per-restore file names, so
    # resumed runs never clobber the original progress.csv/debug.log.
    tabular_log_file = os.path.join(
        log_dir, 'progress_restored.{}.{}.csv'.format(
            str(time.time())[:10], socket.gethostname()))
    text_log_file = os.path.join(
        log_dir, 'debug_restored.{}.{}.log'.format(
            str(time.time())[:10], socket.gethostname()))
    logger.add_output(dowel.TextOutput(text_log_file))
    logger.add_output(dowel.CsvOutput(tabular_log_file))
    logger.add_output(dowel.TensorBoardOutput(log_dir))
    logger.add_output(dowel.StdOutput())
    logger.push_prefix('[%s] ' % exp_name)
    ctxt = ExperimentContext(snapshot_dir=log_dir,
                             snapshot_mode='last',
                             snapshot_gap=1)
    runner = LocalRunnerWrapper(ctxt,
                                eval=args.eval_during_training,
                                n_eval_episodes=args.n_eval_episodes,
                                eval_greedy=args.eval_greedy,
                                eval_epoch_freq=args.eval_epoch_freq,
                                save_env=env_saved)
    # NOTE(review): reaches into the runner's private state
    # (_snapshotter, _setup_args, _train_args, _stats, _algo); presumably
    # mirroring what runner.restore() would do — this will break silently
    # if those internals change. Verify against the runner implementation.
    saved = runner._snapshotter.load(log_dir, 'last')
    runner._setup_args = saved['setup_args']
    runner._train_args = saved['train_args']
    runner._stats = saved['stats']
    # Re-seed from the snapshot so the resumed run stays reproducible.
    set_seed(runner._setup_args.seed)
    algo = saved['algo']
    # Compatibility patch: older snapshots predate the _clip_grad_norm
    # attribute, so backfill it from the current command-line arguments.
    if not hasattr(algo, '_clip_grad_norm'):
        setattr(algo, '_clip_grad_norm', args.clip_grad_norm)
    if env_saved:
        env = saved['env']
    runner.setup(env=env,
                 algo=algo,
                 sampler_cls=runner._setup_args.sampler_cls,
                 sampler_args=runner._setup_args.sampler_args)
    # Resume one epoch after the checkpoint and extend the schedule by
    # args.n_epochs additional epochs.
    runner._train_args.start_epoch = runner._stats.total_epoch + 1
    runner._train_args.n_epochs = runner._train_args.start_epoch + args.n_epochs
    print('\nRestored checkpoint from epoch #{}...'.format(
        runner._train_args.start_epoch))
    print('To be trained for additional {} epochs...'.format(args.n_epochs))
    print('Will be finished at epoch #{}...\n'.format(
        runner._train_args.n_epochs))
    return runner._algo.train(runner)
def _make_context(self, *args, **kwargs):
    """Make a context from the template information and variant args.

    Currently, all arguments should be keyword arguments.

    Args:
        args (list): Should be empty.
        kwargs (dict): Keyword arguments for the wrapped function. Will
            be logged to `variant.json`

    Returns:
        ExperimentContext: The created experiment context.

    Raises:
        ValueError: If args is not empty.

    """
    if args:
        raise ValueError('metarl.experiment currently only supports '
                         'keyword arguments')
    # Resolve the experiment name up front. Previously this happened only
    # when log_dir was None, so runs with an explicit log_dir could end up
    # with self.name still None and a logger prefix of '[None] '.
    if self.name is None:
        self.name = self.function.__name__
    log_dir = self.log_dir
    if log_dir is None:
        # Default layout: ./data/local/<prefix>/<name>/<time>
        # NOTE(review): `timestamp` is a free variable here — presumably a
        # module-level launch timestamp; confirm it is defined at import
        # time, otherwise this branch raises NameError.
        log_dir = '{data}/local/{prefix}/{name}/{time}'.format(
            data=osp.join(os.getcwd(), 'data'),
            prefix=self.prefix,
            name=self.name,
            time=timestamp)
    log_dir = _make_sequential_log_dir(log_dir)
    tabular_log_file = os.path.join(log_dir, 'progress.csv')
    text_log_file = os.path.join(log_dir, 'debug.log')
    variant_log_file = os.path.join(log_dir, 'variant.json')
    # Record the variant so the run is reproducible from its directory.
    dump_json(variant_log_file, kwargs)
    logger.add_output(dowel.TextOutput(text_log_file))
    logger.add_output(dowel.CsvOutput(tabular_log_file))
    logger.add_output(
        dowel.TensorBoardOutput(log_dir, x_axis='TotalEnvSteps'))
    logger.add_output(dowel.StdOutput())
    logger.push_prefix('[%s] ' % self.name)
    return ExperimentContext(snapshot_dir=log_dir,
                             snapshot_mode=self.snapshot_mode,
                             snapshot_gap=self.snapshot_gap)
def run_experiment(argv):
    """Run experiment.

    Args:
        argv (list[str]): Command line arguments.

    Raises:
        BaseException: Propagate any exception in the experiment.
    """
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    # avoid name clashes when running distributed jobs
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_parallel',
        type=int,
        default=1,
        help=('Number of parallel workers to perform rollouts. '
              "0 => don't start any workers"))
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--log_dir',
                        type=str,
                        default=None,
                        help='Path to save the log and iteration snapshot.')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='last',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), "gap" (every'
                        '`snapshot_gap` iterations are saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument('--snapshot_gap',
                        type=int,
                        default=1,
                        help='Gap between snapshot iterations.')
    parser.add_argument(
        '--resume_from_dir',
        type=str,
        default=None,
        help='Directory of the pickle file to resume experiment from.')
    parser.add_argument('--resume_from_epoch',
                        type=str,
                        default=None,
                        help='Index of iteration to restore from. '
                        'Can be "first", "last" or a number. '
                        'Not applicable when snapshot_mode="last"')
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--tensorboard_step_key',
                        type=str,
                        default=None,
                        help='Name of the step key in tensorboard_summary.')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--variant_log_file',
                        type=str,
                        default='variant.json',
                        help='Name of the variant log file (in json).')
    parser.add_argument('--plot',
                        type=ast.literal_eval,
                        default=False,
                        help='Whether to plot the iteration results')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help='Print only the tabular log information (in a horizontal format)')
    parser.add_argument('--seed',
                        type=int,
                        default=None,
                        help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for objects')
    parser.add_argument('--variant_data',
                        type=str,
                        help='Pickled data for variant configuration')
    args = parser.parse_args(argv[1:])
    if args.seed is not None:
        deterministic.set_seed(args.seed)
    if args.n_parallel > 0:
        parallel_sampler.initialize(n_parallel=args.n_parallel)
        if args.seed is not None:
            # Workers need their own seeding, done after initialize().
            parallel_sampler.set_seed(args.seed)
    if not args.plot:
        garage.plotter.Plotter.disable()
        garage.tf.plotter.Plotter.disable()
    if args.log_dir is None:
        log_dir = os.path.join(os.path.join(os.getcwd(), 'data'),
                               args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = os.path.join(log_dir, args.tabular_log_file)
    text_log_file = os.path.join(log_dir, args.text_log_file)
    params_log_file = os.path.join(log_dir, args.params_log_file)
    if args.variant_data is not None:
        # NOTE(review): unpickling command-line-supplied data executes
        # arbitrary code; only safe when the launcher is trusted.
        variant_data = pickle.loads(base64.b64decode(args.variant_data))
        variant_log_file = os.path.join(log_dir, args.variant_log_file)
        dump_variant(variant_log_file, variant_data)
    else:
        variant_data = None
    log_parameters(params_log_file, args)
    logger.add_output(dowel.TextOutput(text_log_file))
    logger.add_output(dowel.CsvOutput(tabular_log_file))
    logger.add_output(dowel.TensorBoardOutput(log_dir, x_axis='TotalEnvSteps'))
    logger.add_output(dowel.StdOutput())
    logger.push_prefix('[%s] ' % args.exp_name)
    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode=args.snapshot_mode,
                                     snapshot_gap=args.snapshot_gap)
    # The experiment entry point itself is shipped as a base64-encoded
    # cloudpickle blob (same trust caveat as variant_data above).
    method_call = cloudpickle.loads(base64.b64decode(args.args_data))
    try:
        method_call(snapshot_config, variant_data, args.resume_from_dir,
                    args.resume_from_epoch)
    except BaseException:
        # On any failure, shut down plotters and sampler workers before
        # re-raising so no child processes are left behind.
        children = garage.plotter.Plotter.get_plotters()
        children += garage.tf.plotter.Plotter.get_plotters()
        if args.n_parallel > 0:
            children += [parallel_sampler]
        child_proc_shutdown(children)
        raise
    logger.remove_all()
    logger.pop_prefix()
This example demonstrates how to log a simple progress metric using dowel. The metric is simultaneously sent to the screen, a CSV files, a text log file and TensorBoard. """ import time import dowel from dowel import logger, tabular logger.add_output(dowel.StdOutput()) logger.add_output(dowel.CsvOutput('progress.csv')) logger.add_output(dowel.TextOutput('progress.txt')) logger.add_output(dowel.TensorBoardOutput('tensorboard_logdir')) logger.log('Starting up...') for i in range(1000): logger.push_prefix('itr {}: '.format(i)) logger.log('Running training step') time.sleep(0.01) # Tensorboard doesn't like output to be too fast. tabular.record('itr', i) tabular.record('loss', 100.0 / (2 + i)) logger.log(tabular) logger.pop_prefix() logger.dump_all() logger.remove_all()
def run_experiment(argv):
    """Run an experiment from pickled, base64-encoded launcher arguments.

    Args:
        argv (list[str]): Command line arguments (typically ``sys.argv``);
            ``argv[0]`` is skipped during parsing.
    """
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    # avoid name clashes when running distributed jobs
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_parallel',
        type=int,
        default=1,
        help=('Number of parallel workers to perform rollouts. '
              "0 => don't start any workers"))
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--log_dir',
                        type=str,
                        default=None,
                        help='Path to save the log and iteration snapshot.')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='last',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), "gap" (every'
                        '`snapshot_gap` iterations are saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument('--snapshot_gap',
                        type=int,
                        default=1,
                        help='Gap between snapshot iterations.')
    parser.add_argument(
        '--resume_from_dir',
        type=str,
        default=None,
        help='Directory of the pickle file to resume experiment from.')
    parser.add_argument('--resume_from_epoch',
                        type=str,
                        default=None,
                        help='Index of iteration to restore from. '
                        'Can be "first", "last" or a number. '
                        'Not applicable when snapshot_mode="last"')
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--tensorboard_step_key',
                        type=str,
                        default=None,
                        help='Name of the step key in tensorboard_summary.')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--variant_log_file',
                        type=str,
                        default='variant.json',
                        help='Name of the variant log file (in json).')
    parser.add_argument('--plot',
                        type=ast.literal_eval,
                        default=False,
                        help='Whether to plot the iteration results')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help='Print only the tabular log information (in a horizontal format)')
    parser.add_argument('--seed',
                        type=int,
                        default=None,
                        help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for objects')
    parser.add_argument('--variant_data',
                        type=str,
                        help='Pickled data for variant configuration')
    parser.add_argument('--use_cloudpickle',
                        type=ast.literal_eval,
                        default=False)
    args = parser.parse_args(argv[1:])
    if args.seed is not None:
        deterministic.set_seed(args.seed)
    # SIGINT is blocked for all processes created in parallel_sampler to avoid
    # the creation of sleeping and zombie processes.
    #
    # If the user interrupts run_experiment, there's a chance some processes
    # won't die due to a dead lock condition where one of the children in the
    # parallel sampler exits without releasing a lock once after it catches
    # SIGINT.
    #
    # Later the parent tries to acquire the same lock to proceed with his
    # cleanup, but it remains sleeping waiting for the lock to be released.
    # In the meantime, all the process in parallel sampler remain in the zombie
    # state since the parent cannot proceed with their clean up.
    with mask_signals([signal.SIGINT]):
        if args.n_parallel > 0:
            parallel_sampler.initialize(n_parallel=args.n_parallel)
            if args.seed is not None:
                # Workers need their own seeding, done after initialize().
                parallel_sampler.set_seed(args.seed)
    if not args.plot:
        garage.plotter.Plotter.disable()
        garage.tf.plotter.Plotter.disable()
    if args.log_dir is None:
        log_dir = os.path.join(os.path.join(os.getcwd(), 'data'),
                               args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = os.path.join(log_dir, args.tabular_log_file)
    text_log_file = os.path.join(log_dir, args.text_log_file)
    params_log_file = os.path.join(log_dir, args.params_log_file)
    if args.variant_data is not None:
        # NOTE(review): unpickling command-line-supplied data executes
        # arbitrary code; only safe when the launcher is trusted.
        variant_data = pickle.loads(base64.b64decode(args.variant_data))
        variant_log_file = os.path.join(log_dir, args.variant_log_file)
        dump_variant(variant_log_file, variant_data)
    else:
        variant_data = None
    if not args.use_cloudpickle:
        log_parameters(params_log_file, args)
    logger.add_output(dowel.TextOutput(text_log_file))
    logger.add_output(dowel.CsvOutput(tabular_log_file))
    logger.add_output(dowel.TensorBoardOutput(log_dir))
    logger.add_output(dowel.StdOutput())
    logger.push_prefix('[%s] ' % args.exp_name)
    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode=args.snapshot_mode,
                                     snapshot_gap=args.snapshot_gap)
    # read from stdin
    if args.use_cloudpickle:
        # Modern path: the experiment entry point is a base64-encoded
        # cloudpickle blob called with the snapshot/variant/resume info.
        import cloudpickle
        method_call = cloudpickle.loads(base64.b64decode(args.args_data))
        try:
            method_call(snapshot_config, variant_data, args.resume_from_dir,
                        args.resume_from_epoch)
        except BaseException:
            # On any failure, shut down plotters and sampler workers before
            # re-raising so no child processes are left behind.
            children = garage.plotter.Plotter.get_plotters()
            children += garage.tf.plotter.Plotter.get_plotters()
            if args.n_parallel > 0:
                children += [parallel_sampler]
            child_proc_shutdown(children)
            raise
    else:
        # Legacy path: args_data is an ordinary pickle; concretize() and
        # drain the iterable to execute it (semantics live in concretize).
        data = pickle.loads(base64.b64decode(args.args_data))
        maybe_iter = concretize(data)
        if is_iterable(maybe_iter):
            for _ in maybe_iter:
                pass
    logger.remove_all()
    logger.pop_prefix()