def make_config(params):
  """Assemble the full experiment config from user params and preset defaults.

  Args:
    params: tools.AttrDict of user overrides; mutated to back-fill defaults
        and to track which keys were accessed.

  Returns:
    Fully populated config AttrDict.

  Raises:
    KeyError: If any user-provided override was never read by a builder.
  """
  config = tools.AttrDict()
  config.debug = params.get('debug', False)
  # Back-fill every key of the requested preset(s) that the user did not
  # override explicitly.
  with params.unlocked:
    for preset in params.get('defaults', ['dreamer']):
      for key, value in DEFAULTS[preset].items():
        if key not in params:
          params[key] = value
  config.loss_scales = tools.AttrDict()
  # Each section builder reads its params and extends the config.
  for build_section in (
      _data_processing, _model_components, _tasks, _loss_functions,
      _training_schedule):
    config = build_section(config, params)
  # Touch params that are only read at run-time so the unused-override check
  # below does not flag them.
  for key in (
      'planner_discount', 'planner_lambda', 'objective_entropy_scale',
      'normalize_actions', 'max_length', 'render_size', 'atari_lifes',
      'atari_noops', 'atari_sticky', 'atari_train_max_length',
      'atari_grayscale'):
    params.get(key, None)
  if params.untouched:
    message = 'Found unused config overrides: {}'
    raise KeyError(message.format(', '.join(params.untouched)))
  return config
def _active_collection(tasks, collects, defaults, config, params):
  """Build the dict of active data-collection simulations.

  Args:
    tasks: Iterable of task objects to collect on.
    collects: List of user override dicts, one per collection variant.
    defaults: AttrDict of default collection settings.
    config: Experiment config (used to define the planner).
    params: User params forwarded to planner and objective.

  Returns:
    AttrDict mapping unique '<prefix>_<name>_<task>' keys to collect configs.

  Raises:
    KeyError: If an override contains a key the defaults do not define.
  """
  sims = tools.AttrDict()
  for task in tasks:
    for override in collects:
      # Reject overrides for keys that the defaults do not define.
      for key in override:
        if key not in defaults:
          message = 'Invalid key {} in activation collection config.'
          raise KeyError(message.format(key))
      collect = tools.AttrDict(defaults, _unlocked=True)
      collect.update(override)
      collect.planner = _define_planner(
          collect.planner, collect.horizon, config, params)
      collect.objective = tools.bind(
          getattr(objectives_lib, collect.objective), params=params)
      if collect.give_rewards:
        collect.task = task
      else:
        # Hide rewards from the agent by wrapping the environment.
        env_ctor = tools.bind(
            lambda ctor: control.wrappers.NoRewardHint(ctor()),
            task.env_ctor)
        collect.task = tasks_lib.Task(task.name, env_ctor)
      noise_schedule = tools.bind(
          tools.schedule.linear,
          ramp=collect.action_noise_ramp,
          min=collect.action_noise_min)
      collect.exploration = tools.AttrDict(
          scale=collect.action_noise_scale,
          type=collect.action_noise_type,
          schedule=noise_schedule,
          factors=collect.action_noise_factors)
      name = '{}_{}_{}'.format(collect.prefix, collect.name, task.name)
      assert name not in sims, (set(sims.keys()), name)
      sims[name] = collect
  return sims
def test_dm_control_thread(self):
  """Smoke test: Dreamer on a dm_control task with thread-isolated envs."""
  params = tools.AttrDict(
      defaults=['dreamer', 'debug'],
      tasks=['cup_catch'],
      isolate_envs='thread',
      max_steps=30)
  args = tools.AttrDict(
      logdir=self.get_temp_dir(),
      num_runs=1,
      params=params,
      ping_every=0,
      resume_runs=False)
  train.main(args)
def test_planet(self):
  """Smoke test: PlaNet defaults on the dummy task with a short horizon."""
  params = tools.AttrDict(
      defaults=['planet', 'debug'],
      tasks=['dummy'],
      isolate_envs='none',
      max_steps=30,
      planner_horizon=3)
  args = tools.AttrDict(
      logdir=self.get_temp_dir(),
      num_runs=1,
      params=params,
      ping_every=0,
      resume_runs=False)
  train.main(args)
def test_no_value(self):
  """Smoke test: actor-only defaults (no value head) on the dummy task."""
  params = tools.AttrDict(
      defaults=['actor', 'debug'],
      tasks=['dummy'],
      isolate_envs='none',
      max_steps=30,
      imagination_horizon=3)
  args = tools.AttrDict(
      logdir=self.get_temp_dir(),
      num_runs=1,
      params=params,
      ping_every=0,
      resume_runs=False)
  train.main(args)
def test_atari_thread(self):
  """Smoke test: Dreamer on Atari with discrete actions and epsilon-greedy."""
  params = tools.AttrDict(
      defaults=['dreamer', 'debug'],
      tasks=['atari_pong'],
      isolate_envs='thread',
      action_head_dist='onehot_score',
      action_noise_type='epsilon_greedy',
      max_steps=30)
  args = tools.AttrDict(
      logdir=self.get_temp_dir(),
      num_runs=1,
      params=params,
      ping_every=0,
      resume_runs=False)
  train.main(args)
def test_dreamer(self):
  """Smoke test: full Dreamer setup with explicit planner and head options."""
  params = tools.AttrDict(
      defaults=['dreamer', 'debug'],
      tasks=['dummy'],
      isolate_envs='none',
      max_steps=30,
      train_planner='policy_sample',
      test_planner='policy_mode',
      planner_objective='reward_value',
      action_head=True,
      value_head=True,
      imagination_horizon=3)
  args = tools.AttrDict(
      logdir=self.get_temp_dir(),
      num_runs=1,
      params=params,
      ping_every=0,
      resume_runs=False)
  train.main(args)
def _initial_collection(config, params):
  """Configure collection of random seed episodes before training starts.

  Args:
    config: Config with train/test task lists and episode directories.
    params: User params for seed episode counts and reward visibility.

  Returns:
    AttrDict mapping '(train|test)-<task>' keys to collection configs.
  """
  num_episodes = int(params.get('num_seed_episodes', 5))
  num_steps = int(params.get('num_seed_steps', 2500))
  sims = tools.AttrDict()
  for task in config.train_tasks:
    sims['train-' + task.name] = tools.AttrDict(
        task=task,
        mode='train',
        save_episode_dir=config.train_dir,
        num_episodes=num_episodes,
        num_steps=num_steps,
        give_rewards=params.get('seed_episode_rewards', True))
  for task in config.test_tasks:
    # Test episodes always keep their rewards so scores can be computed.
    sims['test-' + task.name] = tools.AttrDict(
        task=task,
        mode='test',
        save_episode_dir=config.test_dir,
        num_episodes=num_episodes,
        num_steps=num_steps,
        give_rewards=True)
  return sims
def __init__(self, logdir, config=None):
  """Initialize training-loop state and graph placeholders.

  Args:
    logdir: Directory used for checkpoints and summaries.
    config: Optional tools.AttrDict of loop options; defaults to empty.
  """
  self._logdir = logdir
  self._global_step = tf.train.get_or_create_global_step()
  # Placeholders fed by the outer loop on every session run.
  self._step = tf.placeholder(tf.int32, name='step')
  self._phase = tf.placeholder(tf.string, name='phase')
  self._log = tf.placeholder(tf.bool, name='log')
  self._report = tf.placeholder(tf.bool, name='report')
  self._reset = tf.placeholder(tf.bool, name='reset')
  # Registered phases; presumably filled later by an add_phase method —
  # TODO confirm against the rest of the class.
  self._phases = []
  # Checkpointing.
  self._loaders = []
  self._savers = []
  self._logdirs = []
  self._checkpoints = []
  self._config = config or tools.AttrDict()
def simulate(metrics, config, params, graph, cleanups, gif_summary, name):
  """Build graph ops that run environment interaction for one collection.

  Args:
    metrics: Metrics writer with add_scalars/set_tags-style interface.
    config: Experiment config (preprocess/postprocess fns, env isolation).
    params: Collection config (task, planner, objective, episode counts).
    graph: AttrDict holding the model graph (cell, encoder, step).
    cleanups: List mutated in place; an env-closing callback is appended.
    gif_summary: Whether to also emit a GIF summary of collected frames.
    name: Variable-scope and metric-prefix name for this collection.

  Returns:
    Tuple of (merged summary tensor, mean score tensor).
  """
  def env_ctor():
    env = params.task.env_ctor()
    if params.save_episode_dir:
      # Persist collected episodes to disk for later training batches.
      env = control.wrappers.CollectDataset(env, params.save_episode_dir)
    return env
  bind_or_none = lambda x, **kw: x and functools.partial(x, **kw)
  cell = graph.cell
  agent_config = tools.AttrDict(
      cell=cell,
      encoder=graph.encoder,
      planner=functools.partial(params.planner, graph=graph),
      objective=bind_or_none(params.objective, graph=graph),
      exploration=params.exploration,
      preprocess_fn=config.preprocess_fn,
      postprocess_fn=config.postprocess_fn)
  # Merge in both directions so params and agent_config each contain the
  # union of keys; the copy keeps the caller's params untouched.
  params = params.copy()
  with params.unlocked:
    params.update(agent_config)
  with agent_config.unlocked:
    agent_config.update(params)
  with tf.variable_scope(name):
    summaries = []
    env = control.create_batch_env(
        env_ctor, params.num_envs, config.isolate_envs)
    agent = control.MPCAgent(env, graph.step, False, False, agent_config)
    cleanup = lambda: env.close()
    scores, lengths, data = control.simulate(
        agent, env, params.num_episodes, params.num_steps)
    summaries.append(tf.summary.scalar('return', scores[0]))
    summaries.append(tf.summary.scalar('length', lengths[0]))
    if gif_summary:
      summaries.append(
          tools.gif_summary('gif', data['image'], max_outputs=1, fps=20))
    write_metrics = [
        metrics.add_scalars(name + '/return', scores),
        metrics.add_scalars(name + '/length', lengths),
        # metrics.add_tensor(name + '/frames', data['image']),
    ]
    # Merging under control_dependencies forces the metric writes to run
    # whenever the summary is evaluated.
    with tf.control_dependencies(write_metrics):
      summary = tf.summary.merge(summaries)
  cleanups.append(cleanup)
  # Work around tf.cond() tensor return type.
  return summary, tf.reduce_mean(scores)
def _training_schedule(config, params):
  """Define step budgets, checkpointing, and data-collection schedules.

  Args:
    config: Partially built config; must already contain batch_shape and the
        train/test task lists (used by _initial_collection).
    params: User params; every key read here is marked as used.

  Returns:
    The updated config with random_collects, train_collects, test_collects.
  """
  config.train_steps = int(params.get('train_steps', 50000))
  config.test_steps = int(params.get('test_steps', config.batch_shape[0]))
  config.max_steps = int(params.get('max_steps', 5e7))
  config.train_log_every = params.get('train_log_every', config.train_steps)
  config.train_checkpoint_every = None
  config.test_checkpoint_every = int(
      params.get('checkpoint_every', 10 * config.test_steps))
  config.checkpoint_to_load = None
  config.savers = [tools.AttrDict(exclude=(r'.*_temporary.*', ))]
  config.print_metrics_every = config.train_steps // 10
  config.train_dir = os.path.join(params.logdir, 'train_episodes')
  config.test_dir = os.path.join(params.logdir, 'test_episodes')
  config.random_collects = _initial_collection(config, params)
  # Settings shared between the train and test collections.
  defaults = tools.AttrDict()
  defaults.name = 'main'
  defaults.give_rewards = True
  defaults.horizon = params.get('planner_horizon', 12)
  defaults.objective = params.get('planner_objective', 'reward_value')
  defaults.num_envs = params.get('num_envs', 1)
  defaults.num_episodes = params.get('collect_episodes', defaults.num_envs)
  defaults.num_steps = params.get('collect_steps', 500)
  defaults.steps_after = params.get('collect_every', 5000)
  defaults.steps_every = params.get('collect_every', 5000)
  defaults.steps_until = -1
  defaults.action_noise_type = params.get(
      'action_noise_type', 'additive_normal')
  # Train-time collection: stochastic planner with exploration noise.
  train_defaults = defaults.copy(_unlocked=True)
  train_defaults.prefix = 'train'
  train_defaults.mode = 'train'
  train_defaults.save_episode_dir = config.train_dir
  train_defaults.planner = params.get('train_planner', 'policy_sample')
  train_defaults.objective = params.get(
      'train_planner_objective', defaults.objective)
  train_defaults.action_noise_scale = params.get('train_action_noise', 0.3)
  train_defaults.action_noise_ramp = params.get('train_action_noise_ramp', 0)
  train_defaults.action_noise_min = params.get('train_action_noise_min', 0.0)
  train_defaults.action_noise_factors = params.get(
      'train_action_noise_factors', [])
  config.train_collects = _active_collection(
      config.train_tasks, params.get('train_collects', [{}]), train_defaults,
      config, params)
  # Test-time collection: deterministic planner, no exploration noise.
  test_defaults = defaults.copy(_unlocked=True)
  test_defaults.prefix = 'test'
  test_defaults.mode = 'test'
  test_defaults.save_episode_dir = config.test_dir
  test_defaults.planner = params.get('test_planner', 'policy_mode')
  test_defaults.objective = params.get(
      'test_planner_objective', defaults.objective)
  test_defaults.action_noise_scale = params.get('test_action_noise', 0.0)
  test_defaults.action_noise_ramp = 0
  test_defaults.action_noise_min = 0.0
  # Bug fix: this previously read 'train_action_noise_factors', making it
  # impossible to configure test-specific noise factors. Read the test key
  # first and fall back to the train value so existing configs that only set
  # train_action_noise_factors behave as before.
  test_defaults.action_noise_factors = params.get(
      'test_action_noise_factors',
      params.get('train_action_noise_factors', None))
  config.test_collects = _active_collection(
      config.test_tasks, params.get('test_collects', [{}]), test_defaults,
      config, params)
  return config
def _loss_functions(config, params):
  """Configure loss scales, actor/value learning options, and optimizers.

  Args:
    config: Config that already contains heads and gradient_heads.
    params: User params; keys read here are marked as used.

  Returns:
    The updated config.
  """
  for head in config.gradient_heads:
    assert head in config.heads, head
  get = params.get
  # Imagination rollouts used to train the actor and value heads.
  config.imagination_horizon = get('imagination_horizon', 15)
  config.imagination_skip_last = get('imagination_skip_last', None)
  config.imagination_include_initial = get(
      'imagination_include_initial', True)
  # Actor (action head) learning.
  config.action_source = get('action_source', 'model')
  config.action_model_horizon = get('action_model_horizon', None)
  config.action_bootstrap = get('action_bootstrap', True)
  config.action_discount = get('action_discount', 0.99)
  config.action_lambda = get('action_lambda', 0.95)
  config.action_target_update = get('action_target_update', 1)
  config.action_target_period = get('action_target_period', 50000)
  config.action_loss_pcont = get('action_loss_pcont', False)
  config.action_pcont_stop_grad = get('action_pcont_stop_grad', False)
  config.action_pcont_weight = get('action_pcont_weight', True)
  # Value head learning.
  config.value_source = get('value_source', 'model')
  config.value_model_horizon = get('value_model_horizon', None)
  config.value_discount = get('value_discount', 0.99)
  config.value_lambda = get('value_lambda', 0.95)
  config.value_bootstrap = get('value_bootstrap', True)
  config.value_target_update = get('value_target_update', 1)
  config.value_target_period = get('value_target_period', 50000)
  config.value_loss_pcont = get('value_loss_pcont', False)
  config.value_pcont_weight = get('value_pcont_weight', True)
  config.value_maxent = get('value_maxent', False)
  # Regularizers and miscellaneous knobs.
  config.action_beta = get('action_beta', 0.0)
  config.action_beta_dims_value = get('action_beta_dims_value', None)
  config.state_beta = get('state_beta', 0.0)
  config.stop_grad_pre_action = get('stop_grad_pre_action', True)
  config.pcont_label_weight = get('pcont_label_weight', None)
  # Loss scales, including one per trained head.
  config.loss_scales.divergence = get('divergence_scale', 1.0)
  config.loss_scales.global_divergence = get('global_div_scale', 0.0)
  config.loss_scales.overshooting = get('overshooting_scale', 0.0)
  for head in config.heads:
    if head in ('value_target', 'action_target'):
      continue  # Target networks are tracked copies, not trained.
    config.loss_scales[head] = get(head + '_loss_scale', 1.0)
  config.free_nats = get('free_nats', 3.0)
  config.overshooting_distance = get('overshooting_distance', 0)
  config.os_stop_posterior_grad = get('os_stop_posterior_grad', True)
  config.cpc_contrast = get('cpc_contrast', 'window')
  config.cpc_batch_amount = get('cpc_batch_amount', 10)
  config.cpc_time_amount = get('cpc_time_amount', 30)
  # Optimizers: one shared default plus per-objective overrides.
  optimizer_cls = tools.bind(
      tf.train.AdamOptimizer, epsilon=get('optimizer_epsilon', 1e-4))
  config.optimizers = tools.AttrDict()
  config.optimizers.default = tools.bind(
      tools.CustomOptimizer,
      optimizer_cls=optimizer_cls,
      # schedule=tools.bind(tools.schedule.linear, ramp=0),
      learning_rate=get('default_lr', 1e-3),
      clipping=get('default_gradient_clipping', 1000.0))
  for opt_name, default_lr, default_clip in (
      ('model', 6e-4, 100.0),
      ('value', 8e-5, 100.0),
      ('action', 8e-5, 100.0)):
    config.optimizers[opt_name] = config.optimizers.default.copy(
        learning_rate=get(opt_name + '_lr', default_lr),
        clipping=get(opt_name + '_gradient_clipping', default_clip))
  return config
def _model_components(config, params):
  """Configure the encoder, prediction heads, and latent dynamics model.

  Args:
    config: Config AttrDict under construction.
    params: User params; keys read here are marked as used.

  Returns:
    The updated config with encoder, heads, and cell constructors bound.

  Raises:
    NotImplementedError: For unknown encoder, image_head, or model choices.
  """
  config.gradient_heads = params.get('gradient_heads', ['image', 'reward'])
  config.activation = ACTIVATIONS[params.get('activation', 'elu')]
  config.num_layers = params.get('num_layers', 3)
  config.num_units = params.get('num_units', 400)
  # Observation encoder: convolutional for images, MLP for proprioception.
  encoder = params.get('encoder', 'conv')
  if encoder == 'conv':
    config.encoder = networks.conv.encoder
  elif encoder == 'proprio':
    config.encoder = tools.bind(
        networks.proprio.encoder,
        keys=params.get('proprio_encoder_keys'),
        num_layers=params.get('proprio_encoder_num_layers', 3),
        units=params.get('proprio_encoder_units', 300))
  else:
    raise NotImplementedError(encoder)
  # Shared feed-forward template that the individual heads specialize.
  config.head_network = tools.bind(
      networks.feed_forward,
      num_layers=config.num_layers,
      units=config.num_units,
      activation=config.activation)
  config.heads = tools.AttrDict()
  if params.get('value_head', True):
    config.heads.value = tools.bind(
        config.head_network,
        num_layers=params.get('value_layers', 3),
        data_shape=[],
        dist=params.get('value_dist', 'normal'))
  if params.get('value_target_head', False):
    # Target value network: same shape as the value head but not trained
    # through gradients (stop_gradient=True); weights are tracked elsewhere.
    config.heads.value_target = tools.bind(
        config.head_network,
        num_layers=params.get('value_layers', 3),
        data_shape=[],
        stop_gradient=True,
        dist=params.get('value_dist', 'normal'))
  if params.get('return_head', False):
    config.heads['return'] = tools.bind(
        config.head_network, activation=config.activation)
  if params.get('action_head', True):
    config.heads.action = tools.bind(
        config.head_network,
        num_layers=params.get('action_layers', 4),
        mean_activation=ACTIVATIONS[
            params.get('action_mean_activation', 'none')],
        dist=params.get('action_head_dist', 'tanh_normal_tanh'),
        std=params.get('action_head_std', 'learned'),
        min_std=params.get('action_head_min_std', 1e-4),
        init_std=params.get('action_head_init_std', 5.0))
  if params.get('action_target_head', False):
    # Target action network mirrors the action head, gradients stopped.
    config.heads.action_target = tools.bind(
        config.head_network,
        num_layers=params.get('action_layers', 4),
        stop_gradient=True,
        mean_activation=ACTIVATIONS[
            params.get('action_mean_activation', 'none')],
        dist=params.get('action_head_dist', 'tanh_normal_tanh'),
        std=params.get('action_head_std', 'learned'),
        min_std=params.get('action_head_min_std', 1e-4),
        init_std=params.get('action_head_init_std', 5.0))
  if params.get('cpc_head', False):
    config.heads.cpc = config.head_network.copy(
        dist=params.get('cpc_head_dist', 'normal'),
        std=params.get('cpc_head_std', 'learned'),
        num_layers=params.get('cpc_head_layers', 3))
  image_head = params.get('image_head', 'conv')
  if image_head == 'conv':
    config.heads.image = tools.bind(
        networks.conv.decoder,
        std=params.get('image_head_std', 1.0))
  else:
    raise NotImplementedError(image_head)
  # Latent dynamics model (RSSM); positional args follow models.RSSM's
  # constructor order — confirm against its signature before reordering.
  hidden_size = params.get('model_size', 200)
  state_size = params.get('state_size', 30)
  model = params.get('model', 'rssm')
  if model == 'rssm':
    config.cell = tools.bind(
        models.RSSM, state_size, hidden_size, hidden_size,
        params.get('future_rnn', True),
        params.get('mean_only', False),
        params.get('min_stddev', 1e-1),
        config.activation,
        params.get('model_layers', 1),
        params.get('rssm_model', 'gru'),
        params.get('trxl_layer', 2),
        params.get('trxl_n_head', 10),
        params.get('trxl_mem_len', 8),
        params.get('trxl_pre_lnorm', False),
        params.get('trxl_gate', 'plus'))
  else:
    raise NotImplementedError(model)
  return config
def define_model(logdir, metrics, data, trainer, config):
  """Build the full TF1 compute graph: model, losses, collection, summaries.

  Args:
    logdir: Log directory (unused directly here; kept for interface parity).
    metrics: Metrics writer used for tagging and scalar logging.
    data: Batched dataset tensors (must contain 'action'; heads read their
        matching keys).
    trainer: Training loop exposing step, global_step, phase, log.
    config: Fully built experiment config.

  Returns:
    Tuple of (score tensor, merged summaries tensor, cleanups list).
  """
  print('Build TensorFlow compute graph.')
  dependencies = []
  cleanups = []
  step = trainer.step
  global_step = trainer.global_step
  phase = trainer.phase
  timestamp = tf.py_func(
      lambda: datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%S'),
      [], tf.string)
  dependencies.append(metrics.set_tags(
      global_step=global_step, step=step, phase=phase, time=timestamp))
  # Instantiate network blocks. Note, this initialization would be expensive
  # when using tf.function since it would run at every step.
  try:
    cell = config.cell()
  except TypeError:
    # Some cells need the action size, which is only known from the data.
    cell = config.cell(action_size=data['action'].shape[-1].value)
  kwargs = dict(create_scope_now_=True)
  encoder = tf.make_template('encoder', config.encoder, **kwargs)
  heads = tools.AttrDict(_unlocked=True)
  raw_dummy_features = cell.features_from_state(
      cell.zero_state(1, tf.float32))[:, None]
  for key, head in config.heads.items():
    name = 'head_{}'.format(key)
    kwargs = dict(create_scope_now_=True)
    if key in data:
      kwargs['data_shape'] = data[key].shape[2:].as_list()
    if key == 'action_target':
      kwargs['data_shape'] = data['action'].shape[2:].as_list()
    if key == 'cpc':
      kwargs['data_shape'] = [cell.feature_size]
      dummy_features = encoder(data)[:1, :1]
    else:
      dummy_features = raw_dummy_features
    heads[key] = tf.make_template(name, head, **kwargs)
    heads[key](dummy_features)  # Initialize weights.
  # Update target networks by periodically tracking the online weights.
  if 'value_target' in heads:
    dependencies.append(tools.track_network(
        trainer, config.batch_shape[0],
        r'.*/head_value/.*', r'.*/head_value_target/.*',
        config.value_target_period, config.value_target_update))
  if 'value_target_2' in heads:
    dependencies.append(tools.track_network(
        trainer, config.batch_shape[0],
        r'.*/head_value/.*', r'.*/head_value_target_2/.*',
        config.value_target_period, config.value_target_update))
  if 'action_target' in heads:
    dependencies.append(tools.track_network(
        trainer, config.batch_shape[0],
        r'.*/head_action/.*', r'.*/head_action_target/.*',
        config.action_target_period, config.action_target_update))
  # Apply and optimize model.
  embedded = encoder(data)
  with tf.control_dependencies(dependencies):
    embedded = tf.identity(embedded)
  # NOTE: locals() snapshot — renaming any local above would change the
  # keys that downstream code reads from this graph AttrDict.
  graph = tools.AttrDict(locals())
  prior, posterior = tools.unroll.closed_loop(
      cell, embedded, data['action'], config.debug)
  objectives = utility.compute_objectives(
      posterior, prior, data, graph, config)
  summaries, grad_norms = utility.apply_optimizers(
      objectives, trainer, config)
  dependencies += summaries
  # Active data collection.
  with tf.variable_scope('collection'):
    with tf.control_dependencies(dependencies):  # Make sure to train first.
      for name, params in config.train_collects.items():
        # Binary schedule gates collection to the configured step windows.
        schedule = tools.schedule.binary(
            step, config.batch_shape[0],
            params.steps_after, params.steps_every, params.steps_until)
        summary, _ = tf.cond(
            tf.logical_and(tf.equal(trainer.phase, 'train'), schedule),
            functools.partial(
                utility.simulate, metrics, config, params, graph, cleanups,
                gif_summary=False, name=name),
            lambda: (tf.constant(''), tf.constant(0.0)),
            name='should_collect_' + name)
        summaries.append(summary)
        dependencies.append(summary)
  # Compute summaries.
  graph = tools.AttrDict(locals())
  summary, score = tf.cond(
      trainer.log,
      lambda: define_summaries.define_summaries(graph, config, cleanups),
      lambda: (tf.constant(''), tf.zeros((0, ), tf.float32)),
      name='summaries')
  summaries = tf.summary.merge([summaries, summary])
  dependencies.append(utility.print_metrics(
      {ob.name: ob.value for ob in objectives},
      step, config.print_metrics_every, 2, 'objectives'))
  dependencies.append(utility.print_metrics(
      grad_norms, step, config.print_metrics_every, 2, 'grad_norms'))
  dependencies.append(tf.cond(trainer.log, metrics.flush, tf.no_op))
  with tf.control_dependencies(dependencies):
    score = tf.identity(score)
  return score, summaries, cleanups
  # Tail of main(): run every configured experiment run to completion.
  experiment = training.Experiment(
      args.logdir,
      process_fn=functools.partial(process, args=args),
      num_runs=args.num_runs,
      ping_every=args.ping_every,
      resume_runs=args.resume_runs)
  for run in experiment:
    for unused_score in run:
      pass


if __name__ == '__main__':
  # Parse 'True'/'False' command-line strings into real booleans.
  boolean = lambda x: bool(['False', 'True'].index(x))
  parser = argparse.ArgumentParser()
  parser.add_argument('--logdir', type=pathlib.Path, required=True)
  parser.add_argument('--params', default='{}')
  parser.add_argument('--num_runs', type=int, default=1)
  parser.add_argument('--ping_every', type=int, default=0)
  parser.add_argument('--resume_runs', type=boolean, default=True)
  parser.add_argument('--dmlab_runfiles_path', default=None)
  args_, remaining = parser.parse_known_args()
  # Params arrive as YAML; '#' stands in for ',' so shells need no quoting,
  # and stray backslashes are stripped before parsing.
  params_ = args_.params.replace('#', ',').replace('\\', '')
  args_.params = tools.AttrDict(yaml.safe_load(params_))
  if args_.dmlab_runfiles_path:
    with args_.params.unlocked:
      args_.params.dmlab_runfiles_path = args_.dmlab_runfiles_path
    assert args_.params.dmlab_runfiles_path  # Mark as accessed.
  args_.logdir = args_.logdir and os.path.expanduser(args_.logdir)
  # Forward unparsed flags to tf.app.run with the program name restored.
  remaining.insert(0, sys.argv[0])
  tf.app.run(lambda _: main(args_), remaining)