def imagine_forward(
    initial_state, distance, graph, config, policy,
    stop_grad_post_action=True, stop_grad_pre_action=True,
    return_actions=False):
  # Roll out the latent dynamics for `distance` steps under `policy`, starting
  # from every (batch, time) entry of `initial_state` flattened into one batch.
  extended_batch = np.prod(tools.shape(
      tools.nested.flatten(initial_state)[0])[:2])
  obs = tf.zeros([extended_batch] + list(graph.embedded.shape[2:]))
  use_obs = tf.zeros([extended_batch, 1], tf.bool)
  new_shape = lambda t: [
      tf.reduce_prod(tools.shape(t)[:2])] + tools.shape(t)[2:]
  initial_state = tools.nested.map(
      lambda tensor: tf.reshape(tensor, new_shape(tensor)), initial_state)

  def step_fn(state_action, index):
    # Predict the next latent state from the previous one, ignoring
    # observations (use_obs is all False).
    prev = state_action[0]
    feature = graph.cell.features_from_state(prev)
    if stop_grad_pre_action:
      feature = tf.stop_gradient(feature)
    action = policy(feature).sample()
    if stop_grad_post_action:
      action = tf.stop_gradient(action)
    (_, state), _ = graph.cell((obs, action, use_obs), prev)
    return [state, action]

  action_shape = graph.data['action'].shape
  dummy_action = tf.zeros(
      [int(action_shape[0] * action_shape[1]), action_shape[2]],
      dtype=tf.float32)
  res = tf.scan(
      step_fn, tf.range(distance), [initial_state, dummy_action],
      back_prop=True)
  states, actions = res[0], res[1]
  # Move the scan (time) axis behind the batch axis.
  states = tools.nested.map(lambda x: tf.transpose(x, [1, 0, 2]), states)
  actions = tools.nested.map(lambda x: tf.transpose(x, [1, 0, 2]), actions)
  if return_actions:
    return states, actions
  else:
    return states
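
# Usage sketch (hedged): compute_objectives below uses this helper to produce
# imagined rollouts for the value and action losses, roughly as follows:
#
#   states, actions = imagine_forward(
#       posterior, config.imagination_horizon, graph, config,
#       graph.heads.action, stop_grad_post_action=False, return_actions=True)
#   features = graph.cell.features_from_state(states)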
def one_step_model(
    state, prev_action, data_shape, model_width_factor,
    max_objective=False, dist='deterministic'):
  num_layers = 2
  activation = tf.nn.relu
  units = data_shape[0] * model_width_factor
  state = tf.stop_gradient(state)
  prev_action = tf.stop_gradient(prev_action)
  inputs = tf.concat([state, prev_action], -1)
  for _ in range(num_layers):
    hidden = tf.layers.dense(inputs, units, activation)
    inputs = tf.concat([hidden, prev_action], -1)
  mean = tf.layers.dense(inputs, int(np.prod(data_shape)), None)
  mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
  if max_objective:
    min_std = 1e-2
    init_std = 1.0
    std = tf.layers.dense(inputs, int(np.prod(data_shape)), None)
    init_std = np.log(np.exp(init_std) - 1)
    std = tf.nn.softplus(std + init_std) + min_std
    std = tf.reshape(std, tools.shape(state)[:-1] + data_shape)
    dist = tfd.Normal(mean, std)
    dist = tfd.Independent(dist, len(data_shape))
  else:
    dist = tfd.Deterministic(mean)
    dist = tfd.Independent(dist, len(data_shape))
  return dist
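
# Usage sketch (hedged): define_model below wraps each ensemble member in a
# tf.make_template and compute_objectives queries it as
# graph.one_step_models[mdl](input_state, action).mean(); the shapes and
# partial application below are illustrative only, not values from a config.
#
#   model = functools.partial(
#       one_step_model, data_shape=[1024], model_width_factor=2)
#   prediction = model(state, prev_action).mean()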
def feed_forward(
    features, data_shape, num_layers=2, activation=tf.nn.relu,
    mean_activation=None, stop_gradient=False, trainable=True, units=100,
    std=1.0, low=-1.0, high=1.0, dist='normal', min_std=1e-2, init_std=1.0):
  hidden = features
  if stop_gradient:
    hidden = tf.stop_gradient(hidden)
  for _ in range(num_layers):
    hidden = tf.layers.dense(hidden, units, activation, trainable=trainable)
  mean = tf.layers.dense(
      hidden, int(np.prod(data_shape)), mean_activation, trainable=trainable)
  mean = tf.reshape(mean, tools.shape(features)[:-1] + data_shape)
  if std == 'learned':
    std = tf.layers.dense(
        hidden, int(np.prod(data_shape)), None, trainable=trainable)
    init_std = np.log(np.exp(init_std) - 1)
    std = tf.nn.softplus(std + init_std) + min_std
    std = tf.reshape(std, tools.shape(features)[:-1] + data_shape)
  if dist == 'normal':
    dist = tfd.Normal(mean, std)
    dist = tfd.Independent(dist, len(data_shape))
  elif dist == 'deterministic':
    dist = tfd.Deterministic(mean)
    dist = tfd.Independent(dist, len(data_shape))
  elif dist == 'binary':
    dist = tfd.Bernoulli(mean)
    dist = tfd.Independent(dist, len(data_shape))
  elif dist == 'trunc_normal':
    # https://www.desmos.com/calculator/rnksmhtgui
    dist = tfd.TruncatedNormal(mean, std, low, high)
    dist = tfd.Independent(dist, len(data_shape))
  elif dist == 'tanh_normal':
    # https://www.desmos.com/calculator/794s8kf0es
    dist = distributions.TanhNormal(mean, std)
  elif dist == 'tanh_normal_tanh':
    # https://www.desmos.com/calculator/794s8kf0es
    mean = 5.0 * tf.tanh(mean / 5.0)
    dist = distributions.TanhNormal(mean, std)
  elif dist == 'onehot_score':
    dist = distributions.OneHot(mean, gradient='score')
  elif dist == 'onehot_straight':
    dist = distributions.OneHot(mean, gradient='straight')
  else:
    raise NotImplementedError(dist)
  return dist
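
# Usage sketch (hedged): heads such as reward and value are typically built by
# partially applying feed_forward and letting define_model fill in data_shape;
# the arguments below are illustrative only.
#
#   reward_head = functools.partial(feed_forward, num_layers=2, dist='normal')
#   reward_dist = reward_head(features, data_shape=[])  # features: [B, T, F]
#   log_prob = reward_dist.log_prob(target['reward'])   # shape [B, T]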
def decoder(features, data_shape, std=1.0):
  kwargs = dict(strides=2, activation=tf.nn.relu)
  hidden = tf.layers.dense(features, 1024, None)
  hidden = tf.reshape(hidden, [-1, 1, 1, hidden.shape[-1].value])
  hidden = tf.layers.conv2d_transpose(hidden, 128, 5, **kwargs)
  hidden = tf.layers.conv2d_transpose(hidden, 64, 5, **kwargs)
  hidden = tf.layers.conv2d_transpose(hidden, 32, 6, **kwargs)
  mean = tf.layers.conv2d_transpose(hidden, data_shape[-1], 6, strides=2)
  assert mean.shape[1:].as_list() == data_shape, mean.shape
  mean = tf.reshape(mean, tools.shape(features)[:-1] + data_shape)
  return tfd.Independent(tfd.Normal(mean, std), len(data_shape))
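
# Shape note (hedged, assuming 64x64 RGB frames): the transposed-conv stack maps
# the 1x1x1024 projection to 5x5, 13x13, 30x30 and finally 64x64xC, so calling
# decoder(features, [64, 64, 3]).log_prob(target['image']) gives a [batch, time]
# reconstruction log-likelihood, as used by the 'image' head in
# compute_objectives below.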
def cross_entropy_method(
    cell, objective, state, obs_shape, action_shape, horizon, graph,
    beams=1000, topk=100, iterations=10, min_action=-1, max_action=1):
  obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
  batch = tools.shape(tools.nested.flatten(state)[0])[0]
  initial_state = tools.nested.map(lambda tensor: tf.tile(
      tensor, [beams] + [1] * (tensor.shape.ndims - 1)), state)
  extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
  use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
  obs = tf.zeros((extended_batch, horizon) + obs_shape)

  def iteration(index, mean, stddev):
    # Sample action proposals from belief.
    normal = tf.random_normal((batch, beams, horizon) + action_shape)
    action = normal * stddev[:, None] + mean[:, None]
    action = tf.clip_by_value(action, min_action, max_action)
    # Evaluate proposal actions.
    action = tf.reshape(action, (extended_batch, horizon) + action_shape)
    (_, state), _ = tf.nn.dynamic_rnn(
        cell, (0 * obs, action, use_obs), initial_state=initial_state)
    return_ = objective(state)
    return_ = tf.reshape(return_, (batch, beams))
    # Re-fit belief to the best ones.
    _, indices = tf.nn.top_k(return_, topk, sorted=False)
    indices += tf.range(batch)[:, None] * beams
    best_actions = tf.gather(action, indices)
    mean, variance = tf.nn.moments(best_actions, 1)
    stddev = tf.sqrt(variance + 1e-6)
    return index + 1, mean, stddev

  mean = tf.zeros((batch, horizon) + action_shape)
  stddev = tf.ones((batch, horizon) + action_shape)
  _, mean, std = tf.while_loop(
      lambda index, mean, stddev: index < iterations, iteration,
      (0, mean, stddev), back_prop=False)
  return mean
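
# Usage sketch (hedged): the CEM planner is typically exposed through
# functools.partial so that collection code only supplies the model state and
# an objective; the horizon below is illustrative.
#
#   planner = functools.partial(cross_entropy_method, horizon=12)
#   plan = planner(cell, objective_fn, state, obs_shape, action_shape,
#                  graph=graph)  # -> [batch, horizon] + action_shape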
def encoder(obs, encoder_feature_shape):
  sh = 128 if encoder_feature_shape == 512 else 256
  kwargs = dict(strides=2, activation=tf.nn.relu)
  hidden = tf.reshape(obs['image'], [-1] + obs['image'].shape[2:].as_list())
  hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs)
  hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs)
  hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs)
  hidden = tf.layers.conv2d(hidden, sh, 4, **kwargs)
  hidden = tf.layers.flatten(hidden)
  if encoder_feature_shape != 512:
    assert hidden.shape[1:].as_list() == [1024], hidden.shape.as_list()
  else:
    assert hidden.shape[1:].as_list() == [512], hidden.shape.as_list()
  hidden = tf.reshape(hidden, tools.shape(obs['image'])[:2] + [
      np.prod(hidden.shape[1:].as_list())])
  return hidden
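
# Shape check (hedged, assuming 64x64 input frames): four stride-2 valid convs
# with kernel 4 reduce 64 -> 31 -> 14 -> 6 -> 2, so flattening yields
# 2 * 2 * 256 = 1024 features (or 2 * 2 * 128 = 512 when
# encoder_feature_shape == 512), which is what the asserts above verify.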
def action_head_policy(
    cell, objective, state, obs_shape, action_shape, graph, config,
    strategy, min_action=-1, max_action=1):
  features = cell.features_from_state(state)
  policy = graph.heads.action(features)
  if strategy == 'sample':
    action = policy.sample()
  elif strategy == 'mode':
    action = policy.mode()
  elif strategy == 'curious_sample':
    curious_policy = graph.heads.curious_action(features)
    action = curious_policy.sample()
  elif strategy == 'random_sample':
    batch = tools.shape(tools.nested.flatten(features)[0])[0]
    mean = tf.zeros((batch, action_shape[0]))
    stddev = tf.ones((batch, action_shape[0]))
    normal = tf.random_normal((batch, action_shape[0]))
    action = normal * stddev + mean
    action = tf.clip_by_value(action, min_action, max_action)
  else:
    raise NotImplementedError(strategy)
  plan = action[:, None, :]
  return plan
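
# Note (hedged): compute_objectives below relies on an `Objective` tuple defined
# elsewhere in this module; from its usage it has roughly this form, where
# `goal` is the builtin min or max and include/exclude are variable-scope
# regexes later consumed by apply_optimizers:
#
#   Objective = collections.namedtuple(
#       'Objective', 'name, value, goal, include, exclude')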
def compute_objectives(posterior, prior, target, graph, config):
  raw_features = graph.cell.features_from_state(posterior)
  heads = graph.heads
  # Indices for bootstrap-sampling the batch when training the ensemble.
  sample_with_replacement = None
  if (config.curious_run and config.bootstrap) or (
      config.combination_run and config.bootstrap):
    bagging_size = int(1 * config.batch_shape[0])
    sample_with_replacement = tf.random.uniform(
        [config.num_models, bagging_size], minval=0,
        maxval=config.batch_shape[0], dtype=tf.dtypes.int32)
  # Imagined rollouts shared between the value and action losses.
  if config.imagination_horizon:
    imagination_start = posterior
    if config.imagination_skip_last:
      imagination_start = tools.nested.map(
          lambda x: x[:, :-config.imagination_skip_last], imagination_start)
    if config.curious_run or config.vanilla_curious_run:
      curious_raw_states, curious_actions = imagine_forward(
          imagination_start, config.exploration_imagination_horizon, graph,
          config, graph.heads.curious_action, stop_grad_post_action=False,
          stop_grad_pre_action=config.stop_grad_pre_action,
          return_actions=True)
    raw_states, raw_actions = imagine_forward(
        imagination_start, config.imagination_horizon, graph, config,
        graph.heads.action, stop_grad_post_action=False,
        stop_grad_pre_action=config.stop_grad_pre_action,
        return_actions=True)
  else:
    raw_states = None
  objectives = []
  for name, scale in sorted(config.loss_scales.items(), key=lambda x: x[0]):
    if config.loss_scales[name] == 0.0:
      continue
    if name in config.heads and name not in config.gradient_heads:
      features = tf.stop_gradient(raw_features)
      include = r'.*/head_{}/.*'.format(name)
      exclude = None
    else:
      features = raw_features
      include = None
      exclude = None
    if name == 'divergence':
      loss = graph.cell.divergence_from_states(posterior, prior)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(Objective('divergence', loss, min, include, exclude))
    elif name == 'cpc':
      pred = heads.cpc(graph.embedded)
      objective = compute_cpc_loss(pred, features, config)
      objectives.append(Objective('cpc', objective, max, include, exclude))
    elif name == 'overshooting':
      shape = tools.shape(graph.data['action'])
      length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
      _, priors, posteriors, mask = tools.overshooting(
          graph.cell, {}, graph.embedded, graph.data['action'], length,
          config.overshooting_distance, posterior)
      posteriors, priors, mask = tools.nested.map(
          lambda x: x[:, :, 1:-1], (posteriors, priors, mask))
      if config.os_stop_posterior_grad:
        posteriors = tools.nested.map(tf.stop_gradient, posteriors)
      loss = graph.cell.divergence_from_states(posteriors, priors)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(
          Objective('overshooting', loss, min, include, exclude))
    elif name == 'value':
      if config.value_source == 'dataset':
        loss = compute_value_loss(
            config, graph, priors, features, target['reward'])
      elif config.value_source == 'model':
        if 'action_target' in graph.heads or not config.imagination_horizon:
          if 'action_target' in graph.heads:
            policy = graph.heads.action_target
          else:
            policy = graph.heads.action
          states = imagine_forward(
              posterior, config.value_model_horizon, graph, config, policy)
        else:
          states = raw_states
        feat = graph.cell.features_from_state(states)
        if config.combination_run:
          loss = compute_curious_value_loss(
              config, graph, states, feat, None, actions=raw_actions)
        else:
          loss = compute_value_loss(config, graph, states, feat, None)
      else:
        raise NotImplementedError(config.value_source)
      objectives.append(Objective('value', loss, min, include, exclude))
    elif name == 'curious_value':
      states = curious_raw_states
      feat = graph.cell.features_from_state(states)
      loss = compute_curious_value_loss(
          config, graph, states, feat, None, actions=curious_actions)
      objectives.append(
          Objective('curious_value', loss, min, include, exclude))
    elif name == 'action':
      if config.action_source == 'model':
        if not config.imagination_horizon:
          states = imagine_forward(
              posterior, config.action_model_horizon, graph, config,
              policy=graph.heads.action, stop_grad_post_action=False)
        else:
          states = raw_states
        feat = graph.cell.features_from_state(states)
        if config.combination_run:
          objective = compute_curious_action_values(
              config, graph, states, feat, actions=raw_actions)
        else:
          objective = compute_action_values(config, graph, states, feat)
        objectives.append(
            Objective('action', objective, max, include, exclude))
      elif config.action_source == 'dataset':
        objective = heads.action(features).log_prob(target[name])
        objective -= compute_action_divergence(features, graph, config)
        objectives.append(
            Objective('action', objective, max, include, exclude))
      else:
        raise NotImplementedError(config.action_source)
    elif name == 'curious_action':
      states = curious_raw_states
      feat = graph.cell.features_from_state(states)
      objective = compute_curious_action_values(
          config, graph, states, feat, actions=curious_actions)
      objectives.append(
          Objective('curious_action', objective, max, include, exclude))
    elif name == 'reward':
      # Optionally freeze the features feeding the extrinsic reward head
      # during exploration; the three run modes share the same logic.
      freeze = config.freeze_extrinsic_heads and (
          config.curious_run or config.random_run or
          config.vanilla_curious_run)
      if freeze:
        if config.adaptation:
          features = tf.cond(
              tf.logical_and(
                  tf.equal(graph.phase, 'train'),
                  graph.global_step < tf.cast(
                      config.adaptation_step, tf.int64)),
              lambda: tf.stop_gradient(features),
              lambda: features)
        else:
          features = tf.stop_gradient(features)
      if config.combination_run:
        intrinsic_target = compute_intrinsic_reward(
            config, graph, posterior, target['action'], features)
        final_target = tf.math.scalar_mul(
            config.extrinsic_coeff, target[name]) + tf.math.scalar_mul(
            config.intrinsic_coeff, intrinsic_target)
      else:
        final_target = target[name]
      reward_mask = tf.squeeze(target['reward_mask'], [-1])
      logprob = heads.reward(features).log_prob(final_target) * reward_mask
      objectives.append(Objective('reward', logprob, max, include, exclude))
    elif name == 'pcont' and config.pcont_label_weight:
      terminal = tf.cast(tf.less(target[name], 0.5), tf.float32)
      logprob = heads[name](features).log_prob(target[name])
      logprob *= 1 + terminal * (config.pcont_label_weight - 1)
      objectives.append(Objective(name, logprob, max, include, exclude))
    elif 'one_step_model' in name:
      # Train ensemble member `mdl` to predict the target chosen by the
      # configured ensemble model type.
      mdl = int(name[-1])
      model_types = {
          'modeltype_1': modeltype_1, 'modeltype_2': modeltype_2,
          'modeltype_3': modeltype_3, 'modeltype_4': modeltype_4}
      action, target_prediction, input_state = model_types[
          'modeltype_' + str(config.ensemble_model_type)](
              config, graph, target, sample_with_replacement, prior,
              posterior, mdl)
      prediction = graph.one_step_models[mdl](input_state, action).mean()
      loss = tf.reduce_mean(
          (prediction - tf.stop_gradient(target_prediction)) ** 2, -1)
      loss *= config.ensemble_loss_scale
      objectives.append(Objective(
          'one_step_model_' + str(mdl), loss, min, include, exclude))
    else:
      if name == 'reward_int':
        reconstruction_loss = heads['image'](features).log_prob(
            target['image'])
        full_model_loss = reconstruction_loss - tf.maximum(
            0.0,
            graph.cell.divergence_from_states(posterior, prior) - float(3.0))
        intrinsic_target = tf.stop_gradient(-full_model_loss)
        intrinsic_target = tf.math.multiply(intrinsic_target, 1e-3)
        logprob = heads[name](features).log_prob(intrinsic_target)
      else:
        logprob = heads[name](features).log_prob(target[name])
      objectives.append(Objective(name, logprob, max, include, exclude))
  objectives = [o._replace(value=tf.reduce_mean(o.value)) for o in objectives]
  return objectives
def define_model(logdir, metrics, data, trainer, config):
  print('Build TensorFlow compute graph.')
  dependencies = []
  cleanups = []
  step = trainer.step
  global_step = trainer.global_step
  phase = trainer.phase
  timestamp = tf.py_func(
      lambda: datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%S'),
      [], tf.string)
  dependencies.append(metrics.set_tags(
      global_step=global_step, step=step, phase=phase, time=timestamp))

  # Instantiate network blocks. Note, this initialization would be expensive
  # when using tf.function since it would run at every step.
  try:
    cell = config.cell()
  except TypeError:
    cell = config.cell(action_size=data['action'].shape[-1].value)
  one_step_models = []
  kwargs = dict(create_scope_now_=True)
  kwargs['encoder_feature_shape'] = config.encoder_feature_shape
  encoder = tf.make_template('encoder', config.encoder, **kwargs)
  heads = tools.AttrDict(_unlocked=True)
  raw_dummy_features = cell.features_from_state(
      cell.zero_state(1, tf.float32))[:, None]
  for key, head in config.heads.items():
    name = 'head_{}'.format(key)
    kwargs = dict(create_scope_now_=True)
    if key in data:
      kwargs['data_shape'] = data[key].shape[2:].as_list()
    if key == 'reward_int':
      kwargs['data_shape'] = data['reward'].shape[2:].as_list()
    if key == 'action_target':
      kwargs['data_shape'] = data['action'].shape[2:].as_list()
    if key == 'curious_action':
      kwargs['data_shape'] = data['action'].shape[2:].as_list()
    if key == 'cpc':
      kwargs['data_shape'] = [cell.feature_size]
      dummy_features = encoder(data)[:1, :1]
    else:
      dummy_features = raw_dummy_features
    heads[key] = tf.make_template(name, head, **kwargs)
    heads[key](dummy_features)  # Initialize weights.
  if config.curious_run or config.combination_run:
    for mdl in range(config.num_models):
      with tf.variable_scope('one_step_model_' + str(mdl)):
        name = 'one_step_model_' + str(mdl)
        kwargs = dict(create_scope_now_=True)
        kwargs['max_objective'] = config.use_max_objective
        if config.ensemble_model_type == 1:
          kwargs['data_shape'] = [config.encoder_feature_shape]
        elif config.ensemble_model_type == 2:
          kwargs['data_shape'] = [
              tools.shape(cell.zero_state(1, tf.float32)['belief'])[-1]]
        elif config.ensemble_model_type == 3:
          kwargs['data_shape'] = [
              tools.shape(cell.zero_state(1, tf.float32)['sample'])[-1]]
        elif config.ensemble_model_type == 4:
          kwargs['data_shape'] = [tools.shape(dummy_features)[-1]]
        kwargs['model_width_factor'] = config.model_width_factor
        one_step_models.append(
            tf.make_template(name, config.one_step_model, **kwargs))

  # Update target networks.
  if 'value_target' in heads:
    dependencies.append(tools.track_network(
        trainer, config.batch_shape[0],
        r'.*/head_value/.*', r'.*/head_value_target/.*',
        config.value_target_period, config.value_target_update))
  if 'value_target_2' in heads:
    dependencies.append(tools.track_network(
        trainer, config.batch_shape[0],
        r'.*/head_value/.*', r'.*/head_value_target_2/.*',
        config.value_target_period, config.value_target_update))
  if 'action_target' in heads:
    dependencies.append(tools.track_network(
        trainer, config.batch_shape[0],
        r'.*/head_action/.*', r'.*/head_action_target/.*',
        config.action_target_period, config.action_target_update))

  # Apply and optimize model.
  embedded = encoder(data)
  with tf.control_dependencies(dependencies):
    embedded = tf.identity(embedded)
  graph = tools.AttrDict(locals())
  prior, posterior = tools.unroll.closed_loop(
      cell, embedded, data['action'], config.debug)
  objectives = utility.compute_objectives(
      posterior, prior, data, graph, config)
  summaries, grad_norms = utility.apply_optimizers(
      objectives, trainer, config)
  dependencies += summaries

  # Active data collection.
  with tf.variable_scope('collection'):
    with tf.control_dependencies(dependencies):  # Make sure to train first.
      for name, params in config.train_collects.items():
        schedule = tools.schedule.binary(
            step, config.batch_shape[0], params.steps_after,
            params.steps_every, params.steps_until)
        summary, _ = tf.cond(
            tf.logical_and(tf.equal(trainer.phase, 'train'), schedule),
            functools.partial(
                utility.simulate, metrics, config, params, graph, cleanups,
                gif_summary=False, name=name),
            lambda: (tf.constant(''), tf.constant(0.0)),
            name='should_collect_' + name)
        summaries.append(summary)
        dependencies.append(summary)

  # Compute summaries.
  graph = tools.AttrDict(locals())
  summary, score = tf.cond(
      trainer.log,
      lambda: define_summaries.define_summaries(graph, config, cleanups),
      lambda: (tf.constant(''), tf.zeros((0,), tf.float32)),
      name='summaries')
  summaries = tf.summary.merge([summaries, summary])
  dependencies.append(utility.print_metrics(
      {ob.name: ob.value for ob in objectives},
      step, config.print_metrics_every, 2, 'objectives'))
  dependencies.append(utility.print_metrics(
      grad_norms, step, config.print_metrics_every, 2, 'grad_norms'))
  dependencies.append(tf.cond(trainer.log, metrics.flush, tf.no_op))
  with tf.control_dependencies(dependencies):
    score = tf.identity(score)
  return score, summaries, cleanups