def feed_forward(
    state, data_shape, num_layers=2, activation=tf.nn.relu,
    mean_activation=None, stop_gradient=False, trainable=True, units=100,
    std=1.0, low=-1.0, high=1.0, dist='normal'):
  """Create a dense model returning a distribution of the requested family."""
  hidden = state
  if stop_gradient:
    hidden = tf.stop_gradient(hidden)
  for _ in range(num_layers):
    hidden = tf.compat.v1.layers.dense(hidden, units, activation)
  mean = tf.compat.v1.layers.dense(
      hidden, int(np.prod(data_shape)), mean_activation, trainable=trainable)
  mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
  if std == 'learned':
    std = tf.compat.v1.layers.dense(
        hidden, int(np.prod(data_shape)), None, trainable=trainable)
    std = tf.nn.softplus(std + 0.55) + 0.01
    std = tf.reshape(std, tools.shape(state)[:-1] + data_shape)
  if dist == 'normal':
    dist = tfd.Normal(mean, std)
  elif dist == 'truncated_normal':
    # https://www.desmos.com/calculator/3o96eyqxib
    dist = tfd.TruncatedNormal(mean, std, low, high)
  elif dist == 'tanh_normal':
    # https://www.desmos.com/calculator/sxpp7ectjv
    dist = tfd.Normal(mean, std)
    dist = tfd.TransformedDistribution(dist, tfp.bijectors.Tanh())
  elif dist == 'deterministic':
    dist = tfd.Deterministic(mean)
  else:
    raise NotImplementedError(dist)
  dist = tfd.Independent(dist, len(data_shape))
  return dist
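# Usage sketch (illustrative only; the feature tensor and shapes below are
# made up, and `tools`/`tfd` come from the surrounding codebase): building a
# scalar reward head over latent features and scoring targets under it.
#
#   features = tf.zeros((8, 10, 230))  # (batch, time, feature)
#   reward_dist = feed_forward(
#       features, data_shape=[], std='learned', dist='normal')
#   log_prob = reward_dist.log_prob(tf.zeros((8, 10)))  # (batch, time)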
def cross_entropy_method(
    cell, objective_fn, state, obs_shape, action_shape, horizon, graph,
    amount=1000, topk=100, iterations=10, min_action=-1, max_action=1):
  num_models = graph.config.num_models
  obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
  original_batch = tools.shape(tools.nested.flatten(state[0])[0])[0]
  initial_state = []
  for mdl in range(num_models):
    initial_state.append(tools.nested.map(
        lambda tensor: tf.tile(tensor, [amount] + [1] * (tensor.shape.ndims - 1)),
        state[mdl]))
  extended_batch = tools.shape(tools.nested.flatten(initial_state[0])[0])[0]
  use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
  obs = tf.zeros((extended_batch, horizon) + obs_shape)

  def iteration(mean_and_stddev, _):
    all_model_states = []
    mean, stddev = mean_and_stddev
    # Sample action proposals from belief.
    normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
    action = normal * stddev[:, None] + mean[:, None]
    action = tf.clip_by_value(action, min_action, max_action)
    # Evaluate proposal actions under every model in the ensemble.
    action = tf.reshape(action, (extended_batch, horizon) + action_shape)
    for mdl in range(num_models):
      (_, state), _ = tf.nn.dynamic_rnn(
          cell[mdl], (0 * obs, action, use_obs),
          initial_state=initial_state[mdl])
      all_model_states.append(state)
    return_ = objective_fn(all_model_states)
    return_ = tf.reshape(return_, (original_batch, amount))
    # Re-fit belief to the best ones.
    _, indices = tf.nn.top_k(return_, topk, sorted=False)
    indices += tf.range(original_batch)[:, None] * amount
    best_actions = tf.gather(action, indices)
    mean, variance = tf.nn.moments(best_actions, 1)
    stddev = tf.sqrt(variance + 1e-6)
    return mean, stddev

  mean = tf.zeros((original_batch, horizon) + action_shape)
  stddev = tf.ones((original_batch, horizon) + action_shape)
  if iterations < 1:
    return mean
  mean, stddev = tf.scan(
      iteration, tf.range(iterations), (mean, stddev), back_prop=False)
  mean, stddev = mean[-1], stddev[-1]  # Select belief at last iteration.
  return mean
def cross_entropy_method(
    cell, objective_fn, state, obs_shape, action_shape, horizon, amount=1000,
    topk=100, iterations=10, discount=0.99, min_action=-1, max_action=1):
  obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
  original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
  initial_state = tools.nested.map(
      lambda tensor: tf.tile(tensor, [amount] + [1] * (tensor.shape.ndims - 1)),
      state)
  extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
  use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
  obs = tf.zeros((extended_batch, horizon) + obs_shape)
  length = tf.ones([extended_batch], dtype=tf.int32) * horizon

  def iteration(mean_and_stddev, _):
    mean, stddev = mean_and_stddev
    # Sample action proposals from belief.
    normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
    action = normal * stddev[:, None] + mean[:, None]
    action = tf.clip_by_value(action, min_action, max_action)
    # Evaluate proposal actions.
    action = tf.reshape(action, (extended_batch, horizon) + action_shape)
    (_, state), _ = tf.nn.dynamic_rnn(
        cell, (0 * obs, action, use_obs), initial_state=initial_state)
    reward = objective_fn(state)
    return_ = discounted_return.discounted_return(
        reward, length, discount)[:, 0]
    return_ = tf.reshape(return_, (original_batch, amount))
    # Re-fit belief to the best ones.
    _, indices = tf.nn.top_k(return_, topk, sorted=False)
    indices += tf.range(original_batch)[:, None] * amount
    best_actions = tf.gather(action, indices)
    mean, variance = tf.nn.moments(best_actions, 1)
    stddev = tf.sqrt(variance + 1e-6)
    return mean, stddev

  mean = tf.zeros((original_batch, horizon) + action_shape)
  stddev = tf.ones((original_batch, horizon) + action_shape)
  mean, stddev = tf.scan(
      iteration, tf.range(iterations), (mean, stddev), back_prop=False)
  mean, stddev = mean[-1], stddev[-1]  # Select belief at last iteration.
  return mean
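# The planner above is the cross entropy method (CEM): sample action sequences
# from a Gaussian belief, score them, and re-fit the belief to the top-k. A
# minimal NumPy analogue for a toy objective (illustrative only; these names
# are not from the repo):
import numpy as np

def cem_sketch(score_fn, horizon, amount=1000, topk=100, iterations=10):
  mean, stddev = np.zeros(horizon), np.ones(horizon)
  for _ in range(iterations):
    actions = np.random.randn(amount, horizon) * stddev + mean  # sample belief
    actions = np.clip(actions, -1, 1)
    best = actions[np.argsort(score_fn(actions))[-topk:]]  # keep top-k
    mean, stddev = best.mean(0), best.std(0) + 1e-6  # re-fit belief
  return mean

# e.g. cem_sketch(lambda a: -np.abs(a - 0.5).sum(axis=1), horizon=12)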
def encoder(obs, embedding_size=1024):
  """Extract deterministic features from an observation."""
  kwargs = dict(strides=2, activation=tf.nn.relu)
  obs_is_dict = isinstance(obs, dict)
  if obs_is_dict:
    hidden = tf.reshape(obs['image'], [-1] + obs['image'].shape[2:].as_list())
  else:
    hidden = obs
  img_size = hidden.shape[2:].as_list()[0]
  if img_size == 64:
    hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs)
    hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs)
    hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs)
    hidden = tf.layers.conv2d(hidden, 256, 4, **kwargs)
  hidden = tf.layers.flatten(hidden)
  assert hidden.shape[1:].as_list() == [1024], hidden.shape.as_list()
  if embedding_size != 1024:
    hidden = tf.layers.dense(hidden, units=embedding_size)
  if obs_is_dict:
    hidden = tf.reshape(
        hidden,
        tools.shape(obs['image'])[:2] + [np.prod(hidden.shape[1:].as_list())])
  return hidden
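# Sanity check for the [1024] assert above: with VALID padding each conv maps
# size s to (s - kernel) // stride + 1, so a 64x64 input shrinks
# 64 -> 31 -> 14 -> 6 -> 2 and flattens to 2 * 2 * 256 = 1024. A quick worked
# check (illustrative helper, not part of the repo):
def conv_out_size(size, kernel, stride):
  return (size - kernel) // stride + 1

size = 64
for kernel in (4, 4, 4, 4):
  size = conv_out_size(size, kernel, 2)
assert size * size * 256 == 1024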
def encoder(obs):
  """Extract deterministic features from an observation."""
  # `obs_size` is a module-level configuration tuple, e.g. (64, 64).
  kwargs2 = dict(strides=2, activation=tf.nn.relu)
  kwargs1 = dict(strides=1, activation=tf.nn.relu)
  kwargs = dict(strides=2, activation=tf.nn.elu)
  # e.g. (50, 50, 64, 64, 3) reshaped to (2500, 64, 64, 3).
  hidden = tf.reshape(obs['image'], [-1] + obs['image'].shape[2:].as_list())
  if obs_size == (32, 32):
    hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs1)
    hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 256, 4, **kwargs1)
  elif obs_size == (64, 64):
    hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 256, 4, **kwargs2)
  elif obs_size == (96, 96):
    hidden = tf.layers.conv2d(hidden, 32, 8, **kwargs)
    hidden = tf.layers.conv2d(hidden, 64, 5, **kwargs)
    hidden = tf.layers.conv2d(hidden, 72, 5, **kwargs)
    hidden = tf.layers.conv2d(hidden, 128, 5, **kwargs)
    hidden = tf.layers.conv2d(hidden, 1024, 3, strides=1)
  elif obs_size == (128, 128):
    hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 32, 3, **kwargs1)
    hidden = tf.layers.conv2d(hidden, 64, 3, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 64, 3, **kwargs1)
    hidden = tf.layers.conv2d(hidden, 128, 3, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 128, 3, **kwargs1)
    hidden = tf.layers.conv2d(hidden, 256, 3, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 256, 3, **kwargs2)
  hidden = tf.layers.flatten(hidden)
  assert hidden.shape[1:].as_list() == [1024], hidden.shape.as_list()
  hidden = tf.reshape(
      hidden,
      tools.shape(obs['image'])[:2] + [np.prod(hidden.shape[1:].as_list())])
  return hidden  # shape (50, 50, 1024)
def sample_pair(batch):
  num_sam = tools.shape(batch)[0]
  index = tf.range(num_sam)
  tgt1 = tf.slice(batch, [0, 1], [num_sam, 1])
  pred1 = tf.slice(batch, [0, 0], [num_sam, 1])

  def uniform():
    batch2 = tf.gather(batch, tf.random.shuffle(index))
    pred2 = tf.slice(batch2, [0, 0], [num_sam, 1])
    tgt2 = tf.slice(batch2, [0, 1], [num_sam, 1])
    return pred1, pred2, tgt1, tgt2

  return uniform
def compute_error_loss(pred1, pred2, tgt1, tgt2, hard_ratio=1.0):
  geq = tf.cast((tgt1 - tgt2) > 0, tf.bool)
  tgt_larg = tf.where(geq, tgt1, tgt2)
  tgt_small = tf.where(geq, tgt2, tgt1)
  pred_larg = tf.where(geq, pred1, pred2)
  pred_small = tf.where(geq, pred2, pred1)
  loss = tf.maximum(0., (tgt_larg - tgt_small) - (pred_larg - pred_small))
  if hard_ratio < 1.0:
    hard_num = tf.cast(tools.shape(pred1)[0] * hard_ratio, tf.int32)
    loss = tf.reshape(loss, [-1])
    hard_loss, _ = tf.math.top_k(loss, k=hard_num)
    return hard_loss
  return loss
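# Worked example (illustrative, NumPy): the pair loss above is a hinge on
# preserved ordering -- when tgt1 > tgt2, the predictions must be at least as
# far apart in the same direction, otherwise the shortfall is the loss.
#
#   import numpy as np
#   pred1, pred2 = np.array([1.0]), np.array([0.2])
#   tgt1, tgt2 = np.array([2.0]), np.array([0.0])
#   np.maximum(0., (tgt1 - tgt2) - (pred1 - pred2))  # -> 1.2
#
# The loss reaches zero once the predicted gap matches or exceeds the target gap.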
def feed_forward(
    state, data_shape, num_layers=3, activation=None, cut_gradient=False):
  """Create a model returning unnormalized MSE distribution."""
  hidden = state
  if cut_gradient:
    hidden = tf.stop_gradient(hidden)
  for _ in range(num_layers):
    # e.g. state: shape (40, 50, 1, 230) -> hidden: shape (40, 50, 1, 100)
    hidden = tf.layers.dense(hidden, 100, tf.nn.relu)
  # e.g. -> mean: shape (40, 50, 1, 1)
  mean = tf.layers.dense(hidden, int(np.prod(data_shape)), activation)
  mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
  dist = tools.MSEDistribution(mean)
  dist = tfd.Independent(dist, len(data_shape))
  return dist
def cross_entropy_method_dual2(
    cell, objective_fn, state, all_actions, all_reward, obs_shape,
    action_shape, horizon, graph, logdir, task, amount=1000, topk=100,
    iterations=10, min_action=-1, max_action=1, eval_ratio=0.05,
    env_state=None):
  obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
  original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
  initial_state = tools.nested.map(
      lambda tensor: tf.tile(tensor, [amount] + [1] * (tensor.shape.ndims - 1)),
      state)
  extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
  use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
  obs = tf.zeros((extended_batch, horizon) + obs_shape)

  def iteration(rival_actions):
    rival_actions = tf.squeeze(rival_actions)
    (_, state), _ = tf.nn.dynamic_rnn(
        cell, (0 * obs, rival_actions, use_obs), initial_state=initial_state)
    return_ = objective_fn(state)
    return_ = tf.reshape(return_, (original_batch, amount, 1))
    return return_

  all_pred = tf.map_fn(iteration, all_actions, parallel_iterations=1)
  triple = tf.concat([all_reward, all_pred], 3)
  return triple
def decoder(state, data_shape):
  """Compute the data distribution of an observation from its state."""
  hidden = tf.layers.dense(state, 500, tf.nn.relu)
  hidden = tf.layers.dense(hidden, 500, tf.nn.relu)
  hidden = tf.layers.dense(hidden, 26, None)
  mean = hidden
  mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
  dist = tools.MSEDistribution(mean)
  dist = tfd.Independent(dist, len(data_shape))
  return dist
def encoder(obs):
  """Extract deterministic features from an observation."""
  kwargs = dict(strides=2, activation=tf.nn.relu)
  hidden = tf.reshape(obs['image'], [-1] + obs['image'].shape[2:].as_list())
  hidden = tf.compat.v1.layers.conv2d(hidden, 32, 4, **kwargs)
  hidden = tf.compat.v1.layers.conv2d(hidden, 64, 4, **kwargs)
  hidden = tf.compat.v1.layers.conv2d(hidden, 128, 4, **kwargs)
  hidden = tf.compat.v1.layers.conv2d(hidden, 256, 4, **kwargs)
  hidden = tf.compat.v1.layers.flatten(hidden)
  assert hidden.shape[1:].as_list() == [1024], hidden.shape.as_list()
  hidden = tf.reshape(
      hidden,
      tools.shape(obs['image'])[:2] + [np.prod(hidden.shape[1:].as_list())])
  return hidden
def contra_step_lossV5(pred, tgt, resample=1):
  pred_flat = tf.reshape(pred, [-1])
  tgt_flat = tf.reshape(tgt, [-1])
  batch = tf.stack([pred_flat, tgt_flat], 1)
  num_sam = tools.shape(batch)[0]
  index = tf.range(num_sam)
  divider = tf.constant(resample, dtype=tf.float32)

  def sample_compute(cur_loss, i):
    # Draw two independent shuffles of the (pred, tgt) pairs and accumulate
    # the pairwise contrastive loss between them.
    batch1 = tf.gather(batch, tf.random.shuffle(index))
    batch2 = tf.gather(batch, tf.random.shuffle(index))
    pred1 = tf.slice(batch1, [0, 0], [num_sam, 1])
    pred2 = tf.slice(batch2, [0, 0], [num_sam, 1])
    tgt1 = tf.slice(batch1, [0, 1], [num_sam, 1])
    tgt2 = tf.slice(batch2, [0, 1], [num_sam, 1])
    loss = cur_loss + compute_contra_loss(pred1, pred2, tgt1, tgt2)
    return loss, i + 1

  i = tf.constant(0)
  loss = tf.constant(0.)
  final_loss = tf.while_loop(
      lambda l, i: i < resample, sample_compute, [loss, i])[0]
  avg_loss = tf.reduce_mean(final_loss) / divider
  return avg_loss
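# `compute_contra_loss` is not defined in this excerpt. Given the identical
# call signature, it is presumably a pairwise ordering hinge along the lines
# of `compute_error_loss` above; a sketch under that assumption (hypothetical,
# not the repo's implementation):
def compute_contra_loss_sketch(pred1, pred2, tgt1, tgt2):
  geq = tf.cast((tgt1 - tgt2) > 0, tf.bool)
  tgt_larg = tf.where(geq, tgt1, tgt2)
  tgt_small = tf.where(geq, tgt2, tgt1)
  pred_larg = tf.where(geq, pred1, pred2)
  pred_small = tf.where(geq, pred2, pred1)
  # Penalize pairs whose predicted gap understates the target gap.
  return tf.maximum(0., (tgt_larg - tgt_small) - (pred_larg - pred_small))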
def compute_objectives(posterior, prior, target, graph, config):
  raw_features = graph.cell.features_from_state(posterior)
  heads = graph.heads
  objectives = []
  for name, scale in config.loss_scales.items():
    if config.loss_scales[name] == 0.0:
      continue
    if name in config.heads and name not in config.gradient_heads:
      features = tf.stop_gradient(raw_features)
      include = r'.*/head_{}/.*'.format(name)
      exclude = None
    else:
      features = raw_features
      include = r'.*'
      exclude = None
    if name == 'divergence':
      loss = graph.cell.divergence_from_states(posterior, prior)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(Objective('divergence', loss, min, include, exclude))
    elif name == 'overshooting':
      shape = tools.shape(graph.data['action'])
      length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
      _, priors, posteriors, mask = tools.overshooting(
          graph.cell, {}, graph.embedded, graph.data['action'], length,
          config.overshooting_distance, posterior)
      posteriors, priors, mask = tools.nested.map(
          lambda x: x[:, :, 1:-1], (posteriors, priors, mask))
      if config.os_stop_posterior_grad:
        posteriors = tools.nested.map(tf.stop_gradient, posteriors)
      loss = graph.cell.divergence_from_states(posteriors, priors)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(Objective('overshooting', loss, min, include, exclude))
    else:
      if name == 'image':
        logprob = heads[name](features).log_prob(target[name][:, :, :, :, -3:])
      else:
        logprob = heads[name](features).log_prob(target[name])
      objectives.append(Objective(name, logprob, max, include, exclude))
  objectives = [o._replace(value=tf.reduce_mean(o.value)) for o in objectives]
  return objectives
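# `Objective` is used by every compute_objectives variant here but is not
# defined in this excerpt. From the call sites (the third element is `min` or
# `max`, the last two are variable-scope regexes) it appears to be a namedtuple
# along these lines (a sketch, field names inferred):
import collections

Objective = collections.namedtuple(
    'Objective', 'name, value, goal, include, exclude')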
def decoder(state, data_shape):
  """Compute the data distribution of an observation from its state."""
  kwargs = dict(strides=2, activation=tf.nn.relu)
  hidden = tf.compat.v1.layers.dense(state, 1024, None)
  hidden = tf.reshape(hidden, [-1, 1, 1, hidden.shape[-1]])
  hidden = tf.compat.v1.layers.conv2d_transpose(hidden, 128, 5, **kwargs)
  hidden = tf.compat.v1.layers.conv2d_transpose(hidden, 64, 5, **kwargs)
  hidden = tf.compat.v1.layers.conv2d_transpose(hidden, 32, 6, **kwargs)
  mean = tf.compat.v1.layers.conv2d_transpose(hidden, 3, 6, strides=2)
  assert mean.shape[1:].as_list() == [64, 64, 3], mean.shape
  mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
  dist = tfd.Normal(mean, 1.0)
  dist = tfd.Independent(dist, len(data_shape))
  return dist
def feed_forward(
    state, data_shape, num_layers=2, activation=None, cut_gradient=False):
  """Create a model returning unnormalized MSE distribution."""
  hidden = state
  if cut_gradient:
    hidden = tf.stop_gradient(hidden)
  for _ in range(num_layers):
    hidden = tf.layers.dense(hidden, 100, tf.nn.relu)
  mean = tf.layers.dense(hidden, int(np.prod(data_shape)), activation)
  mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
  dist = tfd.Normal(mean, 1.0)
  dist = tfd.Independent(dist, len(data_shape))
  return dist
def encoder(obs):
  """Extract deterministic features from an observation."""
  # `obs_size` is a module-level configuration tuple, e.g. (64, 64).
  kwargs2 = dict(strides=2, activation=tf.nn.relu)
  kwargs1 = dict(strides=1, activation=tf.nn.relu)
  # e.g. (50, 50, 64, 64, 3) reshaped to (2500, 64, 64, 3).
  hidden = tf.reshape(obs['image'], [-1] + obs['image'].shape[2:].as_list())
  if obs_size == (32, 32):
    hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs1)
    hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 256, 4, **kwargs1)
  elif obs_size == (64, 64):
    hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 256, 4, **kwargs2)
  elif obs_size == (128, 128):
    hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 32, 3, **kwargs1)
    hidden = tf.layers.conv2d(hidden, 64, 3, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 64, 3, **kwargs1)
    hidden = tf.layers.conv2d(hidden, 128, 3, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 128, 3, **kwargs1)
    hidden = tf.layers.conv2d(hidden, 256, 3, **kwargs2)
    hidden = tf.layers.conv2d(hidden, 256, 3, **kwargs2)
  hidden = tf.layers.flatten(hidden)
  assert hidden.shape[1:].as_list() == [1024], hidden.shape.as_list()
  hidden = tf.reshape(
      hidden,
      tools.shape(obs['image'])[:2] + [np.prod(hidden.shape[1:].as_list())])
  return hidden  # shape (50, 50, 1024)
def cpc(context, graph, posterior, predict_terms=3, negative_samples=5,
        hard_negative_samples=0, stack_actions=False, negative_actions=False,
        cpc_openloop=False, gradient_penalty=False, gpenalty_mode=0):
  """
  :param context: shape = (batch_size, chunk_length, context_size)
  :param embedding: shape = (batch_size, chunk_length, embedding_size)
  :param gpenalty_mode: 0 is concatenated, 1 is separate, 2 is three terms
  :return: cross entropy loss
  """
  effective_horizon = context.shape[1].value - predict_terms
  embedding = graph.embedded
  actions = graph.data['action']
  if cpc_openloop:
    shape = tools.shape(actions)
    length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
    context_to_use = get_overshoot_preds(
        graph, embedding, actions, length, predict_terms, posterior)
    # shape = N x predict_terms x sample_size
    context_to_use = merge_first_two_dim(context_to_use)
    if negative_actions:
      context_to_use = context_to_use[:, :, None]
      for _ in range(negative_samples):
        random_actions = tf.random.uniform(actions.shape, minval=-1, maxval=1)
        negative_context = get_overshoot_preds(
            graph, embedding, random_actions, length, predict_terms, posterior)
        negative_context = merge_first_two_dim(negative_context)[:, :, None]
        context_to_use = tf.concat([context_to_use, negative_context], axis=2)
  else:
    context_to_use = context[:, :-predict_terms, :]
    context_to_use = tf.reshape(
        context_to_use, [-1] + context_to_use.shape[2:].as_list())
  if stack_actions:
    # batch x effective_horizon x (predict_terms * action_dim)
    future_actions = tf.stack(
        [tf.reshape(actions[:, i:i + predict_terms],
                    (actions.shape[0].value, -1))
         for i in range(effective_horizon)], axis=1)
    assert future_actions.shape[1].value == effective_horizon
    future_actions = merge_first_two_dim(future_actions)
    future_actions = tf.layers.dense(
        future_actions, units=256, activation='relu')
    future_actions = tf.layers.dense(
        future_actions, units=256, activation='relu')
    # 30 is the size of the state space.
    future_actions = tf.layers.dense(
        future_actions, units=30, activation='linear')
    if negative_actions:
      image_context = context_to_use
      context_to_use = tf.concat(
          [image_context, future_actions], axis=-1)[:, None]
      for _ in range(negative_samples):
        current_context = tf.concat(
            [image_context,
             tf.random.uniform(future_actions.shape, minval=-1, maxval=1)],
            axis=-1)
        # shape (N x negatives x action_dim)
        context_to_use = tf.concat(
            [context_to_use, current_context[:, None]], axis=1)
    else:
      context_to_use = tf.concat([context_to_use, future_actions], axis=-1)
  reward = graph.data['reward'][:, :, None]
  x, y_true = format_cpc_data(
      context_to_use, embedding, predict_terms, negative_samples,
      num_hard_negatives=hard_negative_samples,
      negative_actions=negative_actions)
  _, reward_y_true = format_cpc_data(
      context_to_use, reward, predict_terms, negative_samples,
      negative_actions=negative_actions)
  code_size = embedding.shape[-1].value
  if cpc_openloop:
    preds = network_prediction_openloop(x, code_size, predict_terms)
    reward_preds = network_prediction_openloop(
        x, 1, predict_terms, name='reward')
  else:
    preds, kernels = network_prediction(x, code_size, predict_terms)
    reward_preds, _ = network_prediction(x, 1, predict_terms, name='reward')
  if negative_actions:
    logits = cpc_layer(y_true, preds)
    reward_logits = cpc_layer(reward_y_true, reward_preds)
  else:
    logits = cpc_layer(preds, y_true)
    reward_logits = cpc_layer(reward_preds, reward_y_true)
  labels_zero = tf.zeros(
      dtype=tf.float32, shape=(x.shape[0], predict_terms, negative_samples))
  labels_one = tf.ones(dtype=tf.float32, shape=(x.shape[0], predict_terms, 1))
  labels = tf.concat([labels_one, labels_zero], axis=-1)
  loss = cross_entropy_loss(labels, logits)
  acc = calc_acc(labels, logits)
  reward_loss = cross_entropy_loss(labels, reward_logits)
  reward_acc = calc_acc(labels, reward_logits)
  if gradient_penalty:
    gpenalty = tf.constant(0, dtype=tf.float32)
    batch_size, horizon = graph.data['reward'].shape.as_list()
    effective_horizon = horizon - predict_terms
    f = tf.reshape(
        logits[:, :, 0], shape=(batch_size, effective_horizon, predict_terms))
    s_t = x
    o_tpk = graph.data['image']
    for k in range(predict_terms):
      matrix = kernels[k]
      z_tk = merge_first_two_dim(embedding[:, 1 + k:1 + k + effective_horizon])
      wk_d_ct = tf.transpose(tf.linalg.matmul(matrix, s_t, transpose_b=True))
      wk_d_ztk = tf.transpose(
          tf.linalg.matmul(matrix, z_tk, transpose_a=True, transpose_b=True))
      grad = tf.concat([wk_d_ct, wk_d_ztk], axis=-1)
      if gpenalty_mode == 0:
        gpenalty += tf.reduce_mean(tf.reduce_sum(tf.square(wk_d_ct), axis=-1))
        gpenalty += tf.reduce_mean(tf.reduce_sum(tf.square(wk_d_ztk), axis=-1))
      elif gpenalty_mode == 1:
        gpenalty += tf.reduce_mean(tf.pow(tf.norm(grad, axis=-1) - 1, 2))
      elif gpenalty_mode == 2:
        gpenalty += tf.reduce_mean(tf.square(matrix))
      else:
        raise NotImplementedError('gpenalty mode not supported!')
    if gpenalty_mode == 2:
      gpenalty += tf.reduce_mean(tf.square(s_t))
      gpenalty += tf.reduce_mean(tf.square(embedding))
    return loss, acc, reward_loss, reward_acc, gpenalty, kernels
  return loss, acc, reward_loss, reward_acc, 0., (
      None if cpc_openloop else kernels)
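# For reference, the bilinear CPC scoring that `network_prediction` and
# `cpc_layer` appear to implement (a sketch under that assumption, not the
# repo's actual helpers): each of the `predict_terms` steps k gets a matrix
# W_k, and the logit for context c_t against code z is c_t^T W_k z; softmax
# cross entropy over {positive, negatives} then gives the InfoNCE loss.
def cpc_logits_sketch(context, codes, kernel):
  """context: [N, c], codes: [N, 1 + negatives, z], kernel: [c, z]."""
  preds = tf.matmul(context, kernel)            # [N, z]
  return tf.einsum('nz,nkz->nk', preds, codes)  # [N, 1 + negatives]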
def cross_entropy_method(
    cell, objective_fn, state, obs_shape, action_shape, horizon, amount=1000,
    topk=100, iterations=10, discount=0.99, min_action=-1, max_action=1,
    command=1):
  # `PLAN_BOND` and `PLAN_BIAS` are module-level flags.
  obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
  original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
  initial_state = tools.nested.map(
      lambda tensor: tf.tile(tensor, [amount] + [1] * (tensor.shape.ndims - 1)),
      state)
  extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
  use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
  obs = tf.zeros((extended_batch, horizon) + obs_shape)
  length = tf.ones([extended_batch], dtype=tf.int32) * horizon

  def iteration(mean_and_stddev, _):
    # mean, stddev: shape (1, 12, 2).
    mean, stddev, command = mean_and_stddev
    # Sample action proposals from belief.
    normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
    action = normal * stddev[:, None] + mean[:, None]
    action = tf.clip_by_value(action, min_action, max_action)
    # Evaluate proposal actions; action has shape (1000, 12, 2).
    action = tf.reshape(action, (extended_batch, horizon) + action_shape)
    (_, state), _ = tf.nn.dynamic_rnn(
        cell, (0 * obs, action, use_obs), initial_state=initial_state)
    reward = objective_fn(state)
    # Bonus terms that bias the planner toward the navigation command.
    bond_turn = tf.reshape(tf.reduce_sum(action[:, :, 1], axis=1), [1, 1000])
    bond_turn = tf.clip_by_value(bond_turn, -10, 10)
    bond_keep = tf.reshape(tf.reduce_sum(action[:, :, 0], axis=1), [1, 1000])
    bond_straight = (
        tf.reshape(tf.reduce_sum(action[:, :, 0], axis=1), [1, 1000]) -
        0.2 * tf.reshape(
            tf.reduce_sum(tf.abs(action[:, :, 1]), axis=1), [1, 1000]))
    bond_straight = tf.clip_by_value(bond_straight, -8, 8)
    bond_keep = tf.clip_by_value(bond_keep, -8, 8)

    def f1():
      return bond_straight  # Go-straight bonus.

    def f2():
      return bond_turn + 0.2 * bond_keep  # Right-turn bonus.

    def f3():
      return -bond_turn + 0.2 * bond_keep  # Left-turn bonus.

    def f4():
      return bond_keep  # Lane-keep bonus.

    bond = tf.case(
        {
            tf.reduce_all(tf.equal(command, 2)): f2,
            tf.reduce_all(tf.equal(command, 3)): f3,
            tf.reduce_all(tf.equal(command, 4)): f4,
        },
        default=f1, exclusive=True)
    return_ = discounted_return.discounted_return(
        reward, length, discount)[:, 0]
    return_ = tf.reshape(return_, (original_batch, amount))
    if PLAN_BOND:
      return_ += bond * 0.2
    # Re-fit belief to the best ones.
    _, indices = tf.nn.top_k(return_, topk, sorted=False)
    indices += tf.range(original_batch)[:, None] * amount
    best_actions = tf.gather(action, indices)
    mean, variance = tf.nn.moments(best_actions, 1)
    stddev = tf.sqrt(variance + 1e-6)
    return mean, stddev, command

  mean = tf.zeros((original_batch, horizon) + action_shape)

  def f1():
    x = tf.concat([mean[:, :, 0] + 0.6, mean[:, :, 1]], 0)
    return tf.expand_dims(tf.transpose(x), 0)

  def f2():
    x = tf.concat([mean[:, :, 0] + 0.3, mean[:, :, 1] + 0.3], 0)
    return tf.expand_dims(tf.transpose(x), 0)

  def f3():
    x = tf.concat([mean[:, :, 0] + 0.3, mean[:, :, 1] - 0.3], 0)
    return tf.expand_dims(tf.transpose(x), 0)

  command = tf.reshape(command, (1, -1))
  if PLAN_BIAS:
    mean = tf.case(
        {
            tf.reduce_all(tf.equal(command, 2)): f2,
            tf.reduce_all(tf.equal(command, 3)): f3,
        },
        default=f1, exclusive=True)
  stddev = tf.ones((original_batch, horizon) + action_shape)
  mean, stddev, command = tf.scan(
      iteration, tf.range(iterations),
      (mean, stddev, command * tf.ones([1, 12, 2], dtype=tf.float32)),
      back_prop=False)
  mean, stddev = mean[-1], stddev[-1]  # Select belief at last iteration.
  return mean
def cross_entropy_method(
    cell, objective_fn, state, obs_shape, action_shape, horizon, amount=1000,
    topk=100, iterations=10, discount=0.99, min_action=-1, max_action=1,
    discrete_action=False):
  # Embedded observation and action shapes without the batch dim.
  # In the Atari case `action_shape` is the number of discrete actions.
  obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
  # Flatten the state dict to get its first element, then the env batch size.
  original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
  # Tile the state batch `amount` times so each candidate starts in the same
  # (batched) env; we spawn `amount` candidates for each env in the batch.
  initial_state = tools.nested.map(
      lambda tensor: tf.tile(tensor, [amount] + [1] * (tensor.shape.ndims - 1)),
      state)
  # Again, logic to get the state's batch size, but this time the extended one.
  extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
  # No candidate uses observations at any step (open-loop simulation).
  use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)  # [batch, sequence, 1]
  obs = tf.zeros((extended_batch, horizon) + obs_shape)
  length = tf.ones([extended_batch], dtype=tf.int32) * horizon

  def iteration(mean_and_stddev, _):
    mean, stddev = mean_and_stddev
    # Sample action proposals from belief for each env, candidate, and step.
    normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
    # Action shape: (envs batch size, candidates amount, horizon) + action_shape.
    action = normal * stddev[:, None] + mean[:, None]
    # Reshape to the extended format (original_batch * amount, horizon) + action_shape.
    action = tf.reshape(action, (extended_batch, horizon) + action_shape)
    if discrete_action:
      # Normalize action scores and apply the greedy policy.
      action = tf.nn.l2_normalize(action, axis=-1)
      postproc_action = greedy(action, action_shape[0])
    else:
      # Clip actions to the valid range and keep them continuous.
      action = tf.clip_by_value(action, min_action, max_action)
      postproc_action = action
    # Evaluate proposal actions.
    (_, state), _ = tf.nn.dynamic_rnn(
        cell, (0 * obs, postproc_action, use_obs), initial_state=initial_state)
    reward = objective_fn(state)
    return_ = discounted_return.discounted_return(
        reward, length, discount)[:, 0]
    # Reshape back to (envs batch size, candidates amount) format.
    return_ = tf.reshape(return_, (original_batch, amount))
    # Indices have shape (envs batch size, topk): candidate indices per env.
    _, indices = tf.nn.top_k(return_, topk, sorted=False)
    # Offset each index so it matches `action`, whose first dim is `extended_batch`.
    indices += tf.range(original_batch)[:, None] * amount
    # best_actions has shape indices.shape + action.shape[1:], i.e.
    # (envs batch size, topk, horizon) + action_shape.
    best_actions = tf.gather(action, indices)
    # New belief from the best actions: (envs batch size, horizon) + action_shape.
    mean, variance = tf.nn.moments(best_actions, 1)
    stddev = tf.sqrt(variance + 1e-6)
    return mean, stddev

  # Initialize belief over actions (zero mean and unit variance).
  mean = tf.zeros((original_batch, horizon) + action_shape)
  stddev = tf.ones((original_batch, horizon) + action_shape)
  # Run the optimisation.
  mean, _ = tf.scan(
      iteration, tf.range(iterations), (mean, stddev), back_prop=False)
  # Select belief at the last iteration.
  mean = mean[-1]
  # Take only the first action, shape: (envs batch size,) + action_shape.
  mean = mean[:, 0]
  if discrete_action:
    # Apply the greedy policy.
    return greedy(mean, action_shape[0])
  else:
    # Return continuous actions.
    return mean
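# `greedy` is referenced above but not defined in this excerpt. From its use
# (turning a vector of per-action scores into a discrete action input for the
# dynamics cell), it presumably one-hots the argmax; a sketch under that
# assumption:
def greedy_sketch(scores, num_actions):
  """Hypothetical stand-in: map score vectors [..., num_actions] to one-hot actions."""
  return tf.one_hot(tf.argmax(scores, axis=-1), num_actions, dtype=scores.dtype)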
def decoder(state, data_shape):
  """Compute the data distribution of an observation from its state."""
  # `obs_size` and `num_channels_x` are module-level configuration values.
  kwargs2 = dict(strides=2, activation=tf.nn.relu)
  kwargs1 = dict(strides=1, activation=tf.nn.relu)
  hidden = tf.layers.dense(state, 1024, None)
  hidden = tf.reshape(hidden, [-1, 1, 1, hidden.shape[-1].value])
  if obs_size == (32, 32):
    hidden = tf.layers.conv2d_transpose(hidden, 128, 5, **kwargs1)
    hidden = tf.layers.conv2d_transpose(hidden, 64, 4, **kwargs2)
    hidden = tf.layers.conv2d_transpose(hidden, 32, 4, **kwargs1)
    hidden = tf.layers.conv2d_transpose(hidden, num_channels_x, 4, strides=2)
  elif obs_size == (64, 64):
    hidden = tf.layers.conv2d_transpose(hidden, 128, 5, **kwargs2)
    hidden = tf.layers.conv2d_transpose(hidden, 64, 5, **kwargs2)
    hidden = tf.layers.conv2d_transpose(hidden, 32, 6, **kwargs2)
    hidden = tf.layers.conv2d_transpose(hidden, num_channels_x, 6, strides=2)
  elif obs_size == (128, 128):
    hidden = tf.layers.conv2d_transpose(hidden, 256, 4, **kwargs1)
    hidden = tf.layers.conv2d_transpose(hidden, 256, 4, **kwargs1)
    hidden = tf.layers.conv2d_transpose(hidden, 128, 4, **kwargs1)
    hidden = tf.layers.conv2d_transpose(hidden, 128, 3, **kwargs1)
    hidden = tf.layers.conv2d_transpose(hidden, 64, 4, **kwargs2)
    # Transposed conv output size ~= pixels * stride + kernel_size.
    hidden = tf.layers.conv2d_transpose(hidden, 64, 4, **kwargs1)
    hidden = tf.layers.conv2d_transpose(hidden, 32, 4, **kwargs2)
    hidden = tf.layers.conv2d_transpose(hidden, 32, 4, **kwargs1)
    hidden = tf.layers.conv2d_transpose(hidden, num_channels_x, 4, strides=2)
  mean = hidden
  if obs_size == (32, 32):
    assert mean.shape[1:].as_list() == [32, 32, num_channels_x], mean.shape
  elif obs_size == (64, 64):
    assert mean.shape[1:].as_list() == [64, 64, num_channels_x], mean.shape
  elif obs_size == (128, 128):
    assert mean.shape[1:].as_list() == [128, 128, num_channels_x], mean.shape
  mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
  dist = tools.MSEDistribution(mean)
  dist = tfd.Independent(dist, len(data_shape))
  return dist
def cross_entropy_method_dual1(
    cell, objective_fn, state, obs_shape, action_shape, horizon, graph,
    logdir, task, amount=1000, topk=100, iterations=10, min_action=-1,
    max_action=1, eval_ratio=0.05, env_state=None):
  obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
  original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
  initial_state = tools.nested.map(
      lambda tensor: tf.tile(tensor, [amount] + [1] * (tensor.shape.ndims - 1)),
      state)
  extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
  use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
  obs = tf.zeros((extended_batch, horizon) + obs_shape)

  def gd_eval(state, actions, preds, logdir):
    from planet.control import wrappers
    repeat = wrappers.get_repeat(task_str=task[0])
    env_ctor = functools.partial(
        create_simple_env, task_str=task[0], repeat=repeat)
    evaluator = AsyncEvaluator(env_ctor, 30)
    name = 'cem_traj'
    path = os.path.join(logdir, name + '.npy')

    def eval(state, actions, preds):
      if evaluator.isclosed():
        print('evaluator closed, reopen now')
        evaluator.reopen(env_ctor, 10)
      promises = []
      for act in actions:
        promise = evaluator(state, act)
        promises.append(promise)
      gds = [promise() for promise in promises]
      gds = np.array(gds).astype(np.float32)
      batch = np.stack((gds, preds), 1)
      batch = np.expand_dims(batch, 0)
      if os.path.exists(path):
        prev = np.load(path)
        new = np.concatenate((prev, batch), 0)
      else:
        new = batch
      if new.shape[0] > 1000 * 10 / repeat - 300:
        print('end of task, close it')
        evaluator.close()
      print(new.shape)
      np.save(path, new)
      return batch

    all_reward = tf.py_func(eval, inp=[state, actions, preds], Tout=tf.float32)
    return all_reward

  def iteration(mean_and_stddev, id):
    mean, stddev, collect, _, _ = mean_and_stddev
    # Sample action proposals from belief.
    normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
    action = normal * stddev[:, None] + mean[:, None]
    all_action = tf.clip_by_value(action, min_action, max_action)
    # Evaluate proposal actions by latent imagination.
    action = tf.reshape(all_action, (extended_batch, horizon) + action_shape)
    (_, state), _ = tf.nn.dynamic_rnn(
        cell, (0 * obs, action, use_obs), initial_state=initial_state)
    return_ = objective_fn(state)
    # Optionally also evaluate the same actions in the ground-truth simulator.
    all_reward = tf.cond(
        collect,
        lambda: gd_eval(env_state, action, return_, logdir),
        lambda: tf.zeros((original_batch, amount, 2)))
    all_reward = tf.reshape(all_reward, (original_batch, amount, 2))
    with tf.control_dependencies([all_reward]):
      return_ = tf.reshape(return_, (original_batch, amount))
    # Re-fit belief to the best ones.
    _, indices = tf.nn.top_k(return_, topk, sorted=False)
    indices += tf.range(original_batch)[:, None] * amount
    best_actions = tf.gather(action, indices)
    mean, variance = tf.nn.moments(best_actions, 1)
    stddev = tf.sqrt(variance + 1e-6)
    return mean, stddev, collect, all_action, all_reward

  mean = tf.zeros((original_batch, horizon) + action_shape)
  stddev = tf.ones((original_batch, horizon) + action_shape)
  all_action = tf.zeros((original_batch, amount, horizon) + action_shape)
  all_reward = tf.zeros((original_batch, amount, 2))
  if iterations < 1:
    return mean
  collect = tf.cond(
      tf.random_uniform((), dtype=tf.float32) < eval_ratio,
      lambda: tf.ones((), tf.bool),
      lambda: tf.zeros((), tf.bool))
  a = tf.print(collect)
  with tf.control_dependencies([a]):
    mean, stddev, collect, all_action, all_reward = tf.scan(
        iteration, tf.range(iterations),
        (mean, stddev, collect, all_action, all_reward), back_prop=False)
  mean, stddev = mean[-1], stddev[-1]  # Select belief at last iteration.
  return mean, collect, all_action, all_reward
def simulator_planner(
    cell, objective_fn, state, obs_shape, action_shape, horizon, graph,
    logdir, task, amount=1000, topk=100, iterations=10, min_action=-1,
    max_action=1, eval_ratio=0.05, env_state=None):
  obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
  original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
  initial_state = tools.nested.map(
      lambda tensor: tf.tile(tensor, [amount] + [1] * (tensor.shape.ndims - 1)),
      state)
  extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
  use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
  obs = tf.zeros((extended_batch, horizon) + obs_shape)

  def gd_eval(state, actions):
    from planet.control import wrappers
    repeat = wrappers.get_repeat(task_str=task[0])
    env_ctor = functools.partial(
        create_simple_env, task_str=task[0], repeat=repeat)
    evaluator = AsyncEvaluator(env_ctor, 30)

    def eval(state, actions):
      if evaluator.isclosed():
        print('evaluator closed, reopen now')
        evaluator.reopen(env_ctor, 10)
      promises = []
      for act in actions:
        promise = evaluator(state, act)
        promises.append(promise)
      gds = [promise() for promise in promises]
      gds = np.stack(gds, 0).astype(np.float32)
      return gds

    op = tf.py_func(eval, inp=[state, actions], Tout=tf.float32)
    return op

  def iteration(mean_and_stddev, id):
    mean, stddev, collect = mean_and_stddev
    # Sample action proposals from belief.
    normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
    action = normal * stddev[:, None] + mean[:, None]
    action = tf.clip_by_value(action, min_action, max_action)
    # Evaluate proposal actions in the ground-truth simulator instead of
    # latent imagination.
    action = tf.reshape(action, (extended_batch, horizon) + action_shape)
    reward = gd_eval(env_state, action)
    return_ = tf.reduce_sum(reward, 1)
    return_ = tf.reshape(return_, (original_batch, amount))
    # Re-fit belief to the best ones.
    _, indices = tf.nn.top_k(return_, topk, sorted=False)
    indices += tf.range(original_batch)[:, None] * amount
    best_actions = tf.gather(action, indices)
    mean, variance = tf.nn.moments(best_actions, 1)
    stddev = tf.sqrt(variance)
    return mean, stddev, collect

  mean = tf.zeros((original_batch, horizon) + action_shape)
  stddev = tf.ones((original_batch, horizon) + action_shape)
  if iterations < 1:
    return mean
  collect = tf.cond(
      tf.random_uniform((), dtype=tf.float32) < eval_ratio,
      lambda: tf.ones((), tf.bool),
      lambda: tf.zeros((), tf.bool))
  mean, stddev, _ = tf.scan(
      iteration, tf.range(iterations), (mean, stddev, collect),
      back_prop=False)
  mean, stddev = mean[-1], stddev[-1]  # Select belief at last iteration.
  return mean
def compute_objectives(posterior, prior, target, graph, config, trainer):
  raw_features = graph.cell.features_from_state(posterior)
  heads = graph.heads
  objectives = []
  cstr_pct = 0.0
  for name, scale in config.loss_scales.items():
    if config.loss_scales[name] == 0.0:
      continue
    if name in config.heads and name not in config.gradient_heads:
      features = tf.stop_gradient(raw_features)
      include = r'.*/head_{}/.*'.format(name)
      exclude = None
    else:
      features = raw_features
      include = r'.*'
      exclude = None
    if name == 'divergence':
      loss = graph.cell.divergence_from_states(posterior, prior)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(Objective('divergence', loss, min, include, exclude))
    elif name == 'overshooting':
      shape = tools.shape(graph.data['action'])
      length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
      _, priors, posteriors, mask = tools.overshooting(
          graph.cell, {}, graph.embedded, graph.data['action'], length,
          config.overshooting_distance, posterior)
      posteriors, priors, mask = tools.nested.map(
          lambda x: x[:, :, 1:-1], (posteriors, priors, mask))
      if config.os_stop_posterior_grad:
        posteriors = tools.nested.map(tf.stop_gradient, posteriors)
      loss = graph.cell.divergence_from_states(posteriors, priors)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(Objective('overshooting', loss, min, include, exclude))
    elif name == 'reward' and config.r_loss == 'contra':
      pred = heads[name](features)
      if config.contra_unit == 'traj':
        print('Using traj loss')
        contra_loss, cstr_pct = contra_traj_lossV6(
            pred, target[name], horizon=config.contra_horizon)
      elif config.contra_unit == 'weighted':
        print('Using weighted trajectory loss ', config.contra_horizon)
        contra_loss, cstr_pct = contra_traj_lossV7(
            pred, target[name], horizon=config.contra_horizon,
            temp=config.temp)
      elif config.contra_unit == 'simclr':
        print('Using simclr trajectory loss ', config.contra_horizon)
        contra_loss, cstr_pct = contra_traj_lossV8(
            pred, target[name], horizon=config.contra_horizon)
      elif config.contra_unit == 'rank':
        print('Using ranking trajectory loss ', config.contra_horizon)
        contra_loss, cstr_pct = contra_traj_lossV9(
            pred, target[name], horizon=config.contra_horizon,
            margin=config.margin)
      else:
        raise NotImplementedError(config.contra_unit)
      objectives.append(Objective(name, contra_loss, min, include, exclude))
    elif name == 'reward' and config.r_loss == 'l2':
      pred = heads[name](features)
      l2_loss = tf.compat.v1.losses.mean_squared_error(target[name], pred)
      objectives.append(Objective(name, l2_loss, min, include, exclude))
    else:
      if not config.aug_same and config.aug:
        recon_feat = tf.concat([features, target['aug']], -1)
        print('Use recon feature ', name, recon_feat)
        logprob = heads[name](recon_feat).log_prob(target[name])
      else:
        logprob = heads[name](features).log_prob(target[name])
      objectives.append(Objective(name, logprob, max, include, exclude))
  objectives = [o._replace(value=tf.reduce_mean(o.value)) for o in objectives]
  return objectives, cstr_pct
def compute_objectives(posterior, prior, target, graph, config):
  raw_features = graph.cell.features_from_state(posterior)
  heads = graph.heads
  objectives = []
  cpc_logs = {}
  for name, scale in config.loss_scales.items():
    if config.loss_scales[name] == 0.0:
      continue
    if name in config.heads and name not in config.gradient_heads:
      features = tf.stop_gradient(raw_features)
      include = r'.*/head_{}/.*'.format(name)
      exclude = None
    else:
      features = raw_features
      include = r'.*'
      exclude = None
    if name == 'divergence':
      loss = graph.cell.divergence_from_states(posterior, prior)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(Objective('divergence', loss, min, include, exclude))
    elif name == 'latent_prior':
      num_actions = 10
      prev_states_flattened = tools.nested.map(
          lambda x: tf.reshape(x, (-1, x.shape[-1].value)), posterior)
      prev_states = tools.nested.map(
          lambda x: tf.tile(x, multiples=(num_actions, 1)),
          prev_states_flattened)
      batch_size = prev_states['sample'].shape[0].value
      prev_action = tf.random.uniform(
          (batch_size, graph.data['action'].shape[-1].value),
          minval=-1, maxval=1)
      obs = tf.zeros(shape=[batch_size] + graph.embedded.shape[2:].as_list())
      use_obs = tf.zeros((batch_size, 1), tf.bool)
      (next_states, _), _ = graph.cell((obs, prev_action, use_obs), prev_states)
      if not config.latent_prior_marginal:
        loss = graph.cell.divergence_from_states(prev_states, next_states)
      else:
        samples_next_state = tf.reshape(
            next_states['sample'],
            shape=(batch_size // num_actions, num_actions, -1))
        samples_next_state_mean = tf.reduce_mean(samples_next_state, axis=1)
        samples_current_state = tf.stop_gradient(
            prev_states_flattened['sample'])
        loss = tf.reduce_mean(tf.reduce_sum(
            tf.square(samples_next_state_mean - samples_current_state),
            axis=-1))
      objectives.append(Objective('latent_prior', loss, min, include, exclude))
    elif name == 'embedding_l2':
      loss = tf.reduce_mean(tf.reduce_sum(tf.square(graph.embedded), axis=-1))
      objectives.append(Objective('embedding_l2', loss, min, include, exclude))
    elif name == 'overshooting':
      shape = tools.shape(graph.data['action'])
      length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
      _, priors, posteriors, mask = tools.overshooting(
          graph.cell, {}, graph.embedded, graph.data['action'], length,
          config.overshooting_distance, posterior)
      posteriors, priors, mask = tools.nested.map(
          lambda x: x[:, :, 1:-1], (posteriors, priors, mask))
      if config.os_stop_posterior_grad:
        posteriors = tools.nested.map(tf.stop_gradient, posteriors)
      loss = graph.cell.divergence_from_states(posteriors, priors)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(Objective('overshooting', loss, min, include, exclude))
    elif name == 'cpc':
      loss, acc, reward_loss, reward_acc, gpenalty, kernels = networks.cpc(
          features if config.include_belief else posterior['sample'],
          graph, posterior,
          predict_terms=config.future,
          negative_samples=config.negatives,
          hard_negative_samples=config.hard_negatives,
          stack_actions=config.stack_actions,
          negative_actions=config.negative_actions,
          cpc_openloop=config.cpc_openloop,
          gradient_penalty=config.cpc_gpenalty_scale > 0,
          gpenalty_mode=config.gpenalty_mode)
      loss += reward_loss * config.cpc_reward_scale
      loss += gpenalty * config.cpc_gpenalty_scale
      objectives.append(Objective('cpc', loss, min, include, exclude))
      cpc_logs['acc'] = acc
      cpc_logs['reward_acc'] = reward_acc
      cpc_logs['gpenalty'] = gpenalty
      if kernels:
        for i in range(config.future):
          cpc_logs['W_mag%d' % i] = tf.reduce_mean(tf.square(kernels[i]))
    elif name == 'inverse_model':
      loss, acc = networks.inverse_model(
          features, graph, contrastive=config.action_contrastive,
          negative_samples=config.negatives)
      objectives.append(Objective('inverse_model', loss, min, include, exclude))
      if config.action_contrastive:
        cpc_logs['inverse_model_acc'] = acc
    else:
      logprob = heads[name](features).log_prob(target[name])
      objectives.append(Objective(name, logprob, max, include, exclude))
  objectives = [o._replace(value=tf.reduce_mean(o.value)) for o in objectives]
  return objectives, cpc_logs
def cross_entropy_method(
    cell, objective_fn, state, info_cmd, obs_shape, action_shape, horizon,
    amount=1000, topk=100, iterations=10, discount=0.99, min_action=-1,
    max_action=1):
  # state, info_cmd: shape (num_envs, 4): next_command_id,
  # goal_heading_degree, current_heading_degree, dist_to_intersection.
  # `PLANNING` is a module-level flag; `costn0`/`costn1` are module-level
  # branch functions for tf.case.
  obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
  original_batch = tools.shape(tools.nested.flatten(state)[0])[0]  # num_envs
  initial_state = tools.nested.map(
      lambda tensor: tf.tile(tensor, [amount] + [1] * (tensor.shape.ndims - 1)),
      state)
  extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
  use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
  obs = tf.zeros((extended_batch, horizon) + obs_shape)
  length = tf.ones([extended_batch], dtype=tf.int32) * horizon
  # info_cmd components.
  info_cmd = tf.squeeze(info_cmd)  # shape (4,)
  cmd_id, goal_heading_degree, current_heading_degree, dist_to_intersection = (
      info_cmd[0], info_cmd[1], info_cmd[2], info_cmd[3])

  def iteration(mean_and_stddev, _):
    mean, stddev = mean_and_stddev
    # Sample action proposals from belief.
    normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
    action = normal * stddev[:, None] + mean[:, None]
    action = tf.clip_by_value(action, min_action, max_action)
    # Evaluate proposal actions.
    action = tf.reshape(action, (extended_batch, horizon) + action_shape)
    (_, state), _ = tf.nn.dynamic_rnn(
        cell, (0 * obs, action, use_obs), initial_state=initial_state)
    # Objectives, e.g. {'reward': shape (1000, 12),
    # 'angular_speed_degree': shape (1000, 12), ...}.
    objectives = objective_fn(state)
    reward = objectives['reward']
    angular_speed = objectives['angular_speed_degree']
    forward_speed = objectives['forward_speed'] / 10.0
    collided = objectives['collided']
    intersection_offroad = objectives['intersection_offroad']
    intersection_otherlane = objectives['intersection_otherlane']
    if not PLANNING:
      # Planning reward from the learned reward head.
      return_ = discounted_return.discounted_return(
          reward, length, discount)[:, 0]  # shape (1000,)
      return_ = tf.reshape(return_, (original_batch, amount))  # (1, 1000)
      threshold_degree = tf.where(
          dist_to_intersection < 9, 9 * (9 - dist_to_intersection), 0)
      angular_turn_ = discounted_return.discounted_return(
          angular_speed, length, 1.0)[:, 0]  # shape (1000,)
      heading_loss = -tf.abs(delta_degree(
          goal_heading_degree - (current_heading_degree + angular_turn_))) * \
          tf.case({
              tf.equal(cmd_id, 3): costn1,
              tf.equal(cmd_id, 2): costn1,
              tf.equal(cmd_id, 1): costn1,
          }, default=costn0)
      heading_loss_weighted = heading_loss * tf.where(
          heading_loss > threshold_degree - 90,
          tf.ones((amount,)) * 0.3, tf.ones((amount,)) * 1000.0)
      return_heading = tf.reshape(
          heading_loss_weighted, (original_batch, amount))
      total_return = return_ + return_heading
    if PLANNING:
      # Planning reward composed from hand-crafted objective terms.
      rewards = forward_speed - 300.0 * tf.where(
          collided > 0.3, collided, tf.ones_like(collided) * 0.0) \
          - 20.0 * intersection_offroad - 10.0 * intersection_otherlane
      return_ = discounted_return.discounted_return(
          rewards, length, discount)[:, 0]  # shape (1000,)
      return_ = tf.reshape(return_, (original_batch, amount))  # (1, 1000)
      threshold_degree = tf.where(
          dist_to_intersection < 9, 9 * (9 - dist_to_intersection), 0)
      angular_turn_ = discounted_return.discounted_return(
          angular_speed, length, 1.0)[:, 0]  # shape (1000,)
      heading_loss = -tf.abs(delta_degree(
          goal_heading_degree - (current_heading_degree + angular_turn_))) * \
          tf.case({
              tf.equal(cmd_id, 3): costn1,
              tf.equal(cmd_id, 2): costn1,
              tf.equal(cmd_id, 1): costn1,
          }, default=costn0)
      heading_loss_weighted = heading_loss * tf.where(
          heading_loss > threshold_degree - 90,
          tf.ones((amount,)) * 0.3, tf.ones((amount,)) * 1000.0)
      return_heading = tf.reshape(
          heading_loss_weighted, (original_batch, amount))
      total_return = return_ + return_heading
    # Re-fit belief to the best ones.
    _, indices = tf.nn.top_k(total_return, topk, sorted=False)
    indices += tf.range(original_batch)[:, None] * amount
    best_actions = tf.gather(action, indices)
    mean, variance = tf.nn.moments(best_actions, 1)
    stddev = tf.sqrt(variance + 1e-6)
    return mean, stddev

  # COMMAND_ORDINAL = {'REACH_GOAL': 0, 'GO_STRAIGHT': 1, 'TURN_RIGHT': 2,
  #                    'TURN_LEFT': 3, 'LANE_FOLLOW': 4}
  # Compute action bias: [throttle, steer (left -, right +)].
  f_0 = lambda: tf.constant([0.0, 0.0])
  f_left = lambda: tf.constant([0.0, -0.5])
  f_right = lambda: tf.constant([0.0, 0.5])
  pred_func = {tf.equal(cmd_id, 3): f_left, tf.equal(cmd_id, 2): f_right}
  action_bias = tf.case(pred_func, default=f_0)
  mean = tf.zeros((original_batch, horizon) + action_shape)  # + action_bias
  stddev = tf.ones((original_batch, horizon) + action_shape)
  mean, stddev = tf.scan(
      iteration, tf.range(iterations), (mean, stddev), back_prop=False)
  mean, stddev = mean[-1], stddev[-1]  # Select belief at last iteration.
  return mean
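# `delta_degree` is referenced above but not defined in this excerpt. It
# presumably wraps an angle difference onto the circle so heading errors stay
# in [-180, 180]; a sketch under that assumption (likewise, `costn0`/`costn1`
# presumably return the constants 0.0 and 1.0):
def delta_degree_sketch(diff):
  """Hypothetical: wrap a degree difference into the range [-180, 180]."""
  return tf.math.floormod(diff + 180.0, 360.0) - 180.0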
def compute_objectives(posterior, prior, target, graph, config):
  heads = graph.heads
  objectives = []
  for name, scale in config.loss_scales.items():
    features = []
    if config.loss_scales[name] == 0.0:
      continue
    if name in config.heads and name not in config.gradient_heads:
      for mdl in range(len(posterior)):
        raw_features = graph.cell[mdl].features_from_state(posterior[mdl])
        features.append(tf.stop_gradient(raw_features))
      include = r'.*/head_{}/.*'.format(name)
      exclude = None
    else:
      for mdl in range(len(posterior)):
        raw_features = graph.cell[mdl].features_from_state(posterior[mdl])
        features.append(raw_features)
      include = r'.*'
      exclude = None
    if name == 'divergence':
      # Average the divergence across ensemble members.
      loss = graph.cell[0].divergence_from_states(posterior[0], prior[0])
      for mdl in range(1, len(posterior)):
        loss = tf.math.add(loss, graph.cell[mdl].divergence_from_states(
            posterior[mdl], prior[mdl]))
      loss = tf.math.scalar_mul(1.0 / len(posterior), loss)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(Objective('divergence', loss, min, include, exclude))
    elif name == 'overshooting':
      # Overshooting has not been adapted to ensembles; fail loudly.
      raise NotImplementedError('overshooting does not support ensembles')
    else:
      # Each ensemble member trains on its own bootstrap resample of the batch.
      bootstrap_target = tf.gather(
          target[name], graph.sample_with_replacement[0, :], axis=0)
      logprob = heads[name](features[0]).log_prob(bootstrap_target)
      for mdl in range(1, len(posterior)):
        bootstrap_target = tf.gather(
            target[name], graph.sample_with_replacement[mdl, :], axis=0)
        logprob = tf.math.add(
            logprob, heads[name](features[mdl]).log_prob(bootstrap_target))
      logprob = tf.math.scalar_mul(1.0 / len(posterior), logprob)
      objectives.append(Objective(name, logprob, max, include, exclude))
  objectives = [o._replace(value=tf.reduce_mean(o.value)) for o in objectives]
  return objectives