Example #1
def feed_forward(
        state, data_shape, num_layers=2, activation=tf.nn.relu,
        mean_activation=None, stop_gradient=False, trainable=True, units=100,
        std=1.0, low=-1.0, high=1.0, dist='normal'):
    """Create a model returning unnormalized MSE distribution."""
    hidden = state
    if stop_gradient:
        hidden = tf.stop_gradient(hidden)
    for _ in range(num_layers):
        hidden = tf.compat.v1.layers.dense(hidden, units, activation)
    mean = tf.compat.v1.layers.dense(
        hidden, int(np.prod(data_shape)), mean_activation, trainable=trainable)
    mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
    if std == 'learned':
        std = tf.compat.v1.layers.dense(
            hidden, int(np.prod(data_shape)), None, trainable=trainable)
        std = tf.nn.softplus(std + 0.55) + 0.01
        std = tf.reshape(std, tools.shape(state)[:-1] + data_shape)
    if dist == 'normal':
        dist = tfd.Normal(mean, std)
    elif dist == 'truncated_normal':
        # https://www.desmos.com/calculator/3o96eyqxib
        dist = tfd.TruncatedNormal(mean, std, low, high)
    elif dist == 'tanh_normal':
        # https://www.desmos.com/calculator/sxpp7ectjv
        dist = tfd.Normal(mean, std)
        dist = tfd.TransformedDistribution(dist, tfp.bijectors.Tanh())
    elif dist == 'deterministic':
        dist = tfd.Deterministic(mean)
    else:
        raise NotImplementedError(dist)
    dist = tfd.Independent(dist, len(data_shape))
    return dist
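
A usage sketch for the head above (not from the source; it assumes the imports these snippets rely on throughout: `tensorflow` as `tf`, `numpy` as `np`, `tfd`/`tfp` from `tensorflow_probability`, and the project's `tools` module; shapes are hypothetical):

features = tf.zeros([40, 50, 230])  # (batch, time, feature) placeholder
reward_dist = feed_forward(features, data_shape=[1], dist='normal', std='learned')
nll = -reward_dist.log_prob(tf.zeros([40, 50, 1]))  # shape (40, 50)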
Example #2
def cross_entropy_method(cell,
                         objective_fn,
                         state,
                         obs_shape,
                         action_shape,
                         horizon,
                         graph,
                         amount=1000,
                         topk=100,
                         iterations=10,
                         min_action=-1,
                         max_action=1):
    num_models = graph.config.num_models
    obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
    original_batch = tools.shape(tools.nested.flatten(state[0])[0])[0]
    initial_state = []
    for mdl in range(num_models):
        initial_state.append(
            tools.nested.map(
                lambda tensor: tf.tile(tensor, [amount] + [1] *
                                       (tensor.shape.ndims - 1)), state[mdl]))
    extended_batch = tools.shape(tools.nested.flatten(initial_state[0])[0])[0]
    use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
    obs = tf.zeros((extended_batch, horizon) + obs_shape)

    def iteration(mean_and_stddev, _):
        all_model_states = []
        mean, stddev = mean_and_stddev
        # Sample action proposals from belief.
        normal = tf.random_normal((original_batch, amount, horizon) +
                                  action_shape)
        action = normal * stddev[:, None] + mean[:, None]
        action = tf.clip_by_value(action, min_action, max_action)
        # Evaluate proposal actions.
        action = tf.reshape(action, (extended_batch, horizon) + action_shape)
        for mdl in range(num_models):
            (_, state), _ = tf.nn.dynamic_rnn(cell[mdl],
                                              (0 * obs, action, use_obs),
                                              initial_state=initial_state[mdl])
            all_model_states.append(state)

        return_ = objective_fn(all_model_states)
        return_ = tf.reshape(return_, (original_batch, amount))
        # Re-fit belief to the best ones.
        _, indices = tf.nn.top_k(return_, topk, sorted=False)
        indices += tf.range(original_batch)[:, None] * amount
        best_actions = tf.gather(action, indices)
        mean, variance = tf.nn.moments(best_actions, 1)
        stddev = tf.sqrt(variance + 1e-6)
        return mean, stddev

    mean = tf.zeros((original_batch, horizon) + action_shape)
    stddev = tf.ones((original_batch, horizon) + action_shape)
    if iterations < 1:
        return mean
    mean, stddev = tf.scan(iteration,
                           tf.range(iterations), (mean, stddev),
                           back_prop=False)
    mean, stddev = mean[-1], stddev[-1]  # Select belief from the last iteration.
    return mean
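
For intuition, the same cross-entropy-method loop in plain NumPy on a toy objective (a self-contained illustration of the algorithm, not project code):

import numpy as np

def cem_numpy(objective, horizon, action_dim, amount=1000, topk=100, iterations=10):
    mean = np.zeros((horizon, action_dim))
    stddev = np.ones((horizon, action_dim))
    for _ in range(iterations):
        # Sample proposals from the belief and clip to the valid action range.
        actions = np.random.normal(mean, stddev, (amount, horizon, action_dim))
        actions = np.clip(actions, -1.0, 1.0)
        returns = np.array([objective(a) for a in actions])
        # Re-fit the belief to the top-k proposals.
        best = actions[np.argsort(returns)[-topk:]]
        mean, stddev = best.mean(0), best.std(0) + 1e-6
    return mean

# Example: steer the plan toward a fixed target action sequence.
target = np.full((12, 2), 0.3)
plan = cem_numpy(lambda a: -np.square(a - target).sum(), horizon=12, action_dim=2)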
Example #3
def cross_entropy_method(cell,
                         objective_fn,
                         state,
                         obs_shape,
                         action_shape,
                         horizon,
                         amount=1000,
                         topk=100,
                         iterations=10,
                         discount=0.99,
                         min_action=-1,
                         max_action=1):
    obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
    original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
    initial_state = tools.nested.map(
        lambda tensor: tf.tile(tensor, [amount] + [1] *
                               (tensor.shape.ndims - 1)), state)
    extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
    use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
    obs = tf.zeros((extended_batch, horizon) + obs_shape)
    length = tf.ones([extended_batch], dtype=tf.int32) * horizon

    def iteration(mean_and_stddev, _):
        mean, stddev = mean_and_stddev
        # Sample action proposals from belief.
        normal = tf.random_normal((original_batch, amount, horizon) +
                                  action_shape)
        action = normal * stddev[:, None] + mean[:, None]
        action = tf.clip_by_value(action, min_action, max_action)
        # Evaluate proposal actions.
        action = tf.reshape(action, (extended_batch, horizon) + action_shape)
        (_, state), _ = tf.nn.dynamic_rnn(cell, (0 * obs, action, use_obs),
                                          initial_state=initial_state)
        reward = objective_fn(state)
        return_ = discounted_return.discounted_return(reward, length,
                                                      discount)[:, 0]
        return_ = tf.reshape(return_, (original_batch, amount))
        # Re-fit belief to the best ones.
        _, indices = tf.nn.top_k(return_, topk, sorted=False)
        indices += tf.range(original_batch)[:, None] * amount
        best_actions = tf.gather(action, indices)
        mean, variance = tf.nn.moments(best_actions, 1)
        stddev = tf.sqrt(variance + 1e-6)
        return mean, stddev

    mean = tf.zeros((original_batch, horizon) + action_shape)
    stddev = tf.ones((original_batch, horizon) + action_shape)
    mean, stddev = tf.scan(iteration,
                           tf.range(iterations), (mean, stddev),
                           back_prop=False)
    mean, stddev = mean[-1], stddev[-1]  # Select belief from the last iteration.
    return mean
Example #4
def encoder(obs, embedding_size=1024):
    """Extract deterministic features from an observation."""
    kwargs = dict(strides=2, activation=tf.nn.relu)
    obs_is_dict = isinstance(obs, dict)
    if obs_is_dict:
        hidden = tf.reshape(obs['image'],
                            [-1] + obs['image'].shape[2:].as_list())
    else:
        hidden = obs
    img_size = hidden.shape[2:].as_list()[0]  # spatial size, assuming square images
    if img_size == 64:
        hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs)
    hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs)
    hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs)
    hidden = tf.layers.conv2d(hidden, 256, 4, **kwargs)
    hidden = tf.layers.flatten(hidden)
    assert hidden.shape[1:].as_list() == [1024], hidden.shape.as_list()
    if embedding_size != 1024:
        hidden = tf.layers.dense(hidden, units=embedding_size)
    if obs_is_dict:
        hidden = tf.reshape(
            hidden,
            tools.shape(obs['image'])[:2] +
            [np.prod(hidden.shape[1:].as_list())])
    return hidden
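
As a shape check for the 64x64 path (an illustration): a VALID convolution maps size to floor((size - kernel) / stride) + 1, so the four stride-2, kernel-4 layers give 64 -> 31 -> 14 -> 6 -> 2, and flattening 2x2x256 yields exactly the 1024 features the assert expects:

def conv_out(size, kernel=4, stride=2):
    # Output size of a VALID convolution.
    return (size - kernel) // stride + 1

size = 64
for _ in range(4):
    size = conv_out(size)  # 64 -> 31 -> 14 -> 6 -> 2
assert size * size * 256 == 1024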
Example #5
def encoder(obs):
    """Extract deterministic features from an observation."""
    kwargs2 = dict(strides=2, activation=tf.nn.relu)
    kwargs1 = dict(strides=1, activation=tf.nn.relu)
    kwargs = dict(strides=2, activation=tf.nn.elu)
    hidden = tf.reshape(obs['image'], [-1] + obs['image'].shape[2:].as_list()
                        )  # (50,50,64,64,3) reshape to (2500,64,64,3)

    if obs_size == (32, 32):
        hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs1)
        hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 256, 4, **kwargs1)

    elif obs_size == (64, 64):
        hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 256, 4, **kwargs2)

    elif obs_size == (96, 96):
        hidden = tf.layers.conv2d(hidden, 32, 8, **kwargs)
        hidden = tf.layers.conv2d(hidden, 64, 5, **kwargs)
        hidden = tf.layers.conv2d(hidden, 72, 5, **kwargs)
        hidden = tf.layers.conv2d(hidden, 128, 5, **kwargs)
        hidden = tf.layers.conv2d(hidden, 1024, 3, strides=1)

    elif obs_size == (128, 128):
        hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 32, 3, **kwargs1)
        hidden = tf.layers.conv2d(hidden, 64, 3, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 64, 3, **kwargs1)
        hidden = tf.layers.conv2d(hidden, 128, 3, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 128, 3, **kwargs1)
        hidden = tf.layers.conv2d(hidden, 256, 3, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 256, 3, **kwargs2)

    hidden = tf.layers.flatten(hidden)
    assert hidden.shape[1:].as_list() == [1024], hidden.shape.as_list()
    hidden = tf.reshape(
        hidden,
        tools.shape(obs['image'])[:2] + [np.prod(hidden.shape[1:].as_list())])
    return hidden  # shape(50,50,1024)
Example #6
def sample_pair(batch):
    num_sam = tools.shape(batch)[0]
    index = tf.range(num_sam)
    tgt1 = tf.slice(batch, [0, 1], [num_sam, 1])
    pred1 = tf.slice(batch, [0, 0], [num_sam, 1])

    def uniform():
        batch2 = tf.gather(batch, tf.random.shuffle(index))
        pred2 = tf.slice(batch2, [0, 0], [num_sam, 1])
        tgt2 = tf.slice(batch2, [0, 1], [num_sam, 1])
        return pred1, pred2, tgt1, tgt2

    return uniform
Example #7
def compute_error_loss(pred1, pred2, tgt1, tgt2, hard_ratio=1.0):
    geq = tgt1 > tgt2
    tgt_larg = tf.where(geq, tgt1, tgt2)
    tgt_small = tf.where(geq, tgt2, tgt1)
    pred_larg = tf.where(geq, pred1, pred2)
    pred_small = tf.where(geq, pred2, pred1)
    loss = tf.maximum(0., (tgt_larg - tgt_small) - (pred_larg - pred_small))
    if hard_ratio < 1.0:
        hard_num = tf.cast(tools.shape(pred1)[0] * hard_ratio, tf.int32)
        loss = tf.reshape(loss, [-1])
        hard_loss, _ = tf.math.top_k(loss, k=hard_num)
        return hard_loss
    return loss
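
A sketch tying the two helpers together (an assumed usage; `pred`/`tgt` are hypothetical per-step predictions and targets, stacked as columns the same way contra_step_lossV5 in Example #12 builds its `batch`):

pred, tgt = tf.random.normal([32]), tf.random.normal([32])
batch = tf.stack([pred, tgt], 1)                 # (N, 2): column 0 preds, column 1 targets
pred1, pred2, tgt1, tgt2 = sample_pair(batch)()  # call the returned `uniform` sampler
loss = tf.reduce_mean(compute_error_loss(pred1, pred2, tgt1, tgt2))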
Example #8
def feed_forward(
    state, data_shape, num_layers=3, activation=None, cut_gradient=False):
  """Create a model returning unnormalized MSE distribution."""
  hidden = state
  if cut_gradient:
    hidden = tf.stop_gradient(hidden)
  for _ in range(num_layers):
    hidden = tf.layers.dense(hidden, 100, tf.nn.relu)                    # e.g. state:shape(40,50,1,230)-->hidden:shape(40,50,1,100)
  mean = tf.layers.dense(hidden, int(np.prod(data_shape)), activation)   # e.g. --> mean:shape(40,50,1,1)
  mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)          # e.g. mean:shape(40,50,1,1)
  dist = tools.MSEDistribution(mean)
  dist = tfd.Independent(dist, len(data_shape))
  return dist
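
`tools.MSEDistribution` is project code not shown in these snippets; the assumed semantics is a pseudo-distribution whose log_prob is the negative squared error, so maximizing likelihood minimizes MSE. A minimal sketch of that assumption (the real class would subclass tfd.Distribution so it composes with tfd.Independent):

class MSEDistributionSketch(object):
    """Hypothetical stand-in: log_prob(x) = -(x - mean) ** 2."""

    def __init__(self, mean):
        self._mean = mean

    def log_prob(self, event):
        return -tf.square(event - self._mean)

    def mode(self):
        return self._mean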
Example #9
def cross_entropy_method_dual2(cell,
                               objective_fn,
                               state,
                               all_actions,
                               all_reward,
                               obs_shape,
                               action_shape,
                               horizon,
                               graph,
                               logdir,
                               task,
                               amount=1000,
                               topk=100,
                               iterations=10,
                               min_action=-1,
                               max_action=1,
                               eval_ratio=0.05,
                               env_state=None):
    obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
    original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
    initial_state = tools.nested.map(
        lambda tensor: tf.tile(tensor, [amount] + [1] *
                               (tensor.shape.ndims - 1)), state)
    extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
    use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
    obs = tf.zeros((extended_batch, horizon) + obs_shape)

    def iteration(rival_actions):
        rival_actions = tf.squeeze(rival_actions)
        (_, state), _ = tf.nn.dynamic_rnn(cell,
                                          (0 * obs, rival_actions, use_obs),
                                          initial_state=initial_state)
        return_ = objective_fn(state)
        return_ = tf.reshape(return_, (original_batch, amount, 1))
        return return_

    all_pred = tf.map_fn(iteration, all_actions, parallel_iterations=1)
    triple = tf.concat([all_reward, all_pred], 3)
    return triple
Example #10
def decoder(state, data_shape):
  """Compute the data distribution of an observation from its state."""
  hidden = tf.layers.dense(state, 500, tf.nn.relu)
  hidden = tf.layers.dense(hidden, 500, tf.nn.relu)
  hidden = tf.layers.dense(hidden, 26, None)
  mean = hidden
  mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
  dist = tools.MSEDistribution(mean)
  dist = tfd.Independent(dist, len(data_shape))
  return dist
Example #11
def encoder(obs):
    """Extract deterministic features from an observation."""
    kwargs = dict(strides=2, activation=tf.nn.relu)
    hidden = tf.reshape(obs['image'], [-1] + obs['image'].shape[2:].as_list())
    hidden = tf.compat.v1.layers.conv2d(hidden, 32, 4, **kwargs)
    hidden = tf.compat.v1.layers.conv2d(hidden, 64, 4, **kwargs)
    hidden = tf.compat.v1.layers.conv2d(hidden, 128, 4, **kwargs)
    hidden = tf.compat.v1.layers.conv2d(hidden, 256, 4, **kwargs)
    hidden = tf.compat.v1.layers.flatten(hidden)
    assert hidden.shape[1:].as_list() == [1024], hidden.shape.as_list()
    hidden = tf.reshape(hidden, tools.shape(obs['image'])[:2] + [
        np.prod(hidden.shape[1:].as_list())])
    return hidden
Example #12
def contra_step_lossV5(pred, tgt, resample=1):
    pred_flat = tf.reshape(pred, [-1])
    tgt_flat = tf.reshape(tgt, [-1])
    batch = tf.stack([pred_flat, tgt_flat], 1)
    num_sam = tools.shape(batch)[0]
    index = tf.range(num_sam)
    divider = tf.constant(resample, dtype=tf.float32)

    def sample_compute(cur_loss, i):
        batch1 = tf.gather(batch, tf.random.shuffle(index))
        batch2 = tf.gather(batch, tf.random.shuffle(index))
        pred1 = tf.slice(batch1, [0, 0], [num_sam, 1])
        pred2 = tf.slice(batch2, [0, 0], [num_sam, 1])
        tgt1 = tf.slice(batch1, [0, 1], [num_sam, 1])
        tgt2 = tf.slice(batch2, [0, 1], [num_sam, 1])
        loss = cur_loss + compute_contra_loss(pred1, pred2, tgt1, tgt2)
        return (loss, i + 1)

    i = tf.constant(0)
    loss = tf.constant(0.)
    final_loss = tf.while_loop(lambda l, i: i < resample, sample_compute,
                               [loss, i])[0]
    avg_loss = tf.reduce_mean(final_loss) / divider
    return avg_loss
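
A usage sketch (an illustration; `compute_contra_loss` is not defined in these snippets and is assumed to share the signature of `compute_error_loss` from Example #7):

pred = tf.random.normal([40, 50])  # hypothetical reward predictions
tgt = tf.random.normal([40, 50])   # hypothetical reward targets
loss = contra_step_lossV5(pred, tgt, resample=3)  # scalar mean over 3 shuffled rounds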
Example #13
def compute_objectives(posterior, prior, target, graph, config):
    raw_features = graph.cell.features_from_state(posterior)
    heads = graph.heads
    objectives = []
    for name, scale in config.loss_scales.items():
        if scale == 0.0:
            continue
        if name in config.heads and name not in config.gradient_heads:
            features = tf.stop_gradient(raw_features)
            include = r'.*/head_{}/.*'.format(name)
            exclude = None
        else:
            features = raw_features
            include = r'.*'
            exclude = None

        if name == 'divergence':
            loss = graph.cell.divergence_from_states(posterior, prior)
            if config.free_nats is not None:
                loss = tf.maximum(0.0, loss - float(config.free_nats))
            objectives.append(
                Objective('divergence', loss, min, include, exclude))

        elif name == 'overshooting':
            shape = tools.shape(graph.data['action'])
            length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
            _, priors, posteriors, mask = tools.overshooting(
                graph.cell, {}, graph.embedded, graph.data['action'], length,
                config.overshooting_distance, posterior)
            posteriors, priors, mask = tools.nested.map(
                lambda x: x[:, :, 1:-1], (posteriors, priors, mask))
            if config.os_stop_posterior_grad:
                posteriors = tools.nested.map(tf.stop_gradient, posteriors)
            loss = graph.cell.divergence_from_states(posteriors, priors)
            if config.free_nats is not None:
                loss = tf.maximum(0.0, loss - float(config.free_nats))
            objectives.append(
                Objective('overshooting', loss, min, include, exclude))

        else:
            if name == 'image':
                logprob = heads[name](features).log_prob(
                    target[name][:, :, :, :, -3:])
            else:
                logprob = heads[name](features).log_prob(target[name])
            objectives.append(Objective(name, logprob, max, include, exclude))

    objectives = [
        o._replace(value=tf.reduce_mean(o.value)) for o in objectives
    ]
    return objectives
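
`Objective` is defined elsewhere in the training code; a sketch of the assumed definition, a namedtuple whose `goal` is the builtin `min` or `max` and whose `include`/`exclude` are variable-name regexes for the optimizer:

import collections

Objective = collections.namedtuple(
    'Objective', 'name value goal include exclude')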
Example #14
def decoder(state, data_shape):
    """Compute the data distribution of an observation from its state."""
    kwargs = dict(strides=2, activation=tf.nn.relu)
    hidden = tf.compat.v1.layers.dense(state, 1024, None)
    hidden = tf.reshape(hidden, [-1, 1, 1, hidden.shape[-1]])
    hidden = tf.compat.v1.layers.conv2d_transpose(hidden, 128, 5, **kwargs)
    hidden = tf.compat.v1.layers.conv2d_transpose(hidden, 64, 5, **kwargs)
    hidden = tf.compat.v1.layers.conv2d_transpose(hidden, 32, 6, **kwargs)
    mean = tf.compat.v1.layers.conv2d_transpose(hidden, 3, 6, strides=2)
    assert mean.shape[1:].as_list() == [64, 64, 3], mean.shape
    mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
    dist = tfd.Normal(mean, 1.0)
    dist = tfd.Independent(dist, len(data_shape))
    return dist
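
Usage sketch for the convolutional decoder (illustrative shapes; same import assumptions as above):

state = tf.zeros([40, 50, 230])  # (batch, time, state) placeholder
image_dist = decoder(state, data_shape=[64, 64, 3])
logprob = image_dist.log_prob(tf.zeros([40, 50, 64, 64, 3]))  # shape (40, 50)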
Example #15
def feed_forward(state,
                 data_shape,
                 num_layers=2,
                 activation=None,
                 cut_gradient=False):
    """Create a model returning unnormalized MSE distribution."""
    hidden = state
    if cut_gradient:
        hidden = tf.stop_gradient(hidden)
    for _ in range(num_layers):
        hidden = tf.layers.dense(hidden, 100, tf.nn.relu)
    mean = tf.layers.dense(hidden, int(np.prod(data_shape)), activation)
    mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
    dist = tfd.Normal(mean, 1.0)
    dist = tfd.Independent(dist, len(data_shape))
    return dist
Example #16
def encoder(obs):
    """Extract deterministic features from an observation."""
    kwargs2 = dict(strides=2, activation=tf.nn.relu)
    kwargs1 = dict(strides=1, activation=tf.nn.relu)
    hidden = tf.reshape(obs['image'], [-1] + obs['image'].shape[2:].as_list()
                        )  # (50,50,64,64,3) reshape to (2500,64,64,3)

    if obs_size == (32, 32):
        hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs1)
        hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 256, 4, **kwargs1)

    elif obs_size == (64, 64):
        hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 256, 4, **kwargs2)

    elif obs_size == (128, 128):
        hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 32, 3, **kwargs1)
        hidden = tf.layers.conv2d(hidden, 64, 3, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 64, 3, **kwargs1)
        hidden = tf.layers.conv2d(hidden, 128, 3, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 128, 3, **kwargs1)
        hidden = tf.layers.conv2d(hidden, 256, 3, **kwargs2)
        hidden = tf.layers.conv2d(hidden, 256, 3, **kwargs2)

    hidden = tf.layers.flatten(hidden)
    assert hidden.shape[1:].as_list() == [1024], hidden.shape.as_list()
    hidden = tf.reshape(
        hidden,
        tools.shape(obs['image'])[:2] + [np.prod(hidden.shape[1:].as_list())])
    return hidden  # shape(50,50,1024)
Example #17
def cpc(context,
        graph,
        posterior,
        predict_terms=3,
        negative_samples=5,
        hard_negative_samples=0,
        stack_actions=False,
        negative_actions=False,
        cpc_openloop=False,
        gradient_penalty=False,
        gpenalty_mode=0):
    """
    :param context: shape = (batch_size, chunk_length, context_size)
    :param embedding: shape = (batch_size, chunk_length, embedding_size)
    :param gpenalty_mode: 0 is concatenated, 1 is separate, 2 is three terms
    :return: cross entropy loss
    """
    # x, preds, y_true
    effective_horizon = context.shape[1].value - predict_terms
    embedding = graph.embedded
    actions = graph.data['action']
    if cpc_openloop:
        shape = tools.shape(actions)
        length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
        context_to_use = get_overshoot_preds(graph, embedding, actions, length,
                                             predict_terms, posterior)
        context_to_use = merge_first_two_dim(
            context_to_use)  # shape = N x predict_terms x sample_size
        if negative_actions:
            context_to_use = context_to_use[:, :, None]
            for _ in range(negative_samples):
                random_actions = tf.random.uniform(actions.shape,
                                                   minval=-1,
                                                   maxval=1)
                negative_context = get_overshoot_preds(graph, embedding,
                                                       random_actions, length,
                                                       predict_terms,
                                                       posterior)
                negative_context = merge_first_two_dim(negative_context)[:, :,
                                                                         None]
                context_to_use = tf.concat([context_to_use, negative_context],
                                           axis=2)

    else:
        context_to_use = context[:, :-predict_terms, :]
        context_to_use = tf.reshape(context_to_use,
                                    [-1] + context_to_use.shape[2:].as_list())
        if stack_actions:
            future_actions = tf.stack(
                [
                    tf.reshape(actions[:, i:i + predict_terms],
                               (actions.shape[0].value, -1))
                    for i in range(effective_horizon)
                ],
                axis=1
            )  # batch x effective_horizon x (predict_terms * action_dim)
            assert future_actions.shape[1].value == effective_horizon
            future_actions = merge_first_two_dim(future_actions)
            future_actions = tf.layers.dense(future_actions,
                                             units=256,
                                             activation='relu')
            future_actions = tf.layers.dense(future_actions,
                                             units=256,
                                             activation='relu')
            future_actions = tf.layers.dense(
                future_actions, units=30,
                activation='linear')  # 30 is the size of the state space

            if negative_actions:
                image_context = context_to_use
                context_to_use = tf.concat([image_context, future_actions],
                                           axis=-1)[:, None]
                for _ in range(negative_samples):
                    current_context = tf.concat([
                        image_context,
                        tf.random.uniform(
                            future_actions.shape, minval=-1, maxval=1)
                    ],
                                                axis=-1)
                    context_to_use = tf.concat(
                        [context_to_use, current_context[:, None]],
                        axis=1)  # shape (N x negatives x action_dim)
            else:
                context_to_use = tf.concat([context_to_use, future_actions],
                                           axis=-1)

    reward = graph.data['reward'][:, :, None]
    x, y_true = format_cpc_data(context_to_use,
                                embedding,
                                predict_terms,
                                negative_samples,
                                num_hard_negatives=hard_negative_samples,
                                negative_actions=negative_actions)
    _, reward_y_true = format_cpc_data(context_to_use,
                                       reward,
                                       predict_terms,
                                       negative_samples,
                                       negative_actions=negative_actions)

    code_size = embedding.shape[-1].value

    if cpc_openloop:
        preds = network_prediction_openloop(x, code_size, predict_terms)
        reward_preds = network_prediction_openloop(x,
                                                   1,
                                                   predict_terms,
                                                   name='reward')
    else:
        preds, kernels = network_prediction(x, code_size, predict_terms)
        reward_preds, _ = network_prediction(x,
                                             1,
                                             predict_terms,
                                             name='reward')

    if negative_actions:
        logits = cpc_layer(y_true, preds)
        reward_logits = cpc_layer(reward_y_true, reward_preds)
    else:
        logits = cpc_layer(preds, y_true)
        reward_logits = cpc_layer(reward_preds, reward_y_true)

    labels_zero = tf.zeros(dtype=tf.float32,
                           shape=(x.shape[0], predict_terms, negative_samples))
    labels_one = tf.ones(dtype=tf.float32,
                         shape=(x.shape[0], predict_terms, 1))
    labels = tf.concat([labels_one, labels_zero], axis=-1)

    loss = cross_entropy_loss(labels, logits)
    acc = calc_acc(labels, logits)

    reward_loss = cross_entropy_loss(labels, reward_logits)
    reward_acc = calc_acc(labels, reward_logits)

    if gradient_penalty:
        gpenalty = tf.constant(0, dtype=tf.float32)


        batch_size, horizon = graph.data['reward'].shape.as_list()
        effective_horizon = horizon - predict_terms
        s_t = x
        for k in range(predict_terms):
            matrix = kernels[k]
            z_tk = merge_first_two_dim(embedding[:, 1 + k:1 + k +
                                                 effective_horizon])
            wk_d_ct = tf.transpose(
                tf.linalg.matmul(matrix, s_t, transpose_b=True))
            wk_d_ztk = tf.transpose(
                tf.linalg.matmul(matrix,
                                 z_tk,
                                 transpose_a=True,
                                 transpose_b=True))
            grad = tf.concat([wk_d_ct, wk_d_ztk], axis=-1)
            if gpenalty_mode == 0:
                gpenalty += tf.reduce_mean(
                    tf.reduce_sum(tf.square(wk_d_ct), axis=-1))
                gpenalty += tf.reduce_mean(
                    tf.reduce_sum(tf.square(wk_d_ztk), axis=-1))
            elif gpenalty_mode == 1:
                gpenalty += tf.reduce_mean(
                    tf.pow(tf.norm(grad, axis=-1) - 1, 2))
            elif gpenalty_mode == 2:
                gpenalty += tf.reduce_mean(tf.square(matrix))
            else:
                print("gpenalty mode not supported!")

        # gpenalty /= predict_terms
        if gpenalty_mode == 2:
            gpenalty += tf.reduce_mean(tf.square(s_t))
            gpenalty += tf.reduce_mean(tf.square(embedding))

        return loss, acc, reward_loss, reward_acc, gpenalty, kernels

    return loss, acc, reward_loss, reward_acc, 0., None if cpc_openloop else kernels
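
The scoring step of the CPC objective, stripped to its essence (an illustration of the assumed `cpc_layer` semantics: dot products between each prediction and one positive plus `negative_samples` negative codes, classified against the labels built above):

def cpc_scores(preds, y):
    # preds: (N, predict_terms, code); y: (N, predict_terms, 1 + negatives, code).
    # Returns logits of shape (N, predict_terms, 1 + negatives); index 0 is the positive.
    return tf.reduce_sum(preds[:, :, None, :] * y, axis=-1)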
Example #18
def cross_entropy_method(cell,
                         objective_fn,
                         state,
                         obs_shape,
                         action_shape,
                         horizon,
                         amount=1000,
                         topk=100,
                         iterations=10,
                         discount=0.99,
                         min_action=-1,
                         max_action=1,
                         command=1):
    obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
    original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
    initial_state = tools.nested.map(
        lambda tensor: tf.tile(tensor, [amount] + [1] *
                               (tensor.shape.ndims - 1)), state)
    extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
    use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
    obs = tf.zeros((extended_batch, horizon) + obs_shape)
    length = tf.ones([extended_batch], dtype=tf.int32) * horizon

    def iteration(mean_and_stddev, _):
        mean, stddev, command = mean_and_stddev
        # mean, stddev shape: (original_batch, horizon) + action_shape, e.g. (1, 12, 2)
        # Sample action proposals from belief.
        normal = tf.random_normal((original_batch, amount, horizon) +
                                  action_shape)
        action = normal * stddev[:, None] + mean[:, None]
        action = tf.clip_by_value(action, min_action, max_action)
        # Evaluate proposal actions.
        action = tf.reshape(action, (extended_batch, horizon) + action_shape)
        (_, state), _ = tf.nn.dynamic_rnn(cell, (0 * obs, action, use_obs),
                                          initial_state=initial_state)
        reward = objective_fn(state)
        bond_turn = tf.reshape(tf.reduce_sum(action[:, :, 1], axis=1),
                               [original_batch, amount])
        bond_turn = tf.clip_by_value(bond_turn, -10, 10)
        bond_keep = tf.reshape(tf.reduce_sum(action[:, :, 0], axis=1),
                               [original_batch, amount])
        bond_straight = tf.reshape(
            tf.reduce_sum(action[:, :, 0], axis=1),
            [original_batch, amount]) - 0.2 * tf.reshape(
                tf.reduce_sum(tf.abs(action[:, :, 1]), axis=1),
                [original_batch, amount])
        bond_straight = tf.clip_by_value(bond_straight, -8, 8)
        bond_keep = tf.clip_by_value(bond_keep, -8, 8)

        def f1():
            return bond_straight  # go straight bond

        def f2():
            return bond_turn + 0.2 * bond_keep  # right turn bond

        def f3():
            return -bond_turn + 0.2 * bond_keep  # left turn bond

        def f4():
            return bond_keep  # lane keep bond

        bond = tf.case(
            {
                tf.reduce_all(tf.equal(command, 2)): f2,
                tf.reduce_all(tf.equal(command, 3)): f3,
                tf.reduce_all(tf.equal(command, 4)): f4
            },
            default=f1,
            exclusive=True)

        return_ = discounted_return.discounted_return(reward, length,
                                                      discount)[:, 0]
        return_ = tf.reshape(return_, (original_batch, amount))
        if PLAN_BOND:
            return_ += bond * 0.2
        # Re-fit belief to the best ones.
        _, indices = tf.nn.top_k(return_, topk, sorted=False)
        indices += tf.range(original_batch)[:, None] * amount
        best_actions = tf.gather(action, indices)
        mean, variance = tf.nn.moments(best_actions, 1)
        stddev = tf.sqrt(variance + 1e-6)
        return mean, stddev, command

    mean = tf.zeros((original_batch, horizon) + action_shape)

    def f1():
        x = tf.concat([mean[:, :, 0] + 0.6, mean[:, :, 1]], 0)
        return tf.expand_dims(tf.transpose(x), 0)

    def f2():
        x = tf.concat([mean[:, :, 0] + 0.3, mean[:, :, 1] + 0.3], 0)
        return tf.expand_dims(tf.transpose(x), 0)

    def f3():
        x = tf.concat([mean[:, :, 0] + 0.3, mean[:, :, 1] - 0.3], 0)
        return tf.expand_dims(tf.transpose(x), 0)

    command = tf.reshape(command, (1, -1))
    if PLAN_BIAS:
        mean = tf.case(
            {
                tf.reduce_all(tf.equal(command, 2)): f2,
                tf.reduce_all(tf.equal(command, 3)): f3
            },
            default=f1,
            exclusive=True)

    stddev = tf.ones((original_batch, horizon) + action_shape)

    mean, stddev, command = tf.scan(
        iteration,
        tf.range(iterations),
        (mean, stddev,
         command * tf.ones((original_batch, horizon) + action_shape, dtype=tf.float32)),
        back_prop=False)
    mean, stddev = mean[-1], stddev[-1]  # Select belief from the last iteration.
    return mean
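
The command dispatch relies on `tf.case` with mutually exclusive scalar predicates; a compact self-contained example of the same pattern:

command = tf.constant(3)
turn_bias = tf.case(
    {tf.equal(command, 2): lambda: tf.constant(0.3),
     tf.equal(command, 3): lambda: tf.constant(-0.3)},
    default=lambda: tf.constant(0.0),
    exclusive=True)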
Example #19
def cross_entropy_method(cell,
                         objective_fn,
                         state,
                         obs_shape,
                         action_shape,
                         horizon,
                         amount=1000,
                         topk=100,
                         iterations=10,
                         discount=0.99,
                         min_action=-1,
                         max_action=1,
                         discrete_action=False):
    # Embedded observation and action shapes without batch dim.
    # In Atari case `action_shape` is number of discrete actions
    obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
    # Flatten state dict to get first element and then get envs batch size
    original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
    # Tile the state batch `amount` times so each candidate starts in the same
    # (batched) env, i.e. we spawn `amount` candidates for each env in the batch
    initial_state = tools.nested.map(
        lambda tensor: tf.tile(tensor, [amount] + [1] *
                               (tensor.shape.ndims - 1)), state)
    # Again, logic to get state's batch size, but this time the extended one
    extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
    # No candidate uses observations at any sequence step (open-loop simulation)
    use_obs = tf.zeros([extended_batch, horizon, 1],
                       tf.bool)  # [batch, sequence, 1]
    obs = tf.zeros((extended_batch, horizon) + obs_shape)
    length = tf.ones([extended_batch], dtype=tf.int32) * horizon

    def iteration(mean_and_stddev, _):
        mean, stddev = mean_and_stddev
        # Sample action proposals from belief for each env in batch, candidate and horizon step
        normal = tf.random_normal((original_batch, amount, horizon) +
                                  action_shape)
        # Action shape: (envs batch size, candidates amount, horizon) + action_shape
        action = normal * stddev[:, None] + mean[:, None]
        # Reshape to extended_batch format (original_batch * amount, horizon) + action_shape
        action = tf.reshape(action, (extended_batch, horizon) + action_shape)
        if discrete_action:
            # Normalize action scores
            action = tf.nn.l2_normalize(action, axis=-1)
            # Apply greedy policy
            postproc_action = greedy(action, action_shape[0])
        else:
            # Clip action to valid range
            action = tf.clip_by_value(action, min_action, max_action)
            # Keep continuous actions
            postproc_action = action
        # Evaluate proposal actions
        (_, state), _ = tf.nn.dynamic_rnn(cell,
                                          (0 * obs, postproc_action, use_obs),
                                          initial_state=initial_state)
        reward = objective_fn(state)
        return_ = discounted_return.discounted_return(reward, length,
                                                      discount)[:, 0]
        # Reshape back to (envs batch size, candidates amount) format
        return_ = tf.reshape(return_, (original_batch, amount))
        # Indices have shape (envs batch size, topk) and those are candidates indices
        # for each env in the batch!
        _, indices = tf.nn.top_k(return_, topk, sorted=False)
        # Offset each index so it matches indices of action which has `extended_batch` first dim.
        indices += tf.range(original_batch)[:, None] * amount
        # best_actions have shape indices.shape + action.shape[1:], which is
        # (envs batch size, topk, horizon) + action_shape
        best_actions = tf.gather(action, indices)
        # Calculate new belief from best actions, shape: (envs batch size, horizon) + action_shape
        mean, variance = tf.nn.moments(best_actions, 1)
        stddev = tf.sqrt(variance + 1e-6)
        return mean, stddev

    # Initialize belief over actions (zero mean and unit variance)
    mean = tf.zeros((original_batch, horizon) + action_shape)
    stddev = tf.ones((original_batch, horizon) + action_shape)
    # Run optimisation
    mean, _ = tf.scan(iteration,
                      tf.range(iterations), (mean, stddev),
                      back_prop=False)
    # Select belief from the last iteration
    mean = mean[-1]
    # Take only first action, shape: (envs batch size,) + action_shape
    mean = mean[:, 0]
    if discrete_action:
        # Apply greedy policy
        return greedy(mean, action_shape[0])
    else:
        # Return continuous actions
        return mean
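
`greedy` is not shown in these snippets; a plausible sketch (an assumption) that one-hot-encodes the highest-scoring discrete action:

def greedy(action_scores, num_actions):
    # One-hot of the argmax over the last (action) axis.
    return tf.one_hot(tf.argmax(action_scores, axis=-1), num_actions,
                      dtype=action_scores.dtype)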
Example #20
def decoder(state, data_shape):
    """Compute the data distribution of an observation from its state."""
    kwargs2 = dict(strides=2, activation=tf.nn.relu)
    kwargs1 = dict(strides=1, activation=tf.nn.relu)
    hidden = tf.layers.dense(state, 1024, None)
    hidden = tf.reshape(hidden, [-1, 1, 1, hidden.shape[-1].value])

    if obs_size == (32, 32):
        hidden = tf.layers.conv2d_transpose(hidden, 128, 5, **kwargs1)
        hidden = tf.layers.conv2d_transpose(hidden, 64, 4, **kwargs2)
        hidden = tf.layers.conv2d_transpose(hidden, 32, 4, **kwargs1)
        hidden = tf.layers.conv2d_transpose(hidden,
                                            num_channels_x,
                                            4,
                                            strides=2)

    elif obs_size == (64, 64):
        hidden = tf.layers.conv2d_transpose(hidden, 128, 5, **kwargs2)
        hidden = tf.layers.conv2d_transpose(hidden, 64, 5, **kwargs2)
        hidden = tf.layers.conv2d_transpose(hidden, 32, 6, **kwargs2)
        hidden = tf.layers.conv2d_transpose(hidden,
                                            num_channels_x,
                                            6,
                                            strides=2)

    elif obs_size == (128, 128):
        hidden = tf.layers.conv2d_transpose(hidden, 256, 4, **kwargs1)
        hidden = tf.layers.conv2d_transpose(hidden, 256, 4, **kwargs1)
        hidden = tf.layers.conv2d_transpose(hidden, 128, 4, **kwargs1)
        hidden = tf.layers.conv2d_transpose(hidden, 128, 3, **kwargs1)
        hidden = tf.layers.conv2d_transpose(hidden, 64, 4, **kwargs2)
        hidden = tf.layers.conv2d_transpose(
            hidden, 64, 4, **kwargs1)  # ~=  pixels * stride + kernel_size
        hidden = tf.layers.conv2d_transpose(hidden, 32, 4, **kwargs2)
        hidden = tf.layers.conv2d_transpose(hidden, 32, 4, **kwargs1)
        hidden = tf.layers.conv2d_transpose(hidden,
                                            num_channels_x,
                                            4,
                                            strides=2)

    mean = hidden

    if obs_size == (32, 32):
        assert mean.shape[1:].as_list() == [32, 32, num_channels_x], mean.shape
    elif obs_size == (64, 64):
        assert mean.shape[1:].as_list() == [64, 64, num_channels_x], mean.shape
    elif obs_size == (128, 128):
        assert mean.shape[1:].as_list() == [128, 128,
                                            num_channels_x], mean.shape

    mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)
    dist = tools.MSEDistribution(mean)
    dist = tfd.Independent(dist, len(data_shape))
    return dist
Example #21
def cross_entropy_method_dual1(cell,
                               objective_fn,
                               state,
                               obs_shape,
                               action_shape,
                               horizon,
                               graph,
                               logdir,
                               task,
                               amount=1000,
                               topk=100,
                               iterations=10,
                               min_action=-1,
                               max_action=1,
                               eval_ratio=0.05,
                               env_state=None):
    obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
    original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
    initial_state = tools.nested.map(
        lambda tensor: tf.tile(tensor, [amount] + [1] *
                               (tensor.shape.ndims - 1)), state)
    extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
    use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
    obs = tf.zeros((extended_batch, horizon) + obs_shape)

    def gd_eval(state, actions, preds, logdir):
        from planet.control import wrappers
        repeat = wrappers.get_repeat(task_str=task[0])
        env_ctor = functools.partial(create_simple_env,
                                     task_str=task[0],
                                     repeat=repeat)
        evaluator = AsyncEvaluator(env_ctor, 30)
        name = 'cem_traj'
        path = os.path.join(logdir, name + '.npy')

        def evaluate(state, actions, preds):
            if evaluator.isclosed():
                print('evaluator closed, reopen now')
                evaluator.reopen(env_ctor, 10)
            promises = []
            for act in actions:
                promise = evaluator(state, act)
                promises.append(promise)
            gds = [promise() for promise in promises]
            gds = np.array(gds).astype(np.float32)
            batch = np.stack((gds, preds), 1)
            batch = np.expand_dims(batch, 0)
            if os.path.exists(path):
                prev = np.load(path)
                new = np.concatenate((prev, batch), 0)
            else:
                new = batch
            if new.shape[0] > 1000 * 10 / repeat - 300:
                print('end of task, close it')
                evaluator.close()
            print(new.shape)
            np.save(path, new)
            return batch

        all_reward = tf.py_func(evaluate,
                                inp=[state, actions, preds],
                                Tout=tf.float32)
        return all_reward

    def iteration(mean_and_stddev, _):
        mean, stddev, collect, _, _ = mean_and_stddev
        # Sample action proposals from belief.
        normal = tf.random_normal((original_batch, amount, horizon) +
                                  action_shape)
        action = normal * stddev[:, None] + mean[:, None]
        all_action = tf.clip_by_value(action, min_action, max_action)
        # Evaluate proposal actions by latent imagination
        action = tf.reshape(all_action,
                            (extended_batch, horizon) + action_shape)
        (_, state), _ = tf.nn.dynamic_rnn(cell, (0 * obs, action, use_obs),
                                          initial_state=initial_state)
        return_ = objective_fn(state)

        all_reward = tf.cond(
            collect, lambda: gd_eval(env_state, action, return_, logdir),
            lambda: tf.zeros((original_batch, amount, 2)))

        all_reward = tf.reshape(all_reward, (original_batch, amount, 2))
        with tf.control_dependencies([all_reward]):
            return_ = tf.reshape(return_, (original_batch, amount))
        # Re-fit belief to the best ones.
        _, indices = tf.nn.top_k(return_, topk, sorted=False)
        indices += tf.range(original_batch)[:, None] * amount
        best_actions = tf.gather(action, indices)
        mean, variance = tf.nn.moments(best_actions, 1)
        stddev = tf.sqrt(variance + 1e-6)
        return mean, stddev, collect, all_action, all_reward

    mean = tf.zeros((original_batch, horizon) + action_shape)
    stddev = tf.ones((original_batch, horizon) + action_shape)
    all_action = tf.zeros((original_batch, amount, horizon) + action_shape)
    all_reward = tf.zeros((original_batch, amount, 2))

    if iterations < 1:
        return mean

    collect = tf.cond(
        tf.random_uniform((), dtype=tf.float32) < eval_ratio, lambda: tf.ones(
            (), tf.bool), lambda: tf.zeros((), tf.bool))
    a = tf.print(collect)
    with tf.control_dependencies([a]):
        mean, stddev, collect, all_action, all_reward = tf.scan(
            iteration,
            tf.range(iterations),
            (mean, stddev, collect, all_action, all_reward),
            back_prop=False)
    mean, stddev = mean[-1], stddev[-1]  # Select belief from the last iteration.
    return mean, collect, all_action, all_reward
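
The `tf.py_func` call above bridges into ordinary Python inside the TF1 graph; a minimal self-contained example of the pattern (note that py_func outputs lose their static shape, which is why the result is reshaped immediately afterwards):

def add_one(x):
    return (x + 1.0).astype(np.float32)

y = tf.py_func(add_one, inp=[tf.zeros([3])], Tout=tf.float32)
y = tf.reshape(y, [3])  # restore the static shape TF cannot infer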
Example #22
def simulator_planner(cell,
                      objective_fn,
                      state,
                      obs_shape,
                      action_shape,
                      horizon,
                      graph,
                      logdir,
                      task,
                      amount=1000,
                      topk=100,
                      iterations=10,
                      min_action=-1,
                      max_action=1,
                      eval_ratio=0.05,
                      env_state=None):
    obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
    original_batch = tools.shape(tools.nested.flatten(state)[0])[0]
    initial_state = tools.nested.map(
        lambda tensor: tf.tile(tensor, [amount] + [1] *
                               (tensor.shape.ndims - 1)), state)
    extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
    use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
    obs = tf.zeros((extended_batch, horizon) + obs_shape)

    def gd_eval(state, actions):
        from planet.control import wrappers
        repeat = wrappers.get_repeat(task_str=task[0])
        env_ctor = functools.partial(create_simple_env,
                                     task_str=task[0],
                                     repeat=repeat)
        evaluator = AsyncEvaluator(env_ctor, 30)

        def evaluate(state, actions):
            if evaluator.isclosed():
                print('evaluator closed, reopen now')
                evaluator.reopen(env_ctor, 10)
            promises = []
            for act in actions:
                promise = evaluator(state, act)
                promises.append(promise)
            gds = [promise() for promise in promises]
            gds = np.stack(gds, 0).astype(np.float32)
            return gds

        op = tf.py_func(evaluate, inp=[state, actions], Tout=tf.float32)
        return op

    def iteration(mean_and_stddev, _):
        mean, stddev, collect = mean_and_stddev
        # Sample action proposals from belief.
        normal = tf.random_normal((original_batch, amount, horizon) +
                                  action_shape)
        action = normal * stddev[:, None] + mean[:, None]
        action = tf.clip_by_value(action, min_action, max_action)
        # Evaluate proposal actions in the ground-truth simulator.
        action = tf.reshape(action, (extended_batch, horizon) + action_shape)
        reward = gd_eval(env_state, action)
        return_ = tf.reduce_sum(reward, 1)
        return_ = tf.reshape(return_, (original_batch, amount))
        # Re-fit belief to the best ones.
        _, indices = tf.nn.top_k(return_, topk, sorted=False)
        indices += tf.range(original_batch)[:, None] * amount
        best_actions = tf.gather(action, indices)
        mean, variance = tf.nn.moments(best_actions, 1)
        stddev = tf.sqrt(variance + 1e-6)  # epsilon, as in the other CEM variants
        return mean, stddev, collect

    mean = tf.zeros((original_batch, horizon) + action_shape)
    stddev = tf.ones((original_batch, horizon) + action_shape)

    if iterations < 1:
        return mean
    collect = tf.cond(
        tf.random_uniform((), dtype=tf.float32) < eval_ratio, lambda: tf.ones(
            (), tf.bool), lambda: tf.zeros((), tf.bool))

    mean, stddev, _ = tf.scan(iteration,
                              tf.range(iterations), (mean, stddev, collect),
                              back_prop=False)
    mean, stddev = mean[-1], stddev[-1]  # Select belief from the last iteration.
    return mean
Example #23
def compute_objectives(posterior, prior, target, graph, config, trainer):
    raw_features = graph.cell.features_from_state(posterior)
    heads = graph.heads
    objectives = []
    summaries = []
    cstr_pct = 0.0
    for name, scale in config.loss_scales.items():
        if scale == 0.0:
            continue
        if name in config.heads and name not in config.gradient_heads:
            features = tf.stop_gradient(raw_features)
            include = r'.*/head_{}/.*'.format(name)
            exclude = None
        else:
            features = raw_features
            include = r'.*'
            exclude = None

        if name == 'divergence':
            loss = graph.cell.divergence_from_states(posterior, prior)
            if config.free_nats is not None:
                loss = tf.maximum(0.0, loss - float(config.free_nats))
            objectives.append(
                Objective('divergence', loss, min, include, exclude))

        elif name == 'overshooting':
            shape = tools.shape(graph.data['action'])
            length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
            _, priors, posteriors, mask = tools.overshooting(
                graph.cell, {}, graph.embedded, graph.data['action'], length,
                config.overshooting_distance, posterior)
            posteriors, priors, mask = tools.nested.map(
                lambda x: x[:, :, 1:-1], (posteriors, priors, mask))
            if config.os_stop_posterior_grad:
                posteriors = tools.nested.map(tf.stop_gradient, posteriors)
            loss = graph.cell.divergence_from_states(posteriors, priors)
            if config.free_nats is not None:
                loss = tf.maximum(0.0, loss - float(config.free_nats))
            objectives.append(
                Objective('overshooting', loss, min, include, exclude))

        elif name == 'reward' and config.r_loss == 'contra':
            pred = heads[name](features)
            if config.contra_unit == 'traj':
                print('Using traj loss')
                contra_loss, cstr_pct = contra_traj_lossV6(
                    pred, target[name], horizon=config.contra_horizon)
            elif config.contra_unit == 'weighted':
                print('Using weighted trajectory loss ', config.contra_horizon)
                contra_loss, cstr_pct = contra_traj_lossV7(
                    pred,
                    target[name],
                    horizon=config.contra_horizon,
                    temp=config.temp)
            elif config.contra_unit == 'simclr':
                print('Using simclr trajectory loss ', config.contra_horizon)
                contra_loss, cstr_pct = contra_traj_lossV8(
                    pred, target[name], horizon=config.contra_horizon)
            elif config.contra_unit == 'rank':
                print('Using ranking trajectory loss ', config.contra_horizon)
                contra_loss, cstr_pct = contra_traj_lossV9(
                    pred,
                    target[name],
                    horizon=config.contra_horizon,
                    margin=config.margin)

            objectives.append(
                Objective(name, contra_loss, min, include, exclude))
        elif name == 'reward' and config.r_loss == 'l2':
            pred = heads[name](features)
            l2_loss = tf.compat.v1.losses.mean_squared_error(
                target[name], pred)
            # l2_loss = tf.nn.l2_loss(pred - target[name])
            objectives.append(
                Objective(name, l2_loss, min, include, exclude))
        else:
            if not config.aug_same and config.aug:
                recon_feat = tf.concat([features, target['aug']], -1)
                print('Using recon features ', name, recon_feat)
                logprob = heads[name](recon_feat).log_prob(target[name])
                # logprob = heads[name](features).log_prob(target['ori_img'])
            else:
                logprob = heads[name](features).log_prob(target[name])
            objectives.append(Objective(name, logprob, max, include, exclude))

    objectives = [
        o._replace(value=tf.reduce_mean(o.value)) for o in objectives
    ]

    return objectives, cstr_pct
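
The `Objective` record built throughout these functions is not defined in this listing. From the call sites and the `o._replace(value=...)` step it is presumably a namedtuple along these lines (a hypothetical reconstruction, not the project's actual definition):

import collections

# Assumed shape of the Objective record: `goal` is the builtin min or max,
# and `include`/`exclude` are regexes selecting trainable variables.
Objective = collections.namedtuple(
    'Objective', 'name value goal include exclude')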
Example #24
def compute_objectives(posterior, prior, target, graph, config):
    raw_features = graph.cell.features_from_state(posterior)
    heads = graph.heads
    objectives = []
    cpc_logs = {}
    for name, scale in config.loss_scales.items():
        if scale == 0.0:
            continue
        if name in config.heads and name not in config.gradient_heads:
            features = tf.stop_gradient(raw_features)
            include = r'.*/head_{}/.*'.format(name)
            exclude = None
        else:
            features = raw_features
            include = r'.*'
            exclude = None

        if name == 'divergence':
            loss = graph.cell.divergence_from_states(posterior, prior)
            if config.free_nats is not None:
                loss = tf.maximum(0.0, loss - float(config.free_nats))
            objectives.append(
                Objective('divergence', loss, min, include, exclude))

        elif name == 'latent_prior':
            num_actions = 10
            prev_states_flattened = tools.nested.map(
                lambda x: tf.reshape(x, (-1, x.shape[-1].value)), posterior)
            prev_states = tools.nested.map(
                lambda x: tf.tile(x, multiples=(num_actions, 1)),
                prev_states_flattened)
            batch_size = prev_states['sample'].shape[0].value
            prev_action = tf.random.uniform(
                (batch_size, graph.data['action'].shape[-1].value),
                minval=-1,
                maxval=1)
            obs = tf.zeros(
                shape=[batch_size] + graph.embedded.shape[2:].as_list())
            use_obs = tf.zeros((batch_size, 1), tf.bool)
            (next_states, _), _ = graph.cell((obs, prev_action, use_obs),
                                             prev_states)
            if not config.latent_prior_marginal:
                loss = graph.cell.divergence_from_states(
                    prev_states, next_states)
            else:
                samples_next_state = tf.reshape(
                    next_states['sample'],
                    shape=(batch_size // num_actions, num_actions, -1))
                samples_next_state_mean = tf.reduce_mean(samples_next_state,
                                                         axis=1)
                samples_current_state = tf.stop_gradient(
                    prev_states_flattened['sample'])
                loss = tf.reduce_mean(
                    tf.reduce_sum(tf.square(samples_next_state_mean -
                                            samples_current_state),
                                  axis=-1))
            objectives.append(
                Objective('latent_prior', loss, min, include, exclude))

        elif name == 'embedding_l2':
            loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(graph.embedded), axis=-1))
            objectives.append(
                Objective('embedding_l2', loss, min, include, exclude))

        elif name == 'overshooting':
            shape = tools.shape(graph.data['action'])
            length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
            _, priors, posteriors, mask = tools.overshooting(
                graph.cell, {}, graph.embedded, graph.data['action'], length,
                config.overshooting_distance, posterior)
            posteriors, priors, mask = tools.nested.map(
                lambda x: x[:, :, 1:-1], (posteriors, priors, mask))
            if config.os_stop_posterior_grad:
                posteriors = tools.nested.map(tf.stop_gradient, posteriors)
            loss = graph.cell.divergence_from_states(posteriors, priors)
            if config.free_nats is not None:
                loss = tf.maximum(0.0, loss - float(config.free_nats))
            objectives.append(
                Objective('overshooting', loss, min, include, exclude))

        elif name == 'cpc':
            loss, acc, reward_loss, reward_acc, gpenalty, kernels = networks.cpc(
                features if config.include_belief else posterior['sample'],
                graph,
                posterior,
                predict_terms=config.future,
                negative_samples=config.negatives,
                hard_negative_samples=config.hard_negatives,
                stack_actions=config.stack_actions,
                negative_actions=config.negative_actions,
                cpc_openloop=config.cpc_openloop,
                gradient_penalty=config.cpc_gpenalty_scale > 0,
                gpenalty_mode=config.gpenalty_mode)
            loss += reward_loss * config.cpc_reward_scale
            loss += gpenalty * config.cpc_gpenalty_scale
            objectives.append(Objective('cpc', loss, min, include, exclude))
            cpc_logs['acc'] = acc
            cpc_logs['reward_acc'] = reward_acc
            cpc_logs['gpenalty'] = gpenalty
            if kernels:
                for i in range(config.future):
                    cpc_logs['W_mag%d' % i] = tf.reduce_mean(
                        tf.square(kernels[i]))
        elif name == 'inverse_model':
            loss, acc = networks.inverse_model(
                features,
                graph,
                contrastive=config.action_contrastive,
                negative_samples=config.negatives)
            objectives.append(
                Objective('inverse_model', loss, min, include, exclude))
            if config.action_contrastive:
                cpc_logs['inverse_model_acc'] = acc
        else:
            logprob = heads[name](features).log_prob(target[name])
            objectives.append(Objective(name, logprob, max, include, exclude))

    objectives = [
        o._replace(value=tf.reduce_mean(o.value)) for o in objectives
    ]
    return objectives, cpc_logs
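
Downstream, a trainer presumably scales each objective and turns maximization into minimization before summing. Below is a hedged sketch of that folding step; the helper name and weighting scheme are assumptions, not code from this project:

def combine_objectives(objectives, loss_scales):
    # Fold the Objective list into one scalar training loss. Terms whose
    # goal is the builtin max (e.g. log-probabilities) enter negated.
    total = 0.0
    for o in objectives:
        sign = 1.0 if o.goal is min else -1.0
        total += loss_scales[o.name] * sign * o.value
    return total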
Example #25
def cross_entropy_method(
    cell,
    objective_fn,
    state,
    info_cmd,
    obs_shape,
    action_shape,
    horizon,
    amount=1000,
    topk=100,
    iterations=10,
    discount=0.99,
    min_action=-1,
    max_action=1
):
    # info_cmd: shape (num_envs, 4) holding next_command_id,
    # goal_heading_degree, current_heading_degree, dist_to_intersection.
    obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
    original_batch = tools.shape(
        tools.nested.flatten(state)[0])[0]  # original_batch: num_envs
    initial_state = tools.nested.map(
        lambda tensor: tf.tile(tensor, [amount] + [1] *
                               (tensor.shape.ndims - 1)), state)
    extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
    use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
    obs = tf.zeros((extended_batch, horizon) + obs_shape)
    length = tf.ones([extended_batch], dtype=tf.int32) * horizon

    # info_cmd components
    info_cmd = tf.squeeze(info_cmd)  # shape (4,)
    cmd_id, goal_heading_degree, current_heading_degree, dist_to_intersection = (
        info_cmd[0], info_cmd[1], info_cmd[2], info_cmd[3])

    def iteration(mean_and_stddev, _):
        mean, stddev = mean_and_stddev
        # Sample action proposals from belief.
        normal = tf.random_normal((original_batch, amount, horizon) +
                                  action_shape)
        action = normal * stddev[:, None] + mean[:, None]
        action = tf.clip_by_value(action, min_action, max_action)
        # Evaluate proposal actions.
        action = tf.reshape(action, (extended_batch, horizon) + action_shape)
        (_, state), _ = tf.nn.dynamic_rnn(cell, (0 * obs, action, use_obs),
                                          initial_state=initial_state)

        # objectives
        objectives = objective_fn(state)
        # dict of tensors, e.g. {'reward': (1000, 12),
        # 'angular_speed_degree': (1000, 12), ...}
        reward = objectives['reward']
        angular_speed = objectives['angular_speed_degree']
        forward_speed = objectives['forward_speed'] / 10.0
        collided = objectives['collided']
        intersection_offroad = objectives['intersection_offroad']
        intersection_otherlane = objectives['intersection_otherlane']

        # Rank proposals by a discounted return plus a heading penalty. The
        # two reward variants below share the heading penalty and differ only
        # in the reward signal.
        if not PLANNING:
            # Variant 2: environment reward.
            return_ = discounted_return.discounted_return(
                reward, length, discount)[:, 0]  # shape: (1000,)
        else:
            # Variant 3: reward shaped from speed, collision, and lane terms.
            rewards = (forward_speed -
                       300.0 * tf.where(collided > 0.3, collided,
                                        tf.zeros_like(collided)) -
                       20.0 * intersection_offroad -
                       10.0 * intersection_otherlane)
            return_ = discounted_return.discounted_return(
                rewards, length, discount)[:, 0]  # shape: (1000,)
        return_ = tf.reshape(return_,
                             (original_batch, amount))  # shape: (1, 1000)

        # Heading penalty: compare the predicted final heading against the
        # goal heading; active only for turn and go-straight commands.
        threshold_degree = tf.where(dist_to_intersection < 9,
                                    9 * (9 - dist_to_intersection), 0)
        angular_turn_ = discounted_return.discounted_return(
            angular_speed, length, 1.0)[:, 0]  # shape: (1000,)
        heading_loss = -tf.abs(
            delta_degree(goal_heading_degree -
                         (current_heading_degree + angular_turn_))) * tf.case(
                             {
                                 tf.equal(cmd_id, 3): costn1,
                                 tf.equal(cmd_id, 2): costn1,
                                 tf.equal(cmd_id, 1): costn1
                             },
                             default=costn0)
        heading_loss_weighted = heading_loss * tf.where(
            heading_loss > threshold_degree - 90,
            tf.ones((amount,)) * 0.3,
            tf.ones((amount,)) * 1000.0)
        return_heading = tf.reshape(heading_loss_weighted,
                                    (original_batch, amount))
        total_return = return_ + return_heading

        # Re-fit belief to the best ones.
        _, indices = tf.nn.top_k(total_return, topk, sorted=False)
        indices += tf.range(original_batch)[:, None] * amount
        best_actions = tf.gather(action, indices)
        mean, variance = tf.nn.moments(best_actions, 1)
        stddev = tf.sqrt(variance + 1e-6)
        return mean, stddev

    # COMMAND_ORDINAL = {
    #     "REACH_GOAL": 0,
    #     "GO_STRAIGHT": 1,
    #     "TURN_RIGHT": 2,
    #     "TURN_LEFT": 3,
    #     "LANE_FOLLOW": 4}
    # compute action_bias
    f_0 = lambda: tf.constant([0.0, 0.0])  # [throttle, steer(l-, r+)]
    f_left = lambda: tf.constant([0.0, -0.5])
    f_right = lambda: tf.constant([0.0, 0.5])

    pred_func = {tf.equal(cmd_id, 3): f_left, tf.equal(cmd_id, 2): f_right}
    action_bias = tf.case(pred_func, default=f_0)


    # action_bias is computed above but currently left disabled.
    mean = tf.zeros((original_batch, horizon) + action_shape)  # + action_bias
    stddev = tf.ones((original_batch, horizon) + action_shape)

    mean, stddev = tf.scan(iteration,
                           tf.range(iterations), (mean, stddev),
                           back_prop=False)
    mean, stddev = mean[-1], stddev[-1]  # Select belief from the last iteration.
    return mean
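
Example #25 calls three helpers that never appear in this listing: `delta_degree`, `costn0`, and `costn1`. Plausible stand-ins consistent with how they are used (the real definitions may differ):

import tensorflow as tf

# Assumed from usage: constant-returning callables for tf.case branches.
costn0 = lambda: tf.constant(0.0)  # heading penalty off
costn1 = lambda: tf.constant(1.0)  # heading penalty on

def delta_degree(angle):
    # Wrap an angle difference into [-180, 180) so the heading loss
    # penalizes along the shorter turning direction.
    return tf.math.floormod(angle + 180.0, 360.0) - 180.0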
Example #26
def compute_objectives(posterior, prior, target, graph, config):
    heads = graph.heads
    objectives = []

    for name, scale in config.loss_scales.items():
        features = []

        if scale == 0.0:
            continue
        if name in config.heads and name not in config.gradient_heads:
            for mdl in range(len(posterior)):
                raw_features = graph.cell[mdl].features_from_state(
                    posterior[mdl])
                features.append(tf.stop_gradient(raw_features))
            include = r'.*/head_{}/.*'.format(name)
            exclude = None
        else:
            for mdl in range(len(posterior)):
                raw_features = graph.cell[mdl].features_from_state(
                    posterior[mdl])
                features.append(raw_features)
            include = r'.*'
            exclude = None

        if name == 'divergence':
            loss = graph.cell[0].divergence_from_states(posterior[0], prior[0])
            for mdl in range(1, len(posterior)):
                loss = tf.math.add(
                    loss, graph.cell[mdl].divergence_from_states(
                        posterior[mdl], prior[mdl]))
            loss = tf.math.scalar_mul((1 / len(posterior)), loss)
            if config.free_nats is not None:
                loss = tf.maximum(0.0, loss - float(config.free_nats))
            objectives.append(
                Objective('divergence', loss, min, include, exclude))

        elif name == 'overshooting':
            # Overshooting has not been adapted to the model ensemble yet;
            # fail fast if it is ever enabled (the code below is unreachable).
            assert name != 'overshooting'
            shape = tools.shape(graph.data['action'])
            length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
            _, priors, posteriors, mask = tools.overshooting(
                graph.cell[mdl], {}, graph.embedded[mdl], graph.data['action'],
                length, config.overshooting_distance, posterior)
            posteriors, priors, mask = tools.nested.map(
                lambda x: x[:, :, 1:-1], (posteriors, priors, mask))
            if config.os_stop_posterior_grad:
                posteriors = tools.nested.map(tf.stop_gradient, posteriors)
            loss = graph.cell[mdl].divergence_from_states(posteriors, priors)
            if config.free_nats is not None:
                loss = tf.maximum(0.0, loss - float(config.free_nats))
            objectives.append(
                Objective('overshooting', loss, min, include, exclude))

        else:
            bootstrap_target = tf.gather(target[name],
                                         graph.sample_with_replacement[0, :],
                                         axis=0)
            logprob = heads[name](features[0]).log_prob(bootstrap_target)
            for mdl in range(1, len(posterior)):
                bootstrap_target = tf.gather(
                    target[name],
                    graph.sample_with_replacement[mdl, :],
                    axis=0)
                logprob = tf.math.add(
                    logprob,
                    heads[name](features[mdl]).log_prob(bootstrap_target))
            logprob = tf.math.scalar_mul((1 / len(posterior)), logprob)
            objectives.append(Objective(name, logprob, max, include, exclude))
    objectives = [
        o._replace(value=tf.reduce_mean(o.value)) for o in objectives
    ]
    return objectives
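
The bootstrap indices in `graph.sample_with_replacement` are built outside this function. A minimal sketch of how such indices could be produced, with illustrative shapes; the variable name mirrors the usage above but the construction is an assumption:

import tensorflow as tf

num_models, batch_size = 5, 32  # illustrative values only
# One independent with-replacement resampling of the batch per ensemble
# member, so each head trains on its own bootstrapped targets.
sample_with_replacement = tf.random.uniform(
    (num_models, batch_size), maxval=batch_size, dtype=tf.int32)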