Example #1
def imagine_forward(
    initial_state, distance, graph, config, policy,
    stop_grad_post_action=True, stop_grad_pre_action=True, return_actions=False):
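  # Imagine a latent rollout of `distance` steps under `policy`, starting from `initial_state`.
  # Observations are dummy zeros and use_obs is False, so the cell only rolls out its prior dynamics.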
  extended_batch = np.prod(tools.shape(
      tools.nested.flatten(initial_state)[0])[:2])
  obs = tf.zeros([extended_batch] + list(graph.embedded.shape[2:]))
  use_obs = tf.zeros([extended_batch, 1], tf.bool)
  new_shape = lambda t: [
      tf.reduce_prod(tools.shape(t)[:2])] + tools.shape(t)[2:]
  initial_state = tools.nested.map(
      lambda tensor: tf.reshape(tensor, new_shape(tensor)),
      initial_state)
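  # One imagination step: featurize the previous state, sample an action from the policy,
  # and advance the latent state with the cell.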
  def step_fn(state_action, index):
    prev = state_action[0]
    feature = graph.cell.features_from_state(prev)
    if stop_grad_pre_action:
      feature = tf.stop_gradient(feature)
    action = policy(feature).sample()
    if stop_grad_post_action:
      action = tf.stop_gradient(action)
    (_, state), _ = graph.cell((obs, action, use_obs), prev)
    return [state, action]

  # The dummy action only seeds the shape and dtype of tf.scan's action accumulator;
  # real actions are sampled inside step_fn.
  action_shape = graph.data['action'].shape
  dummy_action = tf.zeros(
      [int(action_shape[0] * action_shape[1]), action_shape[2]],
      dtype=tf.float32)

  # Unroll the imagination for `distance` steps; tf.scan stacks time as the leading axis.
  res = tf.scan(
      step_fn, tf.range(distance), [initial_state, dummy_action],
      back_prop=True)
  states, actions = res[0], res[1]
  # Move the time axis back behind the (flattened) batch axis.
  states = tools.nested.map(lambda x: tf.transpose(x, [1, 0, 2]), states)
  actions = tools.nested.map(lambda x: tf.transpose(x, [1, 0, 2]), actions)
  if return_actions:
    return states, actions
  else:
    return states
Example #2
def one_step_model(state,
                   prev_action,
                   data_shape,
                   model_width_factor,
                   max_objective=False,
                   dist='deterministic'):
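    # One ensemble member: a small MLP that predicts the target features from (state, action).
    # Inputs are gradient-stopped so this model's loss does not shape the world-model representation.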

    num_layers = 2
    activation = tf.nn.relu
    units = data_shape[0] * model_width_factor
    state = tf.stop_gradient(state)
    prev_action = tf.stop_gradient(prev_action)
    inputs = tf.concat([state, prev_action], -1)
    for _ in range(num_layers):
        hidden = tf.layers.dense(inputs, units, activation)
        inputs = tf.concat([hidden, prev_action], -1)

    mean = tf.layers.dense(inputs, int(np.prod(data_shape)), None)
    mean = tf.reshape(mean, tools.shape(state)[:-1] + data_shape)

    if max_objective:
        min_std = 1e-2
        init_std = 1.0
        std = tf.layers.dense(inputs, int(np.prod(data_shape)), None)
        init_std = np.log(np.exp(init_std) - 1)
        std = tf.nn.softplus(std + init_std) + min_std
        std = tf.reshape(std, tools.shape(state)[:-1] + data_shape)
        dist = tfd.Normal(mean, std)
        dist = tfd.Independent(dist, len(data_shape))
    else:
        dist = tfd.Deterministic(mean)
        dist = tfd.Independent(dist, len(data_shape))

    return dist
Example #3
def feed_forward(
    features, data_shape, num_layers=2, activation=tf.nn.relu,
    mean_activation=None, stop_gradient=False, trainable=True, units=100,
    std=1.0, low=-1.0, high=1.0, dist='normal', min_std=1e-2, init_std=1.0):
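  # Generic MLP head: maps features to the parameters of one of several output distributions.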
  hidden = features
  if stop_gradient:
    hidden = tf.stop_gradient(hidden)
  for _ in range(num_layers):
    hidden = tf.layers.dense(hidden, units, activation, trainable=trainable)
  mean = tf.layers.dense(
      hidden, int(np.prod(data_shape)), mean_activation, trainable=trainable)
  mean = tf.reshape(mean, tools.shape(features)[:-1] + data_shape)
  if std == 'learned':
    std = tf.layers.dense(
        hidden, int(np.prod(data_shape)), None, trainable=trainable)
    init_std = np.log(np.exp(init_std) - 1)
    std = tf.nn.softplus(std + init_std) + min_std
    std = tf.reshape(std, tools.shape(features)[:-1] + data_shape)
  if dist == 'normal':
    dist = tfd.Normal(mean, std)
    dist = tfd.Independent(dist, len(data_shape))
  elif dist == 'deterministic':
    dist = tfd.Deterministic(mean)
    dist = tfd.Independent(dist, len(data_shape))
  elif dist == 'binary':
    dist = tfd.Bernoulli(mean)
    dist = tfd.Independent(dist, len(data_shape))
  elif dist == 'trunc_normal':
    # https://www.desmos.com/calculator/rnksmhtgui
    dist = tfd.TruncatedNormal(mean, std, low, high)
    dist = tfd.Independent(dist, len(data_shape))
  elif dist == 'tanh_normal':
    # https://www.desmos.com/calculator/794s8kf0es
    dist = distributions.TanhNormal(mean, std)
  elif dist == 'tanh_normal_tanh':
    # https://www.desmos.com/calculator/794s8kf0es
    mean = 5.0 * tf.tanh(mean / 5.0)
    dist = distributions.TanhNormal(mean, std)
  elif dist == 'onehot_score':
    dist = distributions.OneHot(mean, gradient='score')
  elif dist == 'onehot_straight':
    dist = distributions.OneHot(mean, gradient='straight')
  else:
    raise NotImplementedError(dist)
  return dist
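A minimal usage sketch for feed_forward; this is a hedged illustration, with the shapes and the names features, reward_dist, reward_mode, and log_prob made up for the example, and tf, np, tools, and tfd assumed to be imported as in the snippets above:

features = tf.zeros([8, 10, 230])  # e.g. features from cell.features_from_state
# Scalar head (empty data_shape) with a learned standard deviation.
reward_dist = feed_forward(features, data_shape=[], dist='normal', std='learned')
reward_mode = reward_dist.mode()                    # shape [8, 10]
log_prob = reward_dist.log_prob(tf.zeros([8, 10]))  # shape [8, 10]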
Example #4
def decoder(features, data_shape, std=1.0):
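  # Transposed-convolution image decoder: the stride-2 stack maps a 1x1 spatial input to
  # 64x64, so data_shape is expected to be [64, 64, channels].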
  kwargs = dict(strides=2, activation=tf.nn.relu)
  hidden = tf.layers.dense(features, 1024, None)
  hidden = tf.reshape(hidden, [-1, 1, 1, hidden.shape[-1].value])
  hidden = tf.layers.conv2d_transpose(hidden, 128, 5, **kwargs)
  hidden = tf.layers.conv2d_transpose(hidden, 64, 5, **kwargs)
  hidden = tf.layers.conv2d_transpose(hidden, 32, 6, **kwargs)
  mean = tf.layers.conv2d_transpose(hidden, data_shape[-1], 6, strides=2)
  assert mean.shape[1:].as_list() == data_shape, mean.shape
  mean = tf.reshape(mean, tools.shape(features)[:-1] + data_shape)
  return tfd.Independent(tfd.Normal(mean, std), len(data_shape))
Example #5
def cross_entropy_method(
    cell, objective, state, obs_shape, action_shape, horizon, graph,
    beams=1000, topk=100, iterations=10, min_action=-1, max_action=1):
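  # Cross-entropy method planner: repeatedly sample action sequences from a Gaussian belief,
  # evaluate them with the learned dynamics, and refit the belief to the top-k returns.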
  obs_shape, action_shape = tuple(obs_shape), tuple(action_shape)
  batch = tools.shape(tools.nested.flatten(state)[0])[0]
  initial_state = tools.nested.map(lambda tensor: tf.tile(
      tensor, [beams] + [1] * (tensor.shape.ndims - 1)), state)
  extended_batch = tools.shape(tools.nested.flatten(initial_state)[0])[0]
  use_obs = tf.zeros([extended_batch, horizon, 1], tf.bool)
  obs = tf.zeros((extended_batch, horizon) + obs_shape)

  def iteration(index, mean, stddev):
    # Sample action proposals from belief.
    normal = tf.random_normal((batch, beams, horizon) + action_shape)
    action = normal * stddev[:, None] + mean[:, None]
    action = tf.clip_by_value(action, min_action, max_action)
    # Evaluate proposal actions.
    action = tf.reshape(
        action, (extended_batch, horizon) + action_shape)
    (_, state), _ = tf.nn.dynamic_rnn(
        cell, (0 * obs, action, use_obs), initial_state=initial_state)
    return_ = objective(state)
    return_ = tf.reshape(return_, (batch, beams))
    # Re-fit belief to the best ones.
    _, indices = tf.nn.top_k(return_, topk, sorted=False)
    indices += tf.range(batch)[:, None] * beams
    best_actions = tf.gather(action, indices)
    mean, variance = tf.nn.moments(best_actions, 1)
    stddev = tf.sqrt(variance + 1e-6)
    return index + 1, mean, stddev

  mean = tf.zeros((batch, horizon) + action_shape)
  stddev = tf.ones((batch, horizon) + action_shape)
  _, mean, stddev = tf.while_loop(
      lambda index, mean, stddev: index < iterations, iteration,
      (0, mean, stddev), back_prop=False)
  return mean
Example #6
def encoder(obs, encoder_feature_shape):
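  # Convolutional encoder: the width of the last layer is chosen so the flattened
  # embedding matches encoder_feature_shape (512 or 1024 for 64x64 inputs).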

  sh = 128 if encoder_feature_shape == 512 else 256
  kwargs = dict(strides=2, activation=tf.nn.relu)
  hidden = tf.reshape(obs['image'], [-1] + obs['image'].shape[2:].as_list())
  hidden = tf.layers.conv2d(hidden, 32, 4, **kwargs)
  hidden = tf.layers.conv2d(hidden, 64, 4, **kwargs)
  hidden = tf.layers.conv2d(hidden, 128, 4, **kwargs)
  hidden = tf.layers.conv2d(hidden, sh, 4, **kwargs)
  hidden = tf.layers.flatten(hidden)
  expected = [512] if encoder_feature_shape == 512 else [1024]
  assert hidden.shape[1:].as_list() == expected, hidden.shape.as_list()
  hidden = tf.reshape(hidden, tools.shape(obs['image'])[:2] + [
      np.prod(hidden.shape[1:].as_list())])
  return hidden
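A hedged round-trip sketch combining the decoder (Example #4) and encoder (Example #6) above; the batch, sequence, and 64x64x3 image sizes are illustrative and the variable names are made up for this example:

obs = {'image': tf.zeros([4, 10, 64, 64, 3])}
embedded = encoder(obs, encoder_feature_shape=1024)  # shape [4, 10, 1024]
recon = decoder(embedded, data_shape=[64, 64, 3])    # Independent Normal over images
image_mean = recon.mean()                            # shape [4, 10, 64, 64, 3]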
Example #7
def action_head_policy(
    cell, objective, state, obs_shape, action_shape, graph, config, strategy,
    min_action=-1, max_action=1):
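  # Pick the next action from the requested policy head (or at random), then add a
  # time axis so the result can be used as a length-1 plan.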
  features = cell.features_from_state(state)
  policy = graph.heads.action(features)
  if strategy == 'sample':
    action = policy.sample()
  elif strategy == 'mode':
    action = policy.mode()
  elif strategy == 'curious_sample':
    curious_policy = graph.heads.curious_action(features)
    action = curious_policy.sample()
  elif strategy == 'random_sample':
    batch = tools.shape(tools.nested.flatten(features)[0])[0]
    mean = tf.zeros((batch, action_shape[0]))
    stddev = tf.ones((batch, action_shape[0]))
    normal = tf.random_normal((batch, action_shape[0]))
    action = normal * stddev + mean
    action = tf.clip_by_value(action, min_action, max_action)
  else:
    raise NotImplementedError(strategy)
  plan = action[:, None, :]
  return plan
Example #8
def compute_objectives(posterior, prior, target, graph, config):
  raw_features = graph.cell.features_from_state(posterior)
  heads = graph.heads
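  # Optional bootstrap: draw per-model index samples so each ensemble member trains on a
  # resampled batch.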
  sample_with_replacement = None
  if (config.curious_run or config.combination_run) and config.bootstrap:
    bagging_size = int(config.batch_shape[0])
    sample_with_replacement = tf.random.uniform(
        [config.num_models, bagging_size], minval=0,
        maxval=config.batch_shape[0], dtype=tf.dtypes.int32)

  if config.imagination_horizon:
    imagination_start = posterior
    if config.imagination_skip_last:
      imagination_start = tools.nested.map(
          lambda x: x[:, :-config.imagination_skip_last], imagination_start)
    if config.curious_run or config.vanilla_curious_run:
      curious_raw_states, curious_actions = imagine_forward(
          imagination_start, config.exploration_imagination_horizon, graph,
          config, graph.heads.curious_action, stop_grad_post_action=False,
          stop_grad_pre_action=config.stop_grad_pre_action,
          return_actions=True)
    raw_states, raw_actions = imagine_forward(
        imagination_start, config.imagination_horizon, graph, config,
        graph.heads.action, stop_grad_post_action=False,
        stop_grad_pre_action=config.stop_grad_pre_action, return_actions=True)
  else:
    raw_states = None
  objectives = []
  for name, scale in sorted(config.loss_scales.items(), key=lambda x: x[0]):
    if config.loss_scales[name] == 0.0:
      continue
    if name in config.heads and name not in config.gradient_heads:
      features = tf.stop_gradient(raw_features)
      include = r'.*/head_{}/.*'.format(name)
      exclude = None
    else:
      features = raw_features
      include = None
      exclude = None

    if name == 'divergence':
      loss = graph.cell.divergence_from_states(posterior, prior)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(Objective('divergence', loss, min, include, exclude))

    elif name == 'cpc':
      pred = heads.cpc(graph.embedded)
      objective = compute_cpc_loss(pred, features, config)
      objectives.append(Objective('cpc', objective, max, include, exclude))

    elif name == 'overshooting':
      shape = tools.shape(graph.data['action'])
      length = tf.tile(tf.constant(shape[1])[None], [shape[0]])
      _, priors, posteriors, mask = tools.overshooting(
          graph.cell, {}, graph.embedded, graph.data['action'], length,
          config.overshooting_distance, posterior)
      posteriors, priors, mask = tools.nested.map(
          lambda x: x[:, :, 1:-1], (posteriors, priors, mask))
      if config.os_stop_posterior_grad:
        posteriors = tools.nested.map(tf.stop_gradient, posteriors)
      loss = graph.cell.divergence_from_states(posteriors, priors)
      if config.free_nats is not None:
        loss = tf.maximum(0.0, loss - float(config.free_nats))
      objectives.append(Objective('overshooting', loss, min, include, exclude))

    elif name == 'value':
      if config.value_source == 'dataset':
        loss = compute_value_loss(
            config, graph, priors, features, target['reward'])
      elif config.value_source == 'model':
        if 'action_target' in graph.heads or not config.imagination_horizon:
          if 'action_target' in graph.heads:
            policy = graph.heads.action_target
          else:
            policy = graph.heads.action
          states = imagine_forward(
              posterior, config.value_model_horizon, graph, config, policy)
        else:
          states = raw_states

        feat = graph.cell.features_from_state(states)

        if config.combination_run:
          loss = compute_curious_value_loss(
              config, graph, states, feat, None, actions=raw_actions)
        else:
          loss = compute_value_loss(config, graph, states, feat, None)
      else:
        raise NotImplementedError(config.value_source)
      objectives.append(Objective('value', loss, min, include, exclude))

    elif name == 'curious_value':
      states = curious_raw_states
      feat = graph.cell.features_from_state(states)
      loss = compute_curious_value_loss(
          config, graph, states, feat, None, actions=curious_actions)
      objectives.append(Objective('curious_value', loss, min, include, exclude))

    elif name == 'action':
      if config.action_source == 'model':
        if not config.imagination_horizon:
          states = imagine_forward(
              posterior, config.action_model_horizon, graph, config,
              policy=graph.heads.action, stop_grad_post_action=False)
        else:
          states = raw_states
        feat = graph.cell.features_from_state(states)
        if config.combination_run:
          objective = compute_curious_action_values(
              config, graph, states, feat, actions=raw_actions)
        else:
          objective = compute_action_values(config, graph, states, feat)

        objectives.append(Objective(
            'action', objective, max, include, exclude))
      elif config.action_source == 'dataset':
        objective = heads.action(features).log_prob(target[name])
        objective -= compute_action_divergence(features, graph, config)
        objectives.append(Objective(
            'action', objective, max, include, exclude))
      else:
        raise NotImplementedError(config.action_source)

    elif name == 'curious_action':
      states = curious_raw_states
      feat = graph.cell.features_from_state(states)
      objective = compute_curious_action_values(
          config, graph, states, feat, actions=curious_actions)
      objectives.append(Objective(
          'curious_action', objective, max, include, exclude))

    elif name == 'reward':
      freeze_runs = (config.curious_run or config.random_run or
                     config.vanilla_curious_run)
      if freeze_runs and config.freeze_extrinsic_heads:
        if config.adaptation:
          # During training before adaptation_step, keep the reward loss from
          # shaping the shared features.
          features = tf.cond(
              tf.logical_and(
                  tf.equal(graph.phase, 'train'),
                  graph.global_step < tf.cast(config.adaptation_step, tf.int64)),
              lambda: tf.stop_gradient(features),
              lambda: features)
        else:
          features = tf.stop_gradient(features)

      if config.combination_run:
        intrinsic_target = compute_intrinsic_reward(
            config, graph, posterior, target['action'], features)
        final_target = (
            tf.math.scalar_mul(config.extrinsic_coeff, target[name]) +
            tf.math.scalar_mul(config.intrinsic_coeff, intrinsic_target))
      else:
        final_target = target[name]

      reward_mask = tf.squeeze(target['reward_mask'], [-1])
      logprob = heads.reward(features).log_prob(final_target) * reward_mask
      objectives.append(Objective('reward', logprob, max, include, exclude))

    elif name == 'pcont' and config.pcont_label_weight:
      terminal = tf.cast(tf.less(target[name], 0.5), tf.float32)
      logprob = heads[name](features).log_prob(target[name])
      logprob *= 1 + terminal * (config.pcont_label_weight - 1)
      objectives.append(Objective(name, logprob, max, include, exclude))

    elif 'one_step_model' in name:
      mdl = int(name[-1])
      model_types = {
          'modeltype_1': modeltype_1, 'modeltype_2': modeltype_2,
          'modeltype_3': modeltype_3, 'modeltype_4': modeltype_4}
      action, target_prediction, input_state = model_types[
          'modeltype_' + str(config.ensemble_model_type)](
              config, graph, target, sample_with_replacement, prior,
              posterior, mdl)
      prediction = graph.one_step_models[mdl](input_state, action).mean()
      loss = tf.reduce_mean(
          (prediction - tf.stop_gradient(target_prediction)) ** 2, -1)
      loss *= config.ensemble_loss_scale
      objectives.append(Objective(
          'one_step_model_' + str(mdl), loss, min, include, exclude))

    else:
      if name == 'reward_int':
        # Intrinsic target: the gradient-stopped negative ELBO of the world model,
        # scaled by 1e-3.
        reconstruction_loss = heads['image'](features).log_prob(target['image'])
        full_model_loss = reconstruction_loss - tf.maximum(
            0.0, graph.cell.divergence_from_states(posterior, prior) - 3.0)
        intrinsic_target = tf.stop_gradient(-full_model_loss) * 1e-3
        logprob = heads[name](features).log_prob(intrinsic_target)
      else:
        logprob = heads[name](features).log_prob(target[name])
      objectives.append(Objective(name, logprob, max, include, exclude))

  objectives = [o._replace(value=tf.reduce_mean(o.value)) for o in objectives]
  return objectives
Example #9
def define_model(logdir, metrics, data, trainer, config):
    print('Build TensorFlow compute graph.')
    dependencies = []
    cleanups = []
    step = trainer.step
    global_step = trainer.global_step
    phase = trainer.phase
    timestamp = tf.py_func(
        lambda: datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%S'), [],
        tf.string)
    dependencies.append(
        metrics.set_tags(global_step=global_step,
                         step=step,
                         phase=phase,
                         time=timestamp))

    # Instantiate network blocks. Note, this initialization would be expensive
    # when using tf.function since it would run at every step.
    try:
        cell = config.cell()
    except TypeError:
        cell = config.cell(action_size=data['action'].shape[-1].value)

    one_step_models = []
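    # Templates for the one-step ensemble models are created below when
    # config.curious_run or config.combination_run is set.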

    kwargs = dict(create_scope_now_=True)
    kwargs['encoder_feature_shape'] = config.encoder_feature_shape
    encoder = tf.make_template('encoder', config.encoder, **kwargs)
    heads = tools.AttrDict(_unlocked=True)
    raw_dummy_features = cell.features_from_state(
        cell.zero_state(1, tf.float32))[:, None]
    for key, head in config.heads.items():
        name = 'head_{}'.format(key)
        kwargs = dict(create_scope_now_=True)
        if key in data:
            kwargs['data_shape'] = data[key].shape[2:].as_list()
        if key == 'reward_int':
            kwargs['data_shape'] = data['reward'].shape[2:].as_list()
        if key == 'action_target':
            kwargs['data_shape'] = data['action'].shape[2:].as_list()
        if key == 'curious_action':
            kwargs['data_shape'] = data['action'].shape[2:].as_list()
        if key == 'cpc':
            kwargs['data_shape'] = [cell.feature_size]
            dummy_features = encoder(data)[:1, :1]
        else:
            dummy_features = raw_dummy_features
        heads[key] = tf.make_template(name, head, **kwargs)
        heads[key](dummy_features)  # Initialize weights.

    if config.curious_run or config.combination_run:
        for mdl in range(config.num_models):
            with tf.variable_scope('one_step_model_' + str(mdl)):
                name = 'one_step_model_' + str(mdl)
                kwargs = dict(create_scope_now_=True)
                kwargs['max_objective'] = config.use_max_objective
                if config.ensemble_model_type == 1:
                    kwargs['data_shape'] = [config.encoder_feature_shape]
                elif config.ensemble_model_type == 2:
                    kwargs['data_shape'] = [
                        tools.shape(cell.zero_state(1,
                                                    tf.float32)['belief'])[-1]
                    ]
                elif config.ensemble_model_type == 3:
                    kwargs['data_shape'] = [
                        tools.shape(cell.zero_state(1,
                                                    tf.float32)['sample'])[-1]
                    ]
                elif config.ensemble_model_type == 4:
                    kwargs['data_shape'] = [tools.shape(dummy_features)[-1]]
                kwargs['model_width_factor'] = config.model_width_factor
                one_step_models.append(
                    tf.make_template(name, config.one_step_model, **kwargs))

    # Update target networks.
    if 'value_target' in heads:
        dependencies.append(
            tools.track_network(trainer, config.batch_shape[0],
                                r'.*/head_value/.*',
                                r'.*/head_value_target/.*',
                                config.value_target_period,
                                config.value_target_update))
    if 'value_target_2' in heads:
        dependencies.append(
            tools.track_network(trainer, config.batch_shape[0],
                                r'.*/head_value/.*',
                                r'.*/head_value_target_2/.*',
                                config.value_target_period,
                                config.value_target_update))
    if 'action_target' in heads:
        dependencies.append(
            tools.track_network(trainer, config.batch_shape[0],
                                r'.*/head_action/.*',
                                r'.*/head_action_target/.*',
                                config.action_target_period,
                                config.action_target_update))

    # Apply and optimize model.
    embedded = encoder(data)
    with tf.control_dependencies(dependencies):
        embedded = tf.identity(embedded)
    graph = tools.AttrDict(locals())
    prior, posterior = tools.unroll.closed_loop(cell, embedded, data['action'],
                                                config.debug)
    objectives = utility.compute_objectives(posterior, prior, data, graph,
                                            config)
    summaries, grad_norms = utility.apply_optimizers(objectives, trainer,
                                                     config)
    dependencies += summaries

    # Active data collection.
    with tf.variable_scope('collection'):
        with tf.control_dependencies(
                dependencies):  # Make sure to train first.
            for name, params in config.train_collects.items():
                schedule = tools.schedule.binary(step, config.batch_shape[0],
                                                 params.steps_after,
                                                 params.steps_every,
                                                 params.steps_until)
                summary, _ = tf.cond(tf.logical_and(
                    tf.equal(trainer.phase, 'train'), schedule),
                                     functools.partial(utility.simulate,
                                                       metrics,
                                                       config,
                                                       params,
                                                       graph,
                                                       cleanups,
                                                       gif_summary=False,
                                                       name=name),
                                     lambda:
                                     (tf.constant(''), tf.constant(0.0)),
                                     name='should_collect_' + name)
                summaries.append(summary)
                dependencies.append(summary)

    # Compute summaries.
    graph = tools.AttrDict(locals())
    summary, score = tf.cond(
        trainer.log,
        lambda: define_summaries.define_summaries(graph, config, cleanups),
        lambda: (tf.constant(''), tf.zeros((0, ), tf.float32)),
        name='summaries')
    summaries = tf.summary.merge([summaries, summary])
    dependencies.append(
        utility.print_metrics({ob.name: ob.value
                               for ob in objectives}, step,
                              config.print_metrics_every, 2, 'objectives'))
    dependencies.append(
        utility.print_metrics(grad_norms, step, config.print_metrics_every, 2,
                              'grad_norms'))
    dependencies.append(tf.cond(trainer.log, metrics.flush, tf.no_op))
    with tf.control_dependencies(dependencies):
        score = tf.identity(score)

    return score, summaries, cleanups