Example #1
def main():
    config = tf.ConfigProto()
    #  config.gpu_options.allow_growth = True
    #  config.log_device_placement = True
    with tf.Session(config=config) as sess:
        agent = DeepDeterministicPolicyGradientAgent(env=env)

        # setup saver util and either load latest ckpt or init variables
        saver_util = None
        if opts.ckpt_dir is not None:
            saver_util = util.SaverUtil(sess, opts.ckpt_dir, opts.ckpt_freq)
        else:
            sess.run(tf.global_variables_initializer())

        for v in tf.global_variables():
            print(v.name, util.shape_and_product_of(v), file=sys.stderr)

        # now that we've either init'd from scratch, or loaded up a checkpoint,
        # we can do any required post init work.
        agent.post_var_init_setup()

        #opts.num_eval = 100
        # run either eval or training
        if opts.num_eval > 0:
            agent.run_eval(opts.num_eval, opts.eval_action_noise)
        else:
            agent.run_training(opts.max_num_actions, opts.max_run_time,
                               opts.batch_size, opts.batches_per_step,
                               saver_util)
            if saver_util is not None:
                saver_util.force_save()

        env.reset()  # just to flush logging, clumsy :/
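
The util.SaverUtil helper used above (and in the later examples) isn't shown. Below is a minimal sketch of the load-latest-checkpoint-or-initialise pattern it appears to wrap, assuming it is built on tf.train.Saver; the class name, methods and behaviour here are illustrative assumptions, not the repo's actual code.

import os
import tensorflow as tf

class CheckpointHelper(object):
    """Hypothetical stand-in for util.SaverUtil: restore the latest checkpoint,
    or initialise all variables if there is nothing to restore."""

    def __init__(self, sess, ckpt_dir, save_freq):
        self.sess, self.ckpt_dir, self.save_freq = sess, ckpt_dir, save_freq
        self.saver = tf.train.Saver()
        latest = tf.train.latest_checkpoint(ckpt_dir)
        if latest is not None:
            self.saver.restore(sess, latest)                  # resume from newest ckpt
        else:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)
            sess.run(tf.global_variables_initializer())       # start from scratch

    def force_save(self):
        self.saver.save(self.sess, os.path.join(self.ckpt_dir, "ckpt"))
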
Example #2
def main():
  env = bullet_cartpole.BulletCartpole(gui=opts.gui, action_force=opts.action_force,
                                       max_episode_len=opts.max_episode_len,
                                       initial_force=opts.initial_force, delay=opts.delay,
                                       discrete_actions=False, event_log_file=opts.event_log)

  with tf.Session() as sess:  # config=tf.ConfigProto(log_device_placement=True)
    agent = DeepDeterministicPolicyGradientAgent(env=env, agent_opts=opts)

    # setup saver util and either load latest ckpt, or init if none...
    saver_util = None
    ckpt_dir = None
    if opts.run_id is not None:
      ckpt_dir = "ckpts/%s" % opts.run_id
    elif opts.ckpt_dir is not None:
      ckpt_dir = opts.ckpt_dir
    if ckpt_dir is not None:
      saver_util = util.SaverUtil(sess, ckpt_dir, opts.ckpt_freq)
    else:
      sess.run(tf.initialize_all_variables())

    # now that we've either init'd from scratch, or loaded up a checkpoint,
    # we can hook together target networks
    agent.hook_up_target_networks(opts.target_update_rate)

    # run either eval or training
    if opts.num_eval > 0:
      agent.run_eval(opts.num_eval)
    else:
      agent.run_training(opts.max_num_actions, opts.batch_size, saver_util, opts.run_id)
      if saver_util is not None:
        saver_util.force_save()
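
hook_up_target_networks(opts.target_update_rate) is called here but not defined; a DDPG-style agent keeps a slowly moving target copy of each network and nudges it towards the online weights every step. The sketch below shows the kind of soft-update (Polyak averaging) ops such a hookup might build; the scope names and pairing-by-name convention are assumptions, not the agent's actual implementation.

import tensorflow as tf

def soft_update_ops(online_scope, target_scope, rate):
    # pair online/target variables by name and build assign ops implementing
    #   target <- (1 - rate) * target + rate * online
    online = sorted([v for v in tf.trainable_variables()
                     if v.name.startswith(online_scope)], key=lambda v: v.name)
    target = sorted([v for v in tf.trainable_variables()
                     if v.name.startswith(target_scope)], key=lambda v: v.name)
    assert len(online) == len(target), "scopes must contain matching variables"
    updates = [t.assign((1.0 - rate) * t + rate * o)
               for o, t in zip(online, target)]
    return tf.group(*updates)

# built once at setup time, e.g.
#   update_targets = soft_update_ops("actor", "target_actor", opts.target_update_rate)
# then sess.run(update_targets) after each gradient step.
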
Example #3
def main():
    env = bullet_cartpole.BulletCartpole(gui=opts.gui,
                                         action_force=opts.action_force,
                                         max_episode_len=opts.max_episode_len,
                                         initial_force=opts.initial_force,
                                         delay=opts.delay,
                                         discrete_actions=True)

    with tf.Session() as sess:
        agent = LikelihoodRatioPolicyGradientAgent(
            env=env,
            gui=opts.gui,
            hidden_dim=opts.num_hidden,
            optimiser=tf.train.AdamOptimizer())

        # setup saver util; will load latest ckpt, or init if none...
        saver_util = None
        ckpt_dir = None
        if opts.run_id is not None:
            ckpt_dir = "ckpts/%s" % opts.run_id
        elif opts.ckpt_dir is not None:
            ckpt_dir = opts.ckpt_dir
        if ckpt_dir is not None:
            saver_util = util.SaverUtil(sess, ckpt_dir, opts.ckpt_freq)
        else:
            sess.run(tf.initialize_all_variables())

        # run either eval or training
        if opts.num_eval > 0:
            agent.run_eval(opts.num_eval)
        else:
            agent.run_training(opts.num_train_batches, opts.rollouts_per_batch,
                               saver_util)
            if saver_util is not None:
                saver_util.force_save()
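
The LikelihoodRatioPolicyGradientAgent isn't expanded here. The likelihood-ratio (REINFORCE) estimator it is named after weights the log-probability of each sampled action by the return of its rollout; minimising the surrogate loss below gives that gradient. All shapes, layer sizes and placeholder names are illustrative assumptions, not the agent's actual network.

import tensorflow as tf

obs_dim, num_actions = 4, 2   # placeholder sizes; the real env dictates these

observations = tf.placeholder(tf.float32, [None, obs_dim])    # states visited
actions = tf.placeholder(tf.int32, [None])                    # actions actually taken
returns = tf.placeholder(tf.float32, [None])                  # per-rollout returns

hidden = tf.layers.dense(observations, 32, activation=tf.nn.tanh)
logits = tf.layers.dense(hidden, num_actions)

# sparse softmax cross entropy is exactly -log pi(a|s) for the taken action;
# weighting it by the return and minimising yields the likelihood-ratio gradient
neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions,
                                                              logits=logits)
loss = tf.reduce_mean(neg_log_prob * returns)
train_op = tf.train.AdamOptimizer().minimize(loss)
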
Example #4
    def __init__(self, opts):
        self.opts = opts

        config = tf.ConfigProto()
        #config.gpu_options.allow_growth = True
        #config.log_device_placement = True
        config.gpu_options.per_process_gpu_memory_fraction = 0.5  #opts.gpu_mem_fraction
        self.sess = tf.Session(config=config)

        render_shape = (opts.height, opts.width, 3)
        self.replay_memory = replay_memory.ReplayMemory(
            opts=opts, state_shape=render_shape, action_dim=2, load_factor=1.2)
        if opts.event_log_in:
            self.replay_memory.reset_from_event_log(opts.event_log_in,
                                                    opts.event_log_in_num)

        # s1 and s2 placeholders
        batched_state_shape = [None] + list(render_shape)
        s1 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)
        s2 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)

        # initialise base models for the value & naf networks. the value subportion of the
        # net is explicitly created separately because it has a target network. note: in the
        # case of --share-input-state-representation the input state network of the value_net
        # will be reused by the naf.l_value and naf.output_actions nets
        self.value_net = models.ValueNetwork("value", s1, opts)
        self.target_value_net = models.ValueNetwork("target_value", s2, opts)
        self.network = models.NafNetwork("naf",
                                         s1,
                                         s2,
                                         self.value_net,
                                         self.target_value_net,
                                         action_dim=2,
                                         opts=opts)

        with self.sess.as_default():
            # setup saver util and either load latest ckpt or init variables
            self.saver_util = None
            if opts.ckpt_dir is not None:
                self.saver_util = util.SaverUtil(self.sess, opts.ckpt_dir,
                                                 opts.ckpt_freq)
            else:
                self.sess.run(tf.initialize_all_variables())
            for v in tf.all_variables():
                print >> sys.stderr, v.name, util.shape_and_product_of(v)

            # setup target network
            self.target_value_net.set_as_target_network_for(
                self.value_net, 0.01)
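
models.NafNetwork isn't shown here. In NAF the Q function is decomposed as Q(s,a) = V(s) + A(s,a), where the advantage is a quadratic in the action built from a state-dependent lower-triangular matrix L, so the greedy action is simply mu(s). The following is a small numpy illustration of that decomposition only; it is not the models.NafNetwork code, and the variable names are assumptions.

import numpy as np

def naf_q_value(v, mu, L, a):
    """Q(s,a) = V(s) - 0.5 * (a - mu)^T (L L^T) (a - mu).

    v  : scalar state value V(s)
    mu : (action_dim,) greedy action mu(s)
    L  : (action_dim, action_dim) lower-triangular with positive diagonal
    a  : (action_dim,) action to evaluate
    """
    P = L.dot(L.T)                      # positive semi-definite matrix
    d = a - mu
    advantage = -0.5 * d.dot(P).dot(d)  # maximised (== 0) exactly at a == mu
    return v + advantage

# example with action_dim=2: the advantage peaks at the greedy action
L = np.array([[1.0, 0.0], [0.3, 0.5]])
print(naf_q_value(v=1.2, mu=np.zeros(2), L=L, a=np.zeros(2)))   # 1.2
print(naf_q_value(v=1.2, mu=np.zeros(2), L=L, a=np.ones(2)))    # < 1.2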