def main():
  config = tf.ConfigProto()
  # config.gpu_options.allow_growth = True
  # config.log_device_placement = True
  with tf.Session(config=config) as sess:
    agent = DeepDeterministicPolicyGradientAgent(env=env)

    # setup saver util and either load latest ckpt or init variables
    saver_util = None
    if opts.ckpt_dir is not None:
      saver_util = util.SaverUtil(sess, opts.ckpt_dir, opts.ckpt_freq)
    else:
      sess.run(tf.global_variables_initializer())
    for v in tf.global_variables():
      print(v.name, util.shape_and_product_of(v), file=sys.stderr)

    # now that we've either init'd from scratch, or loaded up a checkpoint,
    # we can do any required post init work.
    agent.post_var_init_setup()

    # run either eval or training
    if opts.num_eval > 0:
      agent.run_eval(opts.num_eval, opts.eval_action_noise)
    else:
      agent.run_training(opts.max_num_actions, opts.max_run_time,
                         opts.batch_size, opts.batches_per_step,
                         saver_util)
      if saver_util is not None:
        saver_util.force_save()

    env.reset()  # just to flush logging, clumsy :/
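# A minimal, hypothetical sketch of the "load latest ckpt or init" behaviour that
# util.SaverUtil is assumed to provide above; restore_or_init and its argument names
# are illustrative, not the repo's actual API.
import tensorflow as tf

def restore_or_init(sess, ckpt_dir):
  saver = tf.train.Saver()
  latest = tf.train.latest_checkpoint(ckpt_dir)
  if latest is not None:
    saver.restore(sess, latest)                   # resume from most recent checkpoint
  else:
    sess.run(tf.global_variables_initializer())   # no checkpoint yet; fresh start
  return saver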
def main():
  env = bullet_cartpole.BulletCartpole(gui=opts.gui, action_force=opts.action_force,
                                       max_episode_len=opts.max_episode_len,
                                       initial_force=opts.initial_force, delay=opts.delay,
                                       discrete_actions=False, event_log_file=opts.event_log)

  with tf.Session() as sess:  # config=tf.ConfigProto(log_device_placement=True)
    agent = DeepDeterministicPolicyGradientAgent(env=env, agent_opts=opts)

    # setup saver util and either load latest ckpt, or init if none...
    saver_util = None
    ckpt_dir = None
    if opts.run_id is not None:
      ckpt_dir = "ckpts/%s" % opts.run_id
    elif opts.ckpt_dir is not None:
      ckpt_dir = opts.ckpt_dir
    if ckpt_dir is not None:
      saver_util = util.SaverUtil(sess, ckpt_dir, opts.ckpt_freq)
    else:
      sess.run(tf.initialize_all_variables())

    # now that we've either init'd from scratch, or loaded up a checkpoint,
    # we can hook together target networks
    agent.hook_up_target_networks(opts.target_update_rate)

    # run either eval or training
    if opts.num_eval > 0:
      agent.run_eval(opts.num_eval)
    else:
      agent.run_training(opts.max_num_actions, opts.batch_size,
                         saver_util, opts.run_id)
      if saver_util is not None:
        saver_util.force_save()
def main():
  env = bullet_cartpole.BulletCartpole(gui=opts.gui, action_force=opts.action_force,
                                       max_episode_len=opts.max_episode_len,
                                       initial_force=opts.initial_force, delay=opts.delay,
                                       discrete_actions=True)

  with tf.Session() as sess:
    agent = LikelihoodRatioPolicyGradientAgent(env=env, gui=opts.gui,
                                               hidden_dim=opts.num_hidden,
                                               optimiser=tf.train.AdamOptimizer())

    # setup saver util; will load latest ckpt, or init if none...
    saver_util = None
    ckpt_dir = None
    if opts.run_id is not None:
      ckpt_dir = "ckpts/%s" % opts.run_id
    elif opts.ckpt_dir is not None:
      ckpt_dir = opts.ckpt_dir
    if ckpt_dir is not None:
      saver_util = util.SaverUtil(sess, ckpt_dir, opts.ckpt_freq)
    else:
      sess.run(tf.initialize_all_variables())

    # run either eval or training
    if opts.num_eval > 0:
      agent.run_eval(opts.num_eval)
    else:
      agent.run_training(opts.num_train_batches, opts.rollouts_per_batch, saver_util)
      if saver_util is not None:
        saver_util.force_save()
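# Hedged sketch of the core likelihood-ratio (REINFORCE) objective behind
# LikelihoodRatioPolicyGradientAgent: maximise E[log pi(a|s) * R] by minimising its
# negative. The state dim, hidden size, and number of discrete actions below are
# illustrative assumptions, not the repo's actual network.
import tensorflow as tf

state = tf.placeholder(tf.float32, [None, 8])     # assumed state dim
actions = tf.placeholder(tf.int32, [None])        # discrete actions taken in rollouts
returns = tf.placeholder(tf.float32, [None])      # empirical (possibly baselined) returns

hidden = tf.layers.dense(state, 32, activation=tf.nn.tanh)
logits = tf.layers.dense(hidden, 5)               # assumed number of discrete actions

# log pi(a|s) for the actions actually taken
log_probs = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits)
loss = -tf.reduce_mean(log_probs * returns)
train_op = tf.train.AdamOptimizer().minimize(loss)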
def __init__(self, opts):
  self.opts = opts

  config = tf.ConfigProto()
  # config.gpu_options.allow_growth = True
  # config.log_device_placement = True
  config.gpu_options.per_process_gpu_memory_fraction = 0.5  # opts.gpu_mem_fraction
  self.sess = tf.Session(config=config)

  render_shape = (opts.height, opts.width, 3)
  self.replay_memory = replay_memory.ReplayMemory(opts=opts,
                                                  state_shape=render_shape,
                                                  action_dim=2,
                                                  load_factor=1.2)
  if opts.event_log_in:
    self.replay_memory.reset_from_event_log(opts.event_log_in, opts.event_log_in_num)

  # s1 and s2 placeholders
  batched_state_shape = [None] + list(render_shape)
  s1 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)
  s2 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)

  # initialise base models for value & naf networks. value subportion of net is
  # explicitly created separate because it has a target network. note: in the case of
  # --share-input-state-representation the input state network of the value_net will
  # be reused by the naf.l_value and naf.output_actions net
  self.value_net = models.ValueNetwork("value", s1, opts)
  self.target_value_net = models.ValueNetwork("target_value", s2, opts)
  self.network = models.NafNetwork("naf", s1, s2,
                                   self.value_net, self.target_value_net,
                                   action_dim=2, opts=opts)

  with self.sess.as_default():
    # setup saver util and either load latest ckpt or init variables
    self.saver_util = None
    if opts.ckpt_dir is not None:
      self.saver_util = util.SaverUtil(self.sess, opts.ckpt_dir, opts.ckpt_freq)
    else:
      self.sess.run(tf.initialize_all_variables())
    for v in tf.all_variables():
      print(v.name, util.shape_and_product_of(v), file=sys.stderr)

    # setup target network
    self.target_value_net.set_as_target_network_for(self.value_net, 0.01)
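# A minimal sketch (assumed, not the repo's implementation) of the soft target-network
# update that set_as_target_network_for(source, 0.01) presumably builds:
#   target_w <- tau * source_w + (1 - tau) * target_w
import tensorflow as tf

def make_soft_update_op(source_vars, target_vars, tau=0.01):
  # pair variables by position and group the per-variable assignments into one op
  updates = [t.assign(tau * s + (1.0 - tau) * t)
             for s, t in zip(source_vars, target_vars)]
  return tf.group(*updates)

# usage sketch: build the op once, then run it after each training step, e.g.
#   soft_update = make_soft_update_op(value_vars, target_value_vars, tau=0.01)
#   sess.run(soft_update)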