def main(_):
    '''
    previous = tf.train.import_meta_graph(SAVE_DIR + '/model.ckpt.meta')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        previous.restore(sess, tf.train.latest_checkpoint(SAVE_DIR + '/'))
        last_vars = tf.trainable_variables()
        data = sess.run(last_vars)
        print('Model Restored')
    '''
    tf.reset_default_graph()
    with tf.Session() as sess:
        env = Preon_env(opt.env_params)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        state_dim = 9
        action_dim = 3
        goal_dim = 2

        actor = ActorNetwork(sess, state_dim, action_dim, goal_dim,
                             ACTOR_LEARNING_RATE, TAU, opt.env_params)
        critic = CriticNetwork(sess, state_dim, action_dim, goal_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), opt.env_params)

        if opt.train:
            train(sess, env, actor, critic, action_dim, goal_dim, state_dim)
        else:
            test(sess, env, actor, critic, action_dim, goal_dim, state_dim, opt.test_goal)
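# The entry point above relies on several module-level names (SAVE_DIR, RANDOM_SEED,
# ACTOR_LEARNING_RATE, CRITIC_LEARNING_RATE, TAU, and an `opt` options object) defined
# elsewhere in the original file. A minimal sketch of what those definitions could look
# like follows; the concrete values are illustrative assumptions, not the original
# configuration.
SAVE_DIR = './checkpoints'        # hypothetical checkpoint directory
RANDOM_SEED = 1234                # hypothetical seed
ACTOR_LEARNING_RATE = 1e-4        # hypothetical actor learning rate
CRITIC_LEARNING_RATE = 1e-3       # hypothetical critic learning rate
TAU = 1e-3                        # hypothetical soft target-update rate
# opt = Parameters()              # options object exposing env_params, train, test_goal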
def main(args):
    with tf.Session() as session:
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))

        # initialize ROS interface
        agent = fake.fake_agent()
        plant = fake.fake_plant()

        state_shape = agent.get_state_shape()
        action_shape = agent.get_action_shape()
        action_bound = agent.get_action_bound()

        # initialize function approximators
        actor_network = ActorNetwork(session, state_shape, action_shape, action_bound,
                                     float(args['actor_lr']), float(args['tau']),
                                     loss_mask=True)
        critic_network = CriticNetwork(session, state_shape, action_shape,
                                       float(args['critic_lr']), float(args['tau']),
                                       float(args['gamma']),
                                       actor_network.get_num_trainable_vars(),
                                       loss_mask=True)
        predictor_network = fake.fake_predictor()
        latent_network = fake.fake_latent()

        learn(session, actor_network, critic_network, predictor_network, agent, plant,
              latent_network=latent_network,
              buffer_size=int(args['buffer_size']),
              batch_size=int(args['batch_size']),
              trace_length=int(args['trace_length']),
              update_freq=int(args['update_freq']),
              pretrain_steps=int(args['pretrain_steps']),
              update_steps=int(args['update_steps']),
              max_episodes=int(args['max_episodes']),
              max_ep_steps=int(args['max_episode_len']),
              summary_dir=args['summary_dir'])
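# This `main(args)` variant consumes a plain dict of option values keyed by string. A
# minimal sketch of how such a dict could be built with argparse is shown below; the flag
# names mirror the keys read above, while the defaults and the __main__ wiring are
# assumptions for illustration, not the original script's values.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='DDPG-style training options')
    parser.add_argument('--random-seed', dest='random_seed', default=1234)
    parser.add_argument('--actor-lr', dest='actor_lr', default=1e-4)
    parser.add_argument('--critic-lr', dest='critic_lr', default=1e-3)
    parser.add_argument('--tau', default=1e-3)
    parser.add_argument('--gamma', default=0.99)
    parser.add_argument('--buffer-size', dest='buffer_size', default=100000)
    parser.add_argument('--batch-size', dest='batch_size', default=64)
    parser.add_argument('--trace-length', dest='trace_length', default=8)
    parser.add_argument('--update-freq', dest='update_freq', default=1)
    parser.add_argument('--pretrain-steps', dest='pretrain_steps', default=1000)
    parser.add_argument('--update-steps', dest='update_steps', default=1)
    parser.add_argument('--max-episodes', dest='max_episodes', default=1000)
    parser.add_argument('--max-episode-len', dest='max_episode_len', default=200)
    parser.add_argument('--summary-dir', dest='summary_dir', default='./results')
    return vars(parser.parse_args())  # vars() turns the namespace into the expected dict

# if __name__ == '__main__':
#     main(parse_args())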
def main():
    with tf.Session() as sess:
        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())
        # actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))
        # TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # Initialize target networks.
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory.
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # Main loop.
        for ep in range(MAX_EPISODES):
            episode_reward = 0
            ep_batch_avg_q = 0
            s = ENV.reset()

            for step in range(MAX_EP_STEPS):
                a = actor.predict(np.reshape(s, (1, STATE_DIM)))  # + actor_noise()
                s2, r, terminal, info = ENV.step(a[0])

                replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                                  np.reshape(a, (ACTION_DIM,)),
                                  r,
                                  terminal,
                                  np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the target values for the critic.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:  # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q is computed on a random batch, not on this episode,
                    # so an "episode_avg_max" statistic is not appropriate.
                    pred_q, _ = critic.train(
                        s_batch, a_batch, np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this be done only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / step)
                    break
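# The loop above leaves exploration noise as a TODO (the commented-out
# OrnsteinUhlenbeckActionNoise call). A minimal sketch of a standard Ornstein-Uhlenbeck
# noise process that would fit that call site is given below; the constructor signature and
# default coefficients are assumptions based on common DDPG implementations, not this
# repository's own class.
import numpy as np

class OrnsteinUhlenbeckActionNoise(object):
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu          # long-run mean of the process (usually zeros)
        self.sigma = sigma    # scale of the Wiener-term noise
        self.theta = theta    # mean-reversion rate
        self.dt = dt          # discretisation step
        self.x0 = x0
        self.reset()

    def __call__(self):
        # Euler-Maruyama step: x_{t+1} = x_t + theta*(mu - x_t)*dt + sigma*sqrt(dt)*N(0, I)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        # Restart the process from x0 (or the mean) at the start of an episode.
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

# Usage matching the commented-out line above:
# actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))
# a = actor.predict(np.reshape(s, (1, STATE_DIM))) + actor_noise()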
def main(_):
    opt = Parameters()
    np.random.seed(opt.seed)
    tf.set_random_seed(opt.seed)

    if opt.train:
        cluster = tf.train.ClusterSpec({
            "ps": opt.parameter_servers,
            "worker": opt.workers
        })
        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)

        if FLAGS.job_name == "ps":
            server.join()
        elif FLAGS.job_name == "worker":
            with tf.device(
                    tf.train.replica_device_setter(
                        worker_device="/job:worker/task:%d" % FLAGS.task_index,
                        cluster=cluster)):
                is_chief = (FLAGS.task_index == 0)

                # Count the number of updates.
                global_step = tf.get_variable(
                    'global_step', [],
                    initializer=tf.constant_initializer(0),
                    trainable=False)
                step_op = global_step.assign(global_step + 1)

                env = gym.make(opt.env_name)
                if is_chief:
                    env = wrappers.Monitor(env, './tmp/', force=True)

                if opt.env_name == 'MountainCarContinuous-v0':
                    observation_examples = np.array(
                        [env.observation_space.sample() for x in range(10000)])
                    scaler = StandardScaler()
                    scaler.fit(observation_examples)
                else:
                    scaler = None

                # Initialize replay memory.
                replay_buffer = ReplayBuffer(opt.rm_size, opt.seed)

                state_dim = env.observation_space.shape[0]
                action_dim = env.action_space.shape[0]

                if abs(env.action_space.low[0]) == abs(env.action_space.high[0]):
                    action_scale = abs(env.action_space.high[0])
                else:
                    print('Error: action space in the current environment is asymmetric!')
                    sys.exit()

                actor = ActorNetwork(state_dim, action_dim, action_scale,
                                     opt.actor_lr, opt.tau, scaler)
                critic = CriticNetwork(state_dim, action_dim,
                                       opt.critic_lr, opt.tau,
                                       actor.get_num_trainable_vars(), scaler)

                # Set up summary ops.
                train_ops, valid_ops, training_vars, valid_vars = build_summaries()

                init_op = tf.global_variables_initializer()

                # Add ops to save and restore all the variables.
                saver = tf.train.Saver(max_to_keep=5)

                if opt.continue_training:
                    def restore_model(sess):
                        actor.set_session(sess)
                        critic.set_session(sess)
                        saver.restore(sess,
                                      tf.train.latest_checkpoint(opt.save_dir + '/'))
                        actor.restore_params(tf.trainable_variables())
                        critic.restore_params(tf.trainable_variables())
                        print('***********************')
                        print('Model Restored')
                        print('***********************')
                else:
                    def restore_model(sess):
                        actor.set_session(sess)
                        critic.set_session(sess)
                        # Initialize target network weights.
                        actor.update_target_network()
                        critic.update_target_network()
                        print('***********************')
                        print('Model Initialized')
                        print('***********************')

                # sv = tf.train.Supervisor(is_chief=is_chief, global_step=global_step,
                #                          init_op=init_op, summary_op=None, saver=None,
                #                          init_fn=restore_model)
                # with sv.prepare_or_wait_for_session(server.target) as sess:
                with tf.Session(server.target) as sess:
                    sess.run(init_op)
                    restore_model(sess)

                    writer = tf.summary.FileWriter(opt.summary_dir, sess.graph)

                    stats = []
                    for step in range(opt.max_episodes):
                        # if sv.should_stop():
                        #     break
                        current_step = sess.run(global_step)

                        # Train normally.
                        reward = train(sess, current_step, opt, env, actor, critic,
                                       train_ops, training_vars, replay_buffer,
                                       writer, is_chief)
                        stats.append(reward)

                        if np.mean(stats[-100:]) > 950 and len(stats) >= 101:
                            print(np.mean(stats[-100:]))
                            print("Solved.")
                            if is_chief:
                                save_model(sess, saver, opt, global_step)
                            break

                        if is_chief and step % opt.valid_freq == opt.valid_freq - 1:
                            # test_r = test(sess, current_step, opt, env, actor, critic,
                            #               valid_ops, valid_vars, writer)
                            save_model(sess, saver, opt, global_step)

                        # Increase global_step.
                        sess.run(step_op)

                    # sv.stop()
                    print('Done')
    else:
        # For testing.
        pass
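# The distributed branch above reads FLAGS.job_name and FLAGS.task_index, which the
# original file defines elsewhere. A minimal sketch of the usual TF 1.x flag definitions
# and entry point is shown below; the flag defaults are assumptions for illustration.
tf.app.flags.DEFINE_string('job_name', 'worker', 'Either "ps" or "worker".')
tf.app.flags.DEFINE_integer('task_index', 0, 'Index of the task within its job.')
FLAGS = tf.app.flags.FLAGS

# if __name__ == '__main__':
#     tf.app.run()   # parses the flags and calls main(_)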