def train(sess, env, args, actor, critic, actor_noise, reward_result, lambda_mix):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Get dynamics and initialize prior controller
    [A, B] = get_linear_dynamics()
    prior = BasePrior(A, B)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    paths = list()

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0.
        ep_ave_max_q = 0

        obs, action, rewards = [], [], []

        # Get baseline reward using the (optimal) prior controller
        s0 = np.copy(s)
        ep_reward_opt = 0.
        for kk in range(int(args['max_episode_len'])):
            a_prior = prior.getControl_h(s0)
            a_prior = np.squeeze(np.asarray(a_prior))
            a = a_prior
            s0, r, stop_c, _ = env.step(a)
            ep_reward_opt += r
            if stop_c:
                break

        # Get baseline reward using the LQR controller, restarted from the same state
        env.reset()
        sp = env.unwrapped.reset(s)
        reward_lqr = 0.
        while True:
            a_lqr = prior.getControl(sp)
            a_lqr = np.squeeze(np.asarray(a_lqr))
            sp, reward_p, done_p, _ = env.step(a_lqr)
            reward_lqr += reward_p
            if done_p:
                break

        # Get reward using the regularized RL (regRL) algorithm
        env.reset()
        s = env.unwrapped.reset(s)

        for j in range(int(args['max_episode_len'])):

            # Control prior regularization weight lambda_mix is passed in as an argument
            # lambda_mix = 5.

            # Prior control
            a_prior = prior.getControl_h(s)
            a_prior = np.squeeze(np.asarray(a_prior))

            # RL control with exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()
            #a = actor.predict(np.reshape(s, (1, actor.s_dim))) + (1. / (1. + i))

            # Mix the actions (RL controller + control prior)
            act = a[0] / (1 + lambda_mix) + (lambda_mix / (1 + lambda_mix)) * a_prior

            # Take action and observe next state/reward
            s2, r, terminal, info = env.step(act)

            # Add info from this time step to the replay buffer
            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)),
                              r, terminal,
                              np.reshape(s2, (actor.s_dim,)),
                              np.reshape((lambda_mix / (1 + lambda_mix)) * a_prior, (actor.a_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):

                # Sample a batch from the replay buffer
                s_batch, a_batch_0, r_batch, t_batch, s2_batch, a_prior_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                a_batch = a_batch_0

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

                # Calculate TD-Error for each state
                base_q = critic.predict_target(s_batch, actor.predict_target(s_batch))
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

            s = s2
            ep_reward += r

            obs.append(s)
            rewards.append(r)
            action.append(a[0])

            # Collect results at end of episode
            if terminal:
                for ii in range(len(obs)):
                    obs[ii] = obs[ii].reshape((4, 1))
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward - ep_reward_opt), i, (ep_ave_max_q / float(j))))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                reward_result[2, i] = reward_lqr
                path = {"Observation": np.concatenate(obs).reshape((-1, 4)),
                        "Action": np.concatenate(action),
                        "Reward": np.asarray(rewards)}
                paths.append(path)
                break

    return [summary_ops, summary_vars, paths]
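
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original training loop): the action
# mixing performed in train() is a convex combination of the RL action and
# the control prior, weighted by lambda_mix. lambda_mix = 0 gives pure RL
# control; large lambda_mix defers almost entirely to the prior. The helper
# name below is hypothetical and only restates the formula used above.
def mix_actions(a_rl, a_prior, lambda_mix):
    """Return a_rl / (1 + lambda_mix) + (lambda_mix / (1 + lambda_mix)) * a_prior."""
    return a_rl / (1. + lambda_mix) + (lambda_mix / (1. + lambda_mix)) * a_prior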
sp = np.copy(s0)
reward_prior = 0.
while True:
    a_prior = prior.getControl_h(sp)
    a_prior = np.squeeze(np.asarray(a_prior))
    sp, reward_p, done_p, _ = env.step(a_prior)
    reward_prior += reward_p
    if done_p:
        break

env.reset()
sp = env.unwrapped.reset(s0)
reward_lqr = 0.
while True:
    a_lqr = prior.getControl(sp)
    a_lqr = np.squeeze(np.asarray(a_lqr))
    sp, reward_p, done_p, _ = env.step(a_lqr)
    reward_lqr += reward_p
    if done_p:
        break

env.reset()
s = env.unwrapped.reset(s0)
#s = env.reset()

ep_r, ep_t, ep_a = 0, 0, []

while True:
    a, v = ppo.evaluate_state(s)
    a = np.squeeze(a)
    s = np.squeeze(s)[np.newaxis, :]
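
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the two baseline loops
# above follow the same pattern -- roll a fixed controller to episode
# termination and accumulate reward. The helper below is hypothetical; it
# assumes numpy is imported as np at the top of this module and that the
# unwrapped environment's reset() accepts a start state, as used above.
def rollout_controller(env, controller, s0):
    """Run `controller` from start state `s0` until done; return total reward."""
    env.reset()
    s = env.unwrapped.reset(s0)
    total_reward = 0.
    while True:
        a = np.squeeze(np.asarray(controller(s)))
        s, r, done, _ = env.step(a)
        total_reward += r
        if done:
            break
    return total_reward

# Example (hypothetical usage):
#     reward_lqr = rollout_controller(env, prior.getControl, s0)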