def main(args):
    with tf.Session() as sess:
        env = ArmEnv()
        # np.random.seed(int(args['random_seed']))
        # tf.set_random_seed(int(args['random_seed']))
        state_dim = env.state_dim
        action_dim = env.action_dim
        action_bound = max(env.action_bound)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))
        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())
        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim),
                                                   sigma=args['sigma'])

        # Build the Saver after the networks so it can see their variables
        # (tf.train.Saver() raises ValueError if no variables exist yet).
        saver = tf.train.Saver()

        train(sess, env, args, actor, critic, actor_noise)

        if args['save_model']:
            del_pre_model()
            saver.save(sess, './results/tf_model/model')
        # if args['use_gym_monitor']:
        #     env.close()
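# OrnsteinUhlenbeckActionNoise is not defined in this file. Below is a
# minimal sketch of the standard OU process used for DDPG exploration that
# matches the constructor call above; the class body and the theta/dt
# defaults are assumptions, not part of the original source.
import numpy as np

class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        # Restart the process at x0 (or at the origin if no x0 was given).
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)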
from env import ArmEnv
from rl import DDPG

MAX_EPISODES = 900
MAX_EP_STEPS = 200
ON_TRAIN = False

# set env
env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set RL method (continuous)
rl = DDPG(a_dim, s_dim, a_bound)

steps = []


def train():
    # start training
    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_r = 0.
        for j in range(MAX_EP_STEPS):
            env.render()
            a = rl.choose_action(s)
            s_, r, done = env.step(a)
            rl.store_transition(s, a, r, s_)
"n_arms": args.arms, "max_ep": 1001, "max_step": 150, "soft_replace": True, "random_target": True, "tau": 0.001, "gamma": 0.8, "lr": 0.0001, "memory_capacity": 9000 } # set env print(PARAMS) env = ArmEnv(n_arms=PARAMS["n_arms"], random_goal=PARAMS["random_target"], on_mouse=False if PARAMS["training"] else True, show_fps=args.show_fps, fps=args.fps) s_dim = env.state_dim a_dim = env.action_dim a_bound = env.action_bound # set RL method (continuous) rl = DDPG( a_dim, s_dim, a_bound, soft_replace=PARAMS["soft_replace"], tau=PARAMS["tau"], gamma=PARAMS["gamma"], lr=PARAMS["lr"],
def train(sess, env, args, actor, critic, actor_noise):

    def eval_reward(env, actor, max_episode_len, episode_i):
        # Evaluate the actor network without exploration noise.
        ep_num = 5
        ep_reward = 0
        for i in range(ep_num):
            # s = env.reset_to_value(rad_unit * i)
            s = env.reset()
            for k in range(max_episode_len):
                a = actor.predict_target(np.reshape(s, (1, actor.s_dim)))
                s2, r, terminal = env.step(a[0])
                ep_reward += r
                if terminal:
                    break
                s = s2
        ep_reward /= ep_num  # average over the evaluation episodes
        # print('Episodic Reward: %d, Elapsed time: %.4f' % (int(ep_reward), elapsed))
        print('episode: %d, Episodic Reward: %d' % (episode_i, ep_reward))
        return ep_reward

    def save_reward(lst, args):
        base_dir = './results/rewards/'
        time_stamp = time.strftime('%m%d__%H%M%S')
        base_dir += time_stamp
        os.makedirs(base_dir, exist_ok=True)
        save_file_name = os.path.join(base_dir, 'rwd.dat')
        dump_dict = {
            'rewards': lst,
            'actor_lr': args['actor_lr'],
            'critic_lr': args['critic_lr'],
            'sigma': args['sigma'],
        }
        with open(save_file_name, 'wb') as f:
            pickle.dump(dump_dict, f, 1)
        plt.plot(lst)
        plt.title(time_stamp)
        plt.xlabel('Episodes')
        plt.ylabel('Average Reward')
        plt.ylim([-200, 0])
        fig_name = base_dir + '/reward_fig.png'
        plt.savefig(fig_name)
        print('Rewards successfully written!')

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    env_eval = ArmEnv()
    reward_list = []

    for i in range(int(args['max_episodes'])):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render()

            # Add exploration noise to the deterministic policy output
            # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r, terminal,
                              np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets: y = r + gamma * Q'(s2, mu'(s2))
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                # summary_str = sess.run(summary_ops, feed_dict={
                #     summary_vars[0]: ep_reward,
                #     summary_vars[1]: ep_ave_max_q / float(j)
                # })
                # writer.add_summary(summary_str, i)
                # writer.flush()
                # print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                #     int(ep_reward), i, (ep_ave_max_q / float(j))))
                break

        eval_r = eval_reward(env_eval, actor, int(args['max_episode_len']), i)
        reward_list.append(eval_r)

    save_reward(reward_list, args)
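# ReplayBuffer is imported from elsewhere in the original project. A minimal
# sketch that satisfies the interface used above (add / size / sample_batch);
# this is an assumption about the implementation, not the original class.
import random
from collections import deque
import numpy as np

class ReplayBuffer:
    def __init__(self, buffer_size, random_seed=123):
        # deque with maxlen drops the oldest transitions once full.
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        s, a, r, t, s2 = zip(*batch)
        return (np.array(s), np.array(a), np.array(r),
                np.array(t), np.array(s2))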
# Mail : [email protected]
# Create Time : 2017-12-04 23:36:17
###############################################
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from env import ArmEnv
from rl import DDPG

MAX_EPISODES = 500
MAX_EP_STEPS = 200

env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

rl = DDPG(a_dim, s_dim, a_bound)

for i in range(MAX_EPISODES):
    s = env.reset()
    for j in range(MAX_EP_STEPS):
        env.render()
        a = rl.choose_action(s)
        s_, r, done = env.step(a)
        rl.store_transition(s, a, r, s_)
        if rl.memory_full():
            rl.learn()
display.clear_output(wait=True)
display.display(plt.gcf())
plt.figure(figsize=(20, 10))
plt.clf()
plt.title('Training...')
plt.xlabel('Episode')
plt.ylabel('Duration')
plt.plot(score)
plt.plot(mean)
plt.text(len(score) - 1, score[-1], str(score[-1]))
plt.text(len(mean) - 1, mean[-1], str(mean[-1]))


if __name__ == '__main__':
    env = ArmEnv()
    params = {
        'gamma': 0.9,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 200,
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 32,
        'state_space_dim': env.state_dim,
        'action_space_dim': env.action_dim,
    }
    agent = Agent(**params)
    score = []
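# The epsi_high / epsi_low / decay parameters above typically drive an
# exponential epsilon-greedy schedule. A sketch of the usual formula; this
# is an assumption about the Agent's internals, not original code.
import math

def epsilon(step, epsi_high=0.9, epsi_low=0.05, decay=200):
    # Decays from epsi_high toward epsi_low as the step count grows.
    return epsi_low + (epsi_high - epsi_low) * math.exp(-step / decay)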
from env import ArmEnv
from brain import DDPG

MAX_EPISODES = 500
MAX_EP_STEPS = 200
ON_TRAIN = True

# set env
env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set RL method
rl = DDPG(a_dim, s_dim, a_bound)

steps = []


# start training
def train():
    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_r = 0
        for j in range(MAX_EP_STEPS):
            env.render()
            a = rl.choose_action(s)
            s_, r, done = env.step(a)
            rl.store_transition(s, a, r, s_)
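# The ON_TRAIN flag above is presumably consumed by a dispatch like the
# following sketch; the eval() helper and rl.restore() (load saved weights,
# then run greedy rollouts) are assumptions about the rest of the file.
def eval():
    rl.restore()  # assumed: load previously saved weights
    while True:
        s = env.reset()
        for _ in range(MAX_EP_STEPS):
            env.render()
            a = rl.choose_action(s)
            s, r, done = env.step(a)
            if done:
                break

if ON_TRAIN:
    train()
else:
    eval()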
# Main
from env import ArmEnv
from rl import DDPG
import random

MAX_EPISODES = 300
MAX_EP_STEPS = 200
ON_TRAIN = True

# set env
env = ArmEnv()
env.get_train_state = ON_TRAIN
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set RL method (continuous)
rl = DDPG(a_dim, s_dim, a_bound)
rl.get_train_state = ON_TRAIN


def train():
    # start training: pre-compute a 6x6 grid of candidate goals
    sample_goal = [None] * 36
    for incx in range(6):
        for incy in range(6):
            sample_goal[incy * 6 + incx] = {
                'x': 100. + incx * 40,
                'y': 100. + incy * 40,
                'l': 40
            }
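    # A sketch of how the goal grid might be consumed from here on:
    # assigning env.goal before each reset is an assumption about ArmEnv
    # (MorvanZhou-style arm envs keep the target as a dict with 'x', 'y',
    # 'l' keys), not original code.
    for i in range(MAX_EPISODES):
        env.goal = random.choice(sample_goal)  # assumed attribute on ArmEnv
        s = env.reset()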
# env.reset()
# env.render()
# env.step()
#
# =========================================================================

from env import ArmEnv
from rl import DDPG

# Global variables
MAX_EPISODES = 500
MAX_EP_STEPS = 500

# Set the environment
env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set the RL method
rl = DDPG(a_dim, s_dim, a_bound)

# start training
for i in range(MAX_EPISODES):
    s = env.reset()
    for j in range(MAX_EP_STEPS):
        env.render()
        a = rl.choose_action(s)