def train(self):
    # To store the reward history of each episode
    ep_reward_list = []
    # To store the average reward history of the last few episodes
    avg_reward_list = []

    monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)

    q = 0  # last critic loss, so monitoring works before training starts

    with Loop_handler() as interruption:  # to save properly even if ctrl+C is pressed
        for eps in range(self.EPISODES):
            episode_reward = 0
            s = self.env.reset()
            # If an env is created with "gym.make", it will terminate after 200 steps.
            for t in range(self.MAX_TIME_STEPS):
                if self.render:
                    self.env.render()

                a = self.policy(s)
                s_, r, done, _ = self.env.step(a)
                self.replay_buffer.add(np.reshape(s, (self.s_dim,)),
                                       np.reshape(a, (self.a_dim,)),
                                       r, done,
                                       np.reshape(s_, (self.s_dim,)))
                episode_reward += r

                # Train once enough transitions have been collected:
                if self.replay_buffer.size() > self.minibatch_size:
                    q = self.train_step()

                s = s_.reshape(1, -1)

                if done or interruption():
                    break

            if interruption():
                break

            ep_reward_list.append(episode_reward)
            # Mean of the last 40 episodes:
            avg_reward = np.mean(ep_reward_list[-40:])
            print("Episode * {} * Avg Reward is ==> {}".format(eps, avg_reward))
            avg_reward_list.append(avg_reward)
            monitor.add_data(avg_reward, q)

    self.save_weights(save_name=self.save_name)  # if you want to save weights
    self.plot_results(avg_reward=avg_reward_list, train=True)
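This training loop only assumes that `self.replay_buffer` exposes `add(s, a, r, done, s2)` and `size()`, with minibatch sampling presumably happening inside `self.train_step()`. For reference, a minimal sketch of a buffer with that interface, assuming deque-based FIFO storage; the `sample_batch` method name is hypothetical, not taken from the code above:

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    """Minimal FIFO experience replay buffer (illustrative sketch)."""

    def __init__(self, max_size=100000):
        # Old transitions are dropped automatically once max_size is reached.
        self.buffer = deque(maxlen=max_size)

    def add(self, s, a, r, done, s2):
        # Store one transition, matching the argument order used in train() above.
        self.buffer.append((s, a, r, done, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # Uniformly sample a minibatch and stack each component into an array.
        batch = random.sample(self.buffer, batch_size)
        s, a, r, done, s2 = map(np.array, zip(*batch))
        return s, a, r, done, s2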
if len( sys.argv ) > 2 :
    session_dir = sys.argv[2]
    sac.load( session_dir )
    if not sac.load_replay_buffer( session_dir + '/replay_buffer.pkl' ) :
        print( 'Could not find %s: starting with an empty replay buffer.' % ( session_dir + '/replay_buffer.pkl' ) )

np.random.seed( hyper_params['seed'] )

training_env = ENV()
eval_env = ENV()

n_ep = 0
Q_loss = 0

reward_graph = Monitor( [ 1, 1 ], titles=[ 'Average reward per trial', 'Temperature' ], xlabel='trials', keep=False )

import time
start = time.time()

with Loop_handler() as interruption :

    while not interruption() and n_ep < EP_MAX :

        # Run a new trial:
        s = training_env.reset()

        for _ in range( EP_LEN ) :

            # Choose a random action and execute the next step:
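The script above only relies on `sac.load_replay_buffer` returning a boolean that is false when the pickle file is missing. A minimal sketch of an implementation consistent with that usage, written as a standalone helper; the `replay_buffer` attribute name is an assumption, not taken from the script:

import os
import pickle

def load_replay_buffer( agent, filepath ) :
    """Restore a pickled replay buffer onto `agent`; return False if the file is absent."""
    if not os.path.isfile( filepath ) :
        return False
    with open( filepath, 'rb' ) as f :
        agent.replay_buffer = pickle.load( f )  # attribute name assumed for illustration
    return True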
ppo = PPO(**hyper_params)

if len(sys.argv) == 1 or sys.argv[1] != 'eval':

    if len(sys.argv) > 1 and sys.argv[1] == 'load':
        if len(sys.argv) > 2:
            session_dir = sys.argv[2]
        ppo.load(session_dir + '/session')

    training_env = ENV()
    eval_env = ENV()

    n_ep = 0

    reward_graph = Monitor(titles='Average reward per trial', xlabel='trials', keep=False)

    import time
    start = time.time()

    with Loop_handler() as interruption:

        while not interruption() and n_ep < EP_MAX:

            # Gather new data from the current policy:
            n_samples = 0
            for ep in range(EPISODES_PER_BATCH):

                s = training_env.reset()
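Judging from the argument parsing above, the script is presumably launched with no argument to train from scratch, with `load <session_dir>` to resume a saved session, or with `eval` to skip the training branch. The batch-gathering loop that begins here typically turns each episode's rewards into discounted returns before the PPO update; a sketch of that standard computation for reference (the function name and `gamma` value are illustrative, not from this script):

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    """Compute the discounted return G_t = r_t + gamma * G_{t+1} over one episode."""
    returns = np.zeros(len(rewards))
    G = 0.0
    for t in reversed(range(len(rewards))):
        G = rewards[t] + gamma * G
        returns[t] = G
    return returns

# Example: three steps of reward 1 with gamma = 0.5 give [1.75, 1.5, 1.0].
print(discounted_returns([1.0, 1.0, 1.0], gamma=0.5))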
#prob_expl = lambda n : exp( -0.0003*n )
prob_expl = lambda n : 0.2

ntrial = 0
t = 0.
x = array( x0 )
Rt = 0.
x_data = [ x*180/pi ]
diff = 0.
restart = False

#random.seed( 0 )

reward_graph = Monitor( titles='Average reward per trial', xlabel='trials', keep=False )

with Loop_handler() as interruption :

    while not interruption() and ntrial < 20000 :

        # Action selection:
        exploration = random.rand() < prob_expl( ntrial )
        if exploration :
            u = umax*( 2*random.rand() - 1 )
        else :
            u = umax*actor.eval( scaling( x ) )
            u = clip( u, -umax, umax )

        # Simulation step:
        x_prev = array( x )
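Two exploration schedules appear above: the active constant probability of 0.2 and a commented-out exponential decay. A standalone sketch comparing the two, to make the trade-off concrete:

from math import exp

prob_const = lambda n : 0.2                # constant exploration probability (active above)
prob_decay = lambda n : exp( -0.0003*n )   # exponential decay (commented out above)

# The decaying schedule drops below the constant 0.2 after about 5365 trials,
# since exp( -0.0003*n ) = 0.2 gives n = ln(5)/0.0003.
for n in ( 0, 1000, 5000, 10000, 20000 ) :
    print( n, prob_const( n ), round( prob_decay( n ), 3 ) )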