Example #1
class train_params:
    
    # Environment parameters
    ENV = 'Pendulum-v0'                     # Environment to use (must have low dimensional state space (i.e. not image) and continuous action space)
    RENDER = False                          # Whether or not to display the environment on the screen during training
    RANDOM_SEED = 99999999                  # Random seed for reproducibility
    NUM_AGENTS = 4                          # Number of distributed agents to run simultaneously
    
    # Create dummy environment to get all environment params
    if ENV == 'Pendulum-v0':
        dummy_env = PendulumWrapper()
    elif ENV == 'LunarLanderContinuous-v2':
        dummy_env = LunarLanderContinuousWrapper()
    elif ENV == 'BipedalWalker-v2':
        dummy_env = BipedalWalkerWrapper()
    elif ENV == 'BipedalWalkerHardcore-v2':
        dummy_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')
     
    STATE_DIMS = dummy_env.get_state_dims()
    STATE_BOUND_LOW, STATE_BOUND_HIGH = dummy_env.get_state_bounds()
    ACTION_DIMS = dummy_env.get_action_dims()
    ACTION_BOUND_LOW, ACTION_BOUND_HIGH = dummy_env.get_action_bounds()
    V_MIN = dummy_env.v_min
    V_MAX = dummy_env.v_max
    del dummy_env
    
    # Training parameters
    BATCH_SIZE = 256
    NUM_STEPS_TRAIN = 1000000       # Number of steps to train for
    MAX_EP_LENGTH = 10000           # Maximum number of steps per episode
    REPLAY_MEM_SIZE = 1000000       # Soft maximum capacity of replay memory
    REPLAY_MEM_REMOVE_STEP = 200    # Check replay memory every REPLAY_MEM_REMOVE_STEP training steps and remove samples over REPLAY_MEM_SIZE capacity
    PRIORITY_ALPHA = 0.6            # Controls the randomness vs prioritisation of the prioritised sampling (0.0 = Uniform sampling, 1.0 = Greedy prioritisation)
    PRIORITY_BETA_START = 0.4       # Starting value of beta - controls to what degree IS weights influence the gradient updates to correct for the bias introduced by priority sampling (0 - no correction, 1 - full correction)
    PRIORITY_BETA_END = 1.0         # Beta will be linearly annealed from its start value to this value throughout training
    PRIORITY_EPSILON = 0.00001      # Small value to be added to updated priorities to ensure no sample has a probability of 0 of being chosen
    NOISE_SCALE = 0.3               # Scaling to apply to Gaussian noise
    NOISE_DECAY = 0.9999            # Decay noise throughout training by scaling by noise_decay**training_step
    DISCOUNT_RATE = 0.99            # Discount rate (gamma) for future rewards
    N_STEP_RETURNS = 5              # Number of future steps to collect experiences for N-step returns
    UPDATE_AGENT_EP = 10            # Agent gets latest parameters from learner every update_agent_ep episodes
    
    # Network parameters
    CRITIC_LEARNING_RATE = 0.0001
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_L2_LAMBDA = 0.0          # Coefficient for L2 weight regularisation in critic - if 0, no regularisation is performed
    DENSE1_SIZE = 400               # Size of first hidden layer in networks
    DENSE2_SIZE = 300               # Size of second hidden layer in networks
    FINAL_LAYER_INIT = 0.003        # Initialise networks' final layer weights in range +/-final_layer_init
    NUM_ATOMS = 51                  # Number of atoms in output layer of distributional critic
    TAU = 0.001                     # Parameter for soft target network updates
    USE_BATCH_NORM = False          # Whether or not to use batch normalisation in the networks
  
    # Files/Directories
    SAVE_CKPT_STEP = 10000                  # Save checkpoint every save_ckpt_step training steps
    CKPT_DIR = './ckpts/' + ENV             # Directory for saving/loading checkpoints
    CKPT_FILE = None                        # Checkpoint file to load and resume training from (if None, train from scratch)
    LOG_DIR = './logs/train/' + ENV         # Directory for saving Tensorboard logs (if None, do not save logs)
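
The PRIORITY_BETA_START / PRIORITY_BETA_END comments describe a linear anneal of the importance-sampling exponent over NUM_STEPS_TRAIN. A minimal sketch of such a schedule, assuming the helper name priority_beta and the clamp at the end value (neither is taken from the repository):

def priority_beta(step, beta_start=0.4, beta_end=1.0, total_steps=1000000):
    """Linearly anneal the importance-sampling exponent from beta_start
    (PRIORITY_BETA_START) to beta_end (PRIORITY_BETA_END) over total_steps
    (NUM_STEPS_TRAIN) training steps, then hold it at beta_end."""
    frac = min(float(step) / total_steps, 1.0)
    return beta_start + frac * (beta_end - beta_start)

# e.g. priority_beta(0) == 0.4, priority_beta(500000) == 0.7, priority_beta(1000000) == 1.0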
Example #2
    def __init__(self, sess, env, seed, n_agent=0):
        print("Initialising agent %02d... \n" % n_agent)

        self.sess = sess
        self.n_agent = n_agent

        # Create environment
        if env == 'Pendulum-v0':
            self.env_wrapper = PendulumWrapper(env)
        elif env == 'LunarLanderContinuous-v2':
            self.env_wrapper = LunarLanderContinuousWrapper(env)
        elif env == 'Ant-v2':
            self.env_wrapper = AntWrapper(env)
        elif env == 'BipedalWalker-v2':
            self.env_wrapper = BipedalWalkerWrapper(env)
        elif env == 'HalfCheetah-v2':
            self.env_wrapper = CheetahWrapper(env)
        elif env == 'Reacher-v2':
            self.env_wrapper = ReacherWrapper(env)
        elif env == 'Hopper-v2':
            self.env_wrapper = HopperWrapper(env)
        elif env == 'Swimmer-v2':
            self.env_wrapper = SwimmerWrapper(env)
        elif env == 'Walker2d-v2':
            self.env_wrapper = Walker2dWrapper(env)
        elif env == 'InvertedPendulum-v2':
            self.env_wrapper = InvertedPendulumWrapper(env)
        elif env == 'Humanoid-v2':
            self.env_wrapper = HumanoidWrapper(env)  # assumption: a HumanoidWrapper is defined in utils.env_wrapper.py
        else:
            raise Exception(
                'Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py'
            )
        self.env_wrapper.set_random_seed(seed * (n_agent + 1))
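
The if/elif chain above can also be written as a table lookup. The sketch below is an illustrative rewrite of that dispatch, not code from the repository, and only names wrapper classes that already appear in the snippet:

# Illustrative alternative to the if/elif dispatch above (same behaviour, not repository code)
ENV_WRAPPERS = {
    'Pendulum-v0': PendulumWrapper,
    'LunarLanderContinuous-v2': LunarLanderContinuousWrapper,
    'Ant-v2': AntWrapper,
    'BipedalWalker-v2': BipedalWalkerWrapper,
    'HalfCheetah-v2': CheetahWrapper,
    # ... remaining wrappers from the snippet above follow the same pattern
}

def make_env_wrapper(env):
    if env not in ENV_WRAPPERS:
        raise Exception('No environment wrapper defined for %s in utils.env_wrapper.py' % env)
    return ENV_WRAPPERS[env](env)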
Example #3
    def __init__(self, sess, env, seed, n_agent=0):
        print("Initialising agent %02d... \n" % n_agent)

        self.sess = sess
        self.n_agent = n_agent

        # Create environment
        if env == 'Pendulum-v0':
            self.env_wrapper = PendulumWrapper(env)
        elif env == 'LunarLanderContinuous-v2':
            self.env_wrapper = LunarLanderContinuousWrapper(env)
        elif env == 'BipedalWalker-v2':
            self.env_wrapper = BipedalWalkerWrapper(env)
        else:
            raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')
        self.env_wrapper.set_random_seed(seed * (n_agent + 1))
Example #4
    def __init__(self, PER_memory, run_agent_event, stop_agent_event):
        self.PER_memory = PER_memory
        self.run_agent_event = run_agent_event
        self.stop_agent_event = stop_agent_event

        if train_params.ENV == 'Pendulum-v0':
            self.eval_env = PendulumWrapper()
        elif train_params.ENV == 'LunarLanderContinuous-v2':
            self.eval_env = LunarLanderContinuousWrapper()
        elif train_params.ENV == 'BipedalWalker-v2':
            self.eval_env = BipedalWalkerWrapper()
        elif train_params.ENV == 'BipedalWalkerHardcore-v2':
            self.eval_env = BipedalWalkerWrapper(hardcore=True)
        else:
            raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')

        self.summary_writer = tf.summary.create_file_writer(train_params.LOG_DIR + '/eval/')
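
For context, the writer created with tf.summary.create_file_writer above would typically be used as in the sketch below; episode_reward, train_step, and the tag name are illustrative placeholders, not taken from the repository:

import tensorflow as tf

def log_eval_reward(summary_writer, episode_reward, train_step):
    # Write an evaluation scalar with the TF2 summary API
    # (tag name 'eval/episode_reward' is an illustrative choice)
    with summary_writer.as_default():
        tf.summary.scalar('eval/episode_reward', episode_reward, step=train_step)
    summary_writer.flush()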
Example #5
class play_params:
    ALGO = 'D4PG_2'
    ENV = 'BipedalWalker-v2'
    CKPT = '99000'

    # Create dummy environment to get all environment params
    if ENV == 'Pendulum-v0':
        dummy_env = PendulumWrapper()
    elif ENV == 'LunarLanderContinuous-v2':
        dummy_env = LunarLanderContinuousWrapper()
    elif ENV == 'BipedalWalker-v2':
        dummy_env = BipedalWalkerWrapper()
    elif ENV == 'BipedalWalkerHardcore-v2':
        dummy_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception(
            'Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py'
        )

    STATE_DIMS = dummy_env.get_state_dims()
    STATE_BOUND_LOW, STATE_BOUND_HIGH = dummy_env.get_state_bounds()
    ACTION_DIMS = dummy_env.get_action_dims()
    ACTION_BOUND_LOW, ACTION_BOUND_HIGH = dummy_env.get_action_bounds()
    V_MIN = dummy_env.v_min
    V_MAX = dummy_env.v_max
    del dummy_env

    import os
    ACTOR_MODEL_DIR = os.getcwd() + '/data/' + ENV + '/' + ALGO + '/eval/actor_' + CKPT
    CRITIC_MODEL_DIR = os.getcwd() + '/data/' + ENV + '/' + ALGO + '/eval/critic_' + CKPT
    RECORD_DIR = os.getcwd() + '/data/' + ENV + '/' + ALGO + '/eval/video_' + CKPT

    # Play parameters
    NUM_EPS_PLAY = 1  # Number of episodes to play for
    MAX_EP_LENGTH = 10000  # Maximum number of steps per episode
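
The concatenated paths above resolve to data/<ENV>/<ALGO>/eval/... under the working directory; an equivalent construction with os.path.join (illustrative only, same resulting layout) would be:

# Equivalent path construction with os.path.join (illustrative, same resulting layout)
import os

EVAL_DIR = os.path.join(os.getcwd(), 'data', ENV, ALGO, 'eval')
ACTOR_MODEL_DIR = os.path.join(EVAL_DIR, 'actor_' + CKPT)
CRITIC_MODEL_DIR = os.path.join(EVAL_DIR, 'critic_' + CKPT)
RECORD_DIR = os.path.join(EVAL_DIR, 'video_' + CKPT)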
Example #6
class Agent:
  
    def __init__(self, sess, env, seed, n_agent=0):
        print("Initialising agent %02d... \n" % n_agent)
         
        self.sess = sess        
        self.n_agent = n_agent
       
        # Create environment    
        if env == 'Pendulum-v0':
            self.env_wrapper = PendulumWrapper()
        elif env == 'LunarLanderContinuous-v2':
            self.env_wrapper = LunarLanderContinuousWrapper()
        elif env == 'BipedalWalker-v2':
            self.env_wrapper = BipedalWalkerWrapper()
        elif env == 'BipedalWalkerHardcore-v2':
            self.env_wrapper = BipedalWalkerWrapper(hardcore=True)
        else:
            raise Exception('Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py')
        self.env_wrapper.set_random_seed(seed*(n_agent+1))
              
    def build_network(self, training):
        # Input placeholder    
        self.state_ph = tf.placeholder(tf.float32, ((None,) + train_params.STATE_DIMS)) 
        
        if training:
            # each agent has their own var_scope
            var_scope = ('actor_agent_%02d'%self.n_agent)
        else:
            # when testing, var_scope comes from main learner policy (actor) network
            var_scope = ('learner_actor_main')
          
        # Create policy (actor) network
        if train_params.USE_BATCH_NORM:
            self.actor_net = Actor_BN(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, is_training=False, scope=var_scope)
            self.agent_policy_params = self.actor_net.network_params + self.actor_net.bn_params
        else:
            self.actor_net = Actor(self.state_ph, train_params.STATE_DIMS, train_params.ACTION_DIMS, train_params.ACTION_BOUND_LOW, train_params.ACTION_BOUND_HIGH, train_params.DENSE1_SIZE, train_params.DENSE2_SIZE, train_params.FINAL_LAYER_INIT, scope=var_scope)
            self.agent_policy_params = self.actor_net.network_params
                        
    def build_update_op(self, learner_policy_params):
        # Update agent's policy network params from learner
        update_op = []
        from_vars = learner_policy_params
        to_vars = self.agent_policy_params
                
        for from_var,to_var in zip(from_vars,to_vars):
            update_op.append(to_var.assign(from_var))
        
        self.update_op = update_op
                        
    def build_summaries(self, logdir):
        # Create summary writer to write summaries to disk
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        self.summary_writer = tf.summary.FileWriter(logdir, self.sess.graph)
        
        # Create summary op to save episode reward to Tensorboard log
        self.ep_reward_var = tf.Variable(0.0, trainable=False, name=('ep_reward_agent_%02d'%self.n_agent))
        tf.summary.scalar("Episode Reward", self.ep_reward_var)
        self.summary_op = tf.summary.merge_all()
        
        # Initialise reward var - this will not be initialised with the other network variables as these are copied over from the learner
        self.init_reward_var = tf.variables_initializer([self.ep_reward_var])
            
    def run(self, PER_memory, gaussian_noise, run_agent_event, stop_agent_event):
        # Continuously run agent in environment to collect experiences and add to replay memory
                
        # Initialise deque buffer to store experiences for N-step returns
        self.exp_buffer = deque()
        
        # Perform initial copy of params from learner to agent
        self.sess.run(self.update_op)
        
        # Initialise var for logging episode reward
        if train_params.LOG_DIR is not None:
            self.sess.run(self.init_reward_var)
        
        # Initially set threading event to allow agent to run until told otherwise
        run_agent_event.set()
        
        num_eps = 0
        
        while not stop_agent_event.is_set():
            num_eps += 1
            # Reset environment and experience buffer
            state = self.env_wrapper.reset()
            state = self.env_wrapper.normalise_state(state)
            self.exp_buffer.clear()
            
            num_steps = 0
            episode_reward = 0
            ep_done = False
            
            while not ep_done:
                num_steps += 1
                ## Take action and store experience
                if train_params.RENDER:
                    self.env_wrapper.render()
                action = self.sess.run(self.actor_net.output, {self.state_ph:np.expand_dims(state, 0)})[0]     # Add batch dimension to single state input, and remove batch dimension from single action output
                action += (gaussian_noise() * train_params.NOISE_DECAY**num_eps)
                next_state, reward, terminal = self.env_wrapper.step(action)
                
                episode_reward += reward 
                               
                next_state = self.env_wrapper.normalise_state(next_state)
                reward = self.env_wrapper.normalise_reward(reward)
                
                self.exp_buffer.append((state, action, reward))
                
                # We need at least N steps in the experience buffer before we can compute Bellman rewards and add an N-step experience to replay memory
                if len(self.exp_buffer) >= train_params.N_STEP_RETURNS:
                    state_0, action_0, reward_0 = self.exp_buffer.popleft()
                    discounted_reward = reward_0
                    gamma = train_params.DISCOUNT_RATE
                    for (_, _, r_i) in self.exp_buffer:
                        discounted_reward += r_i * gamma
                        gamma *= train_params.DISCOUNT_RATE
                    
                    # If learner is requesting a pause (to remove samples from PER), wait before adding more samples
                    run_agent_event.wait()   
                    PER_memory.add(state_0, action_0, discounted_reward, next_state, terminal, gamma)
                
                state = next_state
                
                if terminal or num_steps == train_params.MAX_EP_LENGTH:
                    # Log total episode reward
                    if train_params.LOG_DIR is not None:
                        summary_str = self.sess.run(self.summary_op, {self.ep_reward_var: episode_reward})
                        self.summary_writer.add_summary(summary_str, num_eps)
                    # Compute Bellman rewards and add experiences to replay memory for the last N-1 experiences still remaining in the experience buffer
                    while len(self.exp_buffer) != 0:
                        state_0, action_0, reward_0 = self.exp_buffer.popleft()
                        discounted_reward = reward_0
                        gamma = train_params.DISCOUNT_RATE
                        for (_, _, r_i) in self.exp_buffer:
                            discounted_reward += r_i * gamma
                            gamma *= train_params.DISCOUNT_RATE
                        
                        # If learner is requesting a pause (to remove samples from PER), wait before adding more samples
                        run_agent_event.wait()     
                        PER_memory.add(state_0, action_0, discounted_reward, next_state, terminal, gamma)
                    
                    # Start next episode
                    ep_done = True
                
            # Update agent networks with learner params every 'update_agent_ep' episodes
            if num_eps % train_params.UPDATE_AGENT_EP == 0:
                self.sess.run(self.update_op)
        
        self.env_wrapper.close()
    
    def test(self):   
        # Test a saved ckpt of actor network and save results to file (optional)
        
        def load_ckpt(ckpt_dir, ckpt_file):
            # Load ckpt given by ckpt_file, or else load latest ckpt in ckpt_dir
            loader = tf.train.Saver()    
            if ckpt_file is not None:
                ckpt = ckpt_dir + '/' + ckpt_file  
            else:
                ckpt = tf.train.latest_checkpoint(ckpt_dir)
             
            loader.restore(self.sess, ckpt)
            sys.stdout.write('%s restored.\n\n' % ckpt)
            sys.stdout.flush() 
             
            ckpt_split = ckpt.split('-')
            self.train_ep = ckpt_split[-1]
        
        # Load ckpt from ckpt_dir
        load_ckpt(test_params.CKPT_DIR, test_params.CKPT_FILE)
        
        # Create Tensorboard summaries to save episode rewards
        if test_params.LOG_DIR is not None:
            self.build_summaries(test_params.LOG_DIR)
            
        rewards = [] 

        for test_ep in range(1, test_params.NUM_EPS_TEST+1):
            state = self.env_wrapper.reset()
            state = self.env_wrapper.normalise_state(state)
            ep_reward = 0
            step = 0
            ep_done = False
            
            while not ep_done:
                if test_params.RENDER:
                    self.env_wrapper.render()
                action = self.sess.run(self.actor_net.output, {self.state_ph:np.expand_dims(state, 0)})[0]     # Add batch dimension to single state input, and remove batch dimension from single action output
                state, reward, terminal = self.env_wrapper.step(action)
                state = self.env_wrapper.normalise_state(state)
                
                ep_reward += reward
                step += 1
                 
                # Episode can finish either by reaching terminal state or max episode steps
                if terminal or step == test_params.MAX_EP_LENGTH:
                    sys.stdout.write('\x1b[2K\rTest episode {:d}/{:d}'.format(test_ep, test_params.NUM_EPS_TEST))
                    sys.stdout.flush()   
                    rewards.append(ep_reward)
                    ep_done = True   
                
        mean_reward = np.mean(rewards)
        error_reward = ss.sem(rewards)
                
        sys.stdout.write('\x1b[2K\rTesting complete \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.format(mean_reward, error_reward))
        sys.stdout.flush()  
        
        # Log average episode reward for Tensorboard visualisation
        if test_params.LOG_DIR is not None:
            summary_str = self.sess.run(self.summary_op, {self.ep_reward_var: mean_reward})
            self.summary_writer.add_summary(summary_str, self.train_ep)
         
        # Write results to file        
        if test_params.RESULTS_DIR is not None:
            if not os.path.exists(test_params.RESULTS_DIR):
                os.makedirs(test_params.RESULTS_DIR)
            output_file = open(test_params.RESULTS_DIR + '/' + test_params.ENV + '.txt' , 'a')
            output_file.write('Training Episode {}: \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.format(self.train_ep, mean_reward, error_reward))
            output_file.flush()
            sys.stdout.write('Results saved to file \n\n')
            sys.stdout.flush()      
        
        self.env_wrapper.close()       
        
    def play(self):   
        # Play a saved ckpt of actor network in the environment, visualise performance on screen and save a GIF (optional)
        
        def load_ckpt(ckpt_dir, ckpt_file):
            # Load ckpt given by ckpt_file, or else load latest ckpt in ckpt_dir
            loader = tf.train.Saver()    
            if ckpt_file is not None:
                ckpt = ckpt_dir + '/' + ckpt_file  
            else:
                ckpt = tf.train.latest_checkpoint(ckpt_dir)
        
            loader.restore(self.sess, ckpt)
            sys.stdout.write('%s restored.\n\n' % ckpt)
            sys.stdout.flush() 
             
            ckpt_split = ckpt.split('-')
            self.train_ep = ckpt_split[-1]
        
        # Load ckpt from ckpt_dir
        load_ckpt(play_params.CKPT_DIR, play_params.CKPT_FILE)
        
        # Create record directory
        if not os.path.exists(play_params.RECORD_DIR):
            os.makedirs(play_params.RECORD_DIR)

        for ep in range(1, play_params.NUM_EPS_PLAY+1):
            state = self.env_wrapper.reset()
            state = self.env_wrapper.normalise_state(state)
            step = 0
            ep_done = False
            
            while not ep_done:
                frame = self.env_wrapper.render()
                if play_params.RECORD_DIR is not None:
                    filepath = play_params.RECORD_DIR + '/Ep%03d_Step%04d.jpg' % (ep, step)
                    cv2.imwrite(filepath, frame)
                action = self.sess.run(self.actor_net.output, {self.state_ph:np.expand_dims(state, 0)})[0]     # Add batch dimension to single state input, and remove batch dimension from single action output
                state, _, terminal = self.env_wrapper.step(action)
                state = self.env_wrapper.normalise_state(state)
                
                step += 1
                 
                # Episode can finish either by reaching terminal state or max episode steps
                if terminal or step == play_params.MAX_EP_LENGTH:
                    ep_done = True   
                    
        # Convert saved frames to gif
        if play_params.RECORD_DIR is not None:
            images = []
            for file in sorted(os.listdir(play_params.RECORD_DIR)):
                # Load image
                filename = play_params.RECORD_DIR + '/' + file
                im = cv2.imread(filename)
                images.append(im)
                # Delete static image once loaded
                os.remove(filename)
                 
            # Save as gif
            imageio.mimsave(play_params.RECORD_DIR + '/%s.gif' % play_params.ENV, images, duration=0.01)  
                    
        self.env_wrapper.close()                   
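
The inner loops in Agent.run() above accumulate an N-step discounted return before an experience is pushed to PER_memory, together with the discount gamma^N used to bootstrap from the state N steps ahead. The same computation in isolation (the function name is illustrative):

def n_step_return(rewards, gamma=0.99):
    """Compute R = r_0 + gamma*r_1 + ... + gamma^(N-1)*r_(N-1) and gamma^N,
    exactly as the loops in Agent.run() do over the experience buffer."""
    discounted_reward = 0.0
    discount = 1.0
    for r in rewards:
        discounted_reward += discount * r
        discount *= gamma
    return discounted_reward, discount

# e.g. n_step_return([1.0] * 5, gamma=0.99) gives approximately (4.901, 0.951):
# 1 + 0.99 + 0.9801 + 0.970299 + 0.96059601 = 4.90099501, and 0.99**5 = 0.95099...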
Example #7
def play():

    if play_params.ENV == 'Pendulum-v0':
        play_env = PendulumWrapper()
    elif play_params.ENV == 'LunarLanderContinuous-v2':
        play_env = LunarLanderContinuousWrapper()
    elif play_params.ENV == 'BipedalWalker-v2':
        play_env = BipedalWalkerWrapper()
    elif play_params.ENV == 'BipedalWalkerHardcore-v2':
        play_env = BipedalWalkerWrapper(hardcore=True)
    else:
        raise Exception(
            'Chosen environment does not have an environment wrapper defined. Please choose an environment with an environment wrapper defined, or create a wrapper for this environment in utils.env_wrapper.py'
        )

    actor_net = Actor(play_params.STATE_DIMS,
                      play_params.ACTION_DIMS,
                      play_params.ACTION_BOUND_LOW,
                      play_params.ACTION_BOUND_HIGH,
                      train_params.DENSE1_SIZE,
                      train_params.DENSE2_SIZE,
                      train_params.FINAL_LAYER_INIT,
                      name='actor_play')
    critic_net = Critic(play_params.STATE_DIMS,
                        play_params.ACTION_DIMS,
                        train_params.DENSE1_SIZE,
                        train_params.DENSE2_SIZE,
                        train_params.FINAL_LAYER_INIT,
                        train_params.NUM_ATOMS,
                        train_params.V_MIN,
                        train_params.V_MAX,
                        name='critic_play')

    actor_net.load_weights(play_params.ACTOR_MODEL_DIR)
    critic_net.load_weights(play_params.CRITIC_MODEL_DIR)

    if not os.path.exists(play_params.RECORD_DIR):
        os.makedirs(play_params.RECORD_DIR)

    for ep in tqdm(range(1, play_params.NUM_EPS_PLAY + 1), desc='playing'):
        state = play_env.reset()
        state = play_env.normalise_state(state)
        step = 0
        ep_done = False

        while not ep_done:
            frame = play_env.render()
            if play_params.RECORD_DIR is not None:
                filepath = play_params.RECORD_DIR + '/Ep%03d_Step%04d.jpg' % (
                    ep, step)
                cv2.imwrite(filepath, frame)
            action = actor_net(np.expand_dims(state.astype(np.float32), 0))[0]
            state, _, terminal = play_env.step(action)
            state = play_env.normalise_state(state)

            step += 1

            # Episode can finish either by reaching terminal state or max episode steps
            if terminal or step == play_params.MAX_EP_LENGTH:
                ep_done = True

    # Convert saved frames to gif
    if play_params.RECORD_DIR is not None:
        images = []
        for file in tqdm(sorted(os.listdir(play_params.RECORD_DIR)),
                         desc='converting to gif'):
            # Load image
            filename = play_params.RECORD_DIR + '/' + file
            im = cv2.imread(filename)
            images.append(im)
            # Delete static image once loaded
            os.remove(filename)

        # Save as gif
        print("Saving to ", play_params.RECORD_DIR)
        imageio.mimsave(play_params.RECORD_DIR + '/%s.gif' % play_params.ENV,
                        images[:-1],
                        duration=0.01)

    play_env.close()
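
The gif step above round-trips every frame through a JPEG on disk. A minimal in-memory alternative, assuming play_env.render() returns H x W x 3 uint8 arrays and using an illustrative helper name:

import imageio
import numpy as np

def save_frames_as_gif(frames, out_path, duration=0.01):
    # frames: list of H x W x 3 uint8 arrays collected from play_env.render()
    # (assumption: render() returns image arrays rather than only drawing to screen)
    imageio.mimsave(out_path, [np.asarray(f) for f in frames], duration=duration)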