Example #1
def grid_search():
    """
    Script for trying all specified parameter combinations
    """
    # PARAMETERS
    number_of_repeats = 1  # Number of iterations per combination of parameters

    parameters = {
        'env_name': ['NaoBalancing'],
        'n_workers': [4],
        'max_episodes': [6500],
        'episode_length': [2000],
        'batch_size': [2000],
        'epochs': [8],
        'epsilon': [.2],
        'gamma': [.99],
        'actor_layers': [[256, 256]],
        'critic_layers': [[256, 256]],
        'actor_lr': [.00001],
        'critic_lr': [.00002]
    }

    values = tuple(parameters.values())
    param_iterator = list(itertools.product(*values))
    data = []
    counter = 0
    for params in param_iterator:
        counter += 1
        args = dict(zip(parameters.keys(), params))
        for i in range(number_of_repeats):
            print "\nIteration {} of parameter set {}/{}\nParameters:".format(
                i + 1, counter, len(param_iterator))
            print args
            nao_rl.destroy_instances()
            time.sleep(.5)
            model = PPO(**args)
            model.train()

            date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
            filename = s.MAIN_DIR + '/data/' + parameters['env_name'][
                0] + '_' + date + '.log'
            log = args.copy()
            log['iteration'] = i
            log['exp_number'] = '{}/{}'.format(counter, len(param_iterator))
            log['global_reward'] = model.running_reward
            log['episode_reward'] = model.episode_reward
            log['date'] = date
            log['model_path'] = ''
            data.append(model.running_reward)

            model.close_session()
            del model

            with open(filename, 'w') as logfile:
                logfile.write(json.dumps(log))

    return data
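
For reference, the parameter expansion that grid_search loops over is just a Cartesian product; a minimal standalone sketch with a hypothetical two-entry grid (not the values used above):

import itertools

# Hypothetical, smaller grid purely for illustration
parameters = {'actor_lr': [1e-5, 1e-4], 'batch_size': [1000, 2000]}

param_iterator = list(itertools.product(*parameters.values()))
for params in param_iterator:
    args = dict(zip(parameters.keys(), params))
    print(args)  # e.g. {'actor_lr': 1e-05, 'batch_size': 1000}
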
Example #2
    def kill(self):
        try:
            nao_rl.destroy_instances()
        except:
            pass
        try:
            self.manager.trainer.env.render(close=True)
        except:
            pass
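
The bare except clauses above intentionally swallow any error during shutdown; on Python 3 the same intent can be stated more explicitly with contextlib.suppress. A minimal sketch, assuming the same manager/trainer objects as above:

from contextlib import suppress

def kill(self):
    # Ignore failures from instances or renderers that are already gone
    with suppress(Exception):
        nao_rl.destroy_instances()
    with suppress(Exception):
        self.manager.trainer.env.render(close=True)
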
Example #3
    def run(self):
        """
        Run the test simulation without any learning algorithm, for debugging purposes
        """

        t = 0
        while t < 30:
            self.done = False
            self.reset()
            fps = 30.
            while not self.done:
                # raw_input("Press Enter to continue...")
                action = self.action_space.sample()
                print(action)
                state, reward, self.done, _ = self.step(action)
                print('Current state:\n angles: {}'.format(state))
                print('Reward: {}'.format(reward))
                time.sleep(1 / fps)

            t += 1


if __name__ == "__main__":
    """
    If called as a script this will initialize the scene in an open vrep instance
    """

    # Environment and objects
    import nao_rl
    env = nao_rl.make('NaoBalancing', headless=False)
    env.run()
    nao_rl.destroy_instances()
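
The debug loop above is plain Gym API (reset, sample, step), so the same pattern works against any Gym environment; a minimal sketch using a stock environment (Pendulum-v0 is an arbitrary choice):

import time
import gym

env = gym.make('Pendulum-v0')
for episode in range(3):
    state, done = env.reset(), False
    while not done:
        action = env.action_space.sample()       # random action, no learning
        state, reward, done, _ = env.step(action)
        time.sleep(1 / 30.)                      # throttle to roughly 30 FPS
env.close()
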
Example #4
    def __init__(self,
                 env_name,
                 render,
                 plot,
                 n_workers=1,
                 max_episodes=10000,
                 episode_length=500,
                 update_every=10,
                 entropy_beta=.005,
                 gamma=.99,
                 actor_layers=[500, 300],
                 critic_layers=[500, 300],
                 actor_lr=.00005,
                 critic_lr=.0001):

        # Training parameters
        self.gamma = gamma
        self.beta = entropy_beta
        self.max_episodes = max_episodes
        self.episode_length = episode_length
        self.update_every = update_every
        self.n_workers = n_workers
        self.actor_layers = actor_layers
        self.critic_layers = critic_layers
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Synchronization
        self.algorithm = 'a3c'
        self.env_name = env_name
        self.stop = False
        self.total_steps = 0
        self.update_counter = 0
        self.current_episode = 0
        self.running_reward = []
        self.episode_reward = []
        self.time = None
        self.verbose = True
        self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

        # Rendering
        if render == 0:
            self.render = [True for _ in range(self.n_workers)]
        if render == 1:
            self.render = [True for _ in range(self.n_workers)]
            self.render[0] = False
        if render == 2:
            self.render = [False for _ in range(self.n_workers)]

        # Plotting
        self.plot = plot
        if self.plot:
            plt.ion()
            plt.figure(1)
            plt.plot()
            plt.xlabel('Episode')
            plt.ylabel('Running reward')
            plt.title('{} episode reward'.format(self.env_name))

        # Session and coordinator
        self.sess = tf.Session()
        self.tf_coordinator = tf.train.Coordinator()
        self.optimizer_actor = tf.train.RMSPropOptimizer(self.actor_lr,
                                                         name='RMSPropA')
        self.optimizer_critic = tf.train.RMSPropOptimizer(self.critic_lr,
                                                          name='RMSPropC')
        self.workers = []

        # Environment parameters
        print "Creating dummy environment to obtain the parameters..."
        try:
            env = nao_rl.make(env_name, headless=True)
        except:
            env = gym.make(env_name)
        self.n_states = env.observation_space.shape[0]
        self.n_actions = env.action_space.shape[0]
        self.action_bounds = [env.action_space.low, env.action_space.high]
        nao_rl.destroy_instances()
        del env

        self.initialize()
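
The dummy-environment probe at the end of __init__ only reads standard Gym Box attributes; the same idea as a standalone helper (the environment name is arbitrary):

import gym

def probe_env(env_name):
    """Create a throwaway environment just to read its dimensions."""
    env = gym.make(env_name)
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    action_bounds = [env.action_space.low, env.action_space.high]
    env.close()
    return n_states, n_actions, action_bounds

print(probe_env('Pendulum-v0'))  # e.g. (3, 1, [array([-2.], ...), array([2.], ...)])
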
Example #5
File: ppo.py  Project: wwchung91/nao_rl
    def __init__(self,
                 env_name='',
                 n_workers=4,
                 max_episodes=5000,
                 episode_length=500,
                 batch_size=128,
                 epochs=10,
                 epsilon=.2,
                 gamma=.99,
                 actor_layers=[500, 500],
                 critic_layers=[500],
                 actor_lr=.00001,
                 critic_lr=.00002):

        # Training parameters
        self.gamma = gamma
        self.max_episodes = max_episodes
        self.episode_length = episode_length
        self.batch_size = batch_size
        self.epochs = epochs
        self.n_workers = n_workers
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Synchronization
        self.env_name = env_name
        self.total_steps = 0
        self.update_counter = 0
        self.current_episode = 0
        self.running_reward = []
        self.time = None
        self.verbose = False

        # Threading and events
        self.update_event, self.rolling_event = threading.Event(
        ), threading.Event()
        self.tf_coordinator = tf.train.Coordinator()
        self.queue = queue.Queue()
        self.sess = tf.Session()

        # Environment parameters
        print "Creating dummy environment to obtain the parameters..."
        env = nao_rl.make(self.env_name, 19998)
        self.action_space = env.action_space.shape[0]
        self.state_space = env.observation_space.shape[0]
        self.action_bounds = [
            env.action_space.low[0], env.action_space.high[0]
        ]
        nao_rl.destroy_instances()
        del env

        ##############
        ### Network ##
        ##############

        # Input placeholders
        self.state_input = tf.placeholder(tf.float32, [None, self.state_space],
                                          'state_input')
        self.action_input = tf.placeholder(tf.float32,
                                           [None, self.action_space],
                                           'action_input')
        self.advantage_input = tf.placeholder(tf.float32, [None, 1],
                                              'advantage')
        self.discounted_reward = tf.placeholder(tf.float32, [None, 1],
                                                'discounted_reward')

        ########
        # Critic
        hidden_layer = tf.layers.dense(self.state_input, critic_layers[0],
                                       tf.nn.relu)
        for layer_size in critic_layers[1:]:
            hidden_layer = tf.layers.dense(hidden_layer, layer_size,
                                           tf.nn.relu)
        self.critic_output = tf.layers.dense(hidden_layer, 1)

        self.advantage = self.discounted_reward - self.critic_output
        self.critic_loss = tf.reduce_mean(tf.square(self.advantage))
        self.critic_optimizer = tf.train.AdamOptimizer(critic_lr).minimize(
            self.critic_loss)

        #######
        # Actor
        policy, pi_params = self.build_actor('pi', True, actor_layers)
        old_policy, oldpi_params = self.build_actor('oldpi', False,
                                                    actor_layers)
        self.choose_action = tf.squeeze(policy.sample(1), axis=0)
        self.update_policy = [
            old.assign(p) for p, old in zip(pi_params, oldpi_params)
        ]
        ratio = policy.prob(
            self.action_input) / (old_policy.prob(self.action_input) + 1e-5)
        surrogate_loss = ratio * self.advantage_input

        # Clipped objective
        self.actor_loss = -tf.reduce_mean(
            tf.minimum(
                surrogate_loss,
                tf.clip_by_value(ratio, 1. - epsilon, 1. + epsilon) *
                self.advantage_input))
        self.actor_optimizer = tf.train.AdamOptimizer(self.actor_lr).minimize(
            self.actor_loss)
        self.sess.run(tf.global_variables_initializer())
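
build_actor is referenced but not included in this excerpt; a plausible reconstruction, assuming a Gaussian policy head as in most TF1 PPO implementations (layer choices and variable-collection details are guesses, not the project's actual code):

def build_actor(self, name, trainable, layers):
    """Hypothetical sketch: dense layers feeding a Normal distribution."""
    with tf.variable_scope(name):
        hidden = self.state_input
        for size in layers:
            hidden = tf.layers.dense(hidden, size, tf.nn.relu, trainable=trainable)
        mu = tf.layers.dense(hidden, self.action_space, tf.nn.tanh, trainable=trainable)
        sigma = tf.layers.dense(hidden, self.action_space, tf.nn.softplus, trainable=trainable)
        policy = tf.distributions.Normal(loc=mu, scale=sigma + 1e-4)
    params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
    return policy, params
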
Example #6
File: ppo.py  Project: wwchung91/nao_rl
    def __init__(self,
                 env_name,
                 render,
                 plot,
                 n_workers=8,
                 max_episodes=5000,
                 episode_length=500,
                 batch_size=1000,
                 epochs=10,
                 epsilon=.2,
                 gamma=.99,
                 actor_layers=[250, 250],
                 critic_layers=[250],
                 actor_lr=.00001,
                 critic_lr=.00002):

        # Training parameters
        self.gamma = gamma
        self.max_episodes = max_episodes
        self.episode_length = episode_length
        self.batch_size = batch_size
        self.epochs = epochs
        self.n_workers = n_workers
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Synchronization
        self.algorithm = 'ppo'
        self.env_name = env_name
        self.stop = False
        self.total_steps = 0
        self.update_counter = 0
        self.current_episode = 0
        self.running_reward = []
        self.episode_reward = []
        self.time = time.time()
        self.verbose = True
        self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

        # Threading and events
        self.sess = tf.Session()
        self.tf_coordinator = tf.train.Coordinator()
        self.queue = queue.Queue()
        self.update_event = threading.Event()
        self.rollout = threading.Event()
        self.workers = []

        # Rendering
        if render == 0:
            self.render = [True for _ in range(self.n_workers)]
        if render == 1:
            self.render = [True for _ in range(self.n_workers)]
            self.render[0] = False
        if render == 2:
            self.render = [False for _ in range(self.n_workers)]

        # Plotting
        self.plot = plot
        if self.plot:
            plt.ion()
            plt.figure(1)
            plt.plot()
            plt.xlabel('Episode')
            plt.ylabel('Running reward')
            plt.title('{} episode reward'.format(self.env_name))

        # Environment parameters
        print "Creating dummy environment to obtain the parameters..."
        try:
            env = nao_rl.make(self.env_name, headless=True)
        except:
            env = gym.make(self.env_name)
        self.n_actions = env.action_space.shape[0]
        self.n_states = env.observation_space.shape[0]
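        # Note: the bounds below assume a symmetric action space; the upper bound is taken as -low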
        self.action_bounds = [
            env.action_space.low[0], -env.action_space.low[0]
        ]
        #env.disconnect()
        nao_rl.destroy_instances()
        del env

        ##############
        ### Network ##
        ##############

        # Input placeholders
        self.state_input = tf.placeholder(tf.float32, [None, self.n_states],
                                          'state_input')
        self.action_input = tf.placeholder(tf.float32, [None, self.n_actions],
                                           'action_input')
        self.advantage_input = tf.placeholder(tf.float32, [None, 1],
                                              'advantage')
        self.discounted_reward = tf.placeholder(tf.float32, [None, 1],
                                                'discounted_reward')

        ########
        # Critic
        hidden_layer = tf.layers.dense(self.state_input, critic_layers[0],
                                       tf.nn.relu)
        for layer_size in critic_layers[1::]:
            hidden_layer = tf.layers.dense(hidden_layer, layer_size,
                                           tf.nn.relu)
        self.critic_output = tf.layers.dense(hidden_layer, 1)

        self.advantage = self.discounted_reward - self.critic_output
        self.critic_loss = tf.reduce_mean(tf.square(self.advantage))
        self.critic_optimizer = tf.train.AdamOptimizer(critic_lr).minimize(
            self.critic_loss)

        #######
        # Actor
        policy, pi_params = self.build_actor('policy', True, actor_layers)
        old_policy, oldpi_params = self.build_actor('old_policy', False,
                                                    actor_layers)
        self.choose_action = tf.squeeze(policy.sample(1),
                                        axis=0,
                                        name='choose_action')
        self.update_policy = [
            old.assign(p) for p, old in zip(pi_params, oldpi_params)
        ]
        ratio = policy.prob(
            self.action_input) / (old_policy.prob(self.action_input) + 1e-5)
        surrogate_loss = ratio * self.advantage_input

        # Clipped objective
        self.actor_loss = -tf.reduce_mean(
            tf.minimum(
                surrogate_loss,
                tf.clip_by_value(ratio, 1. - epsilon, 1. + epsilon) *
                self.advantage_input))
        self.actor_optimizer = tf.train.AdamOptimizer(self.actor_lr).minimize(
            self.actor_loss)
        self.sess.run(tf.global_variables_initializer())
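
The excerpt stops after building the graph; a rough sketch of how these nodes would typically be driven for one update, using only the placeholders and ops defined above (the actual worker/queue logic in ppo.py is more involved, and the method name here is hypothetical):

def update_networks(self, states, actions, rewards):
    """Hypothetical sketch of a single PPO update on a collected batch."""
    self.sess.run(self.update_policy)            # sync old_policy <- policy
    advantage = self.sess.run(
        self.advantage,
        {self.state_input: states, self.discounted_reward: rewards})
    for _ in range(self.epochs):
        self.sess.run(self.actor_optimizer,
                      {self.state_input: states,
                       self.action_input: actions,
                       self.advantage_input: advantage})
        self.sess.run(self.critic_optimizer,
                      {self.state_input: states,
                       self.discounted_reward: rewards})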